Refine ingest dashboard health semantics
All checks were successful
deploy / deploy (push) Successful in 36s
All checks were successful
deploy / deploy (push) Successful in 36s
Proof: Connected ingest without recent quotes now renders as a warning-level no-recent-quotes state instead of a generic critical service failure, while disconnected and publish-stalled cases remain critical. Assumptions: Operators need transport failure and quote-quiet conditions separated on the service card; stale quote truth should still block trust without pretending the websocket is broken. Still fake: External alert delivery remains unconfigured; live alerts are routed only through the generic webhook path, and only when that webhook is configured.
This commit is contained in:
parent
903287ec21
commit
69be378784
4 changed files with 214 additions and 29 deletions
|
|
@ -816,6 +816,7 @@ function summarizeServiceSnapshot(snapshot, { authoritativeHealth = null, active
|
|||
reachable: snapshot.reachable,
|
||||
health_ok: derived.health_ok,
|
||||
health_status: derived.status,
|
||||
health_label: derived.label || derived.status,
|
||||
health_reasons: derived.reasons || [],
|
||||
highest_alert_severity: derived.highest_alert_severity || null,
|
||||
paused: derived.paused ?? state.paused ?? health.paused ?? null,
|
||||
|
|
|
|||
|
|
@ -69,11 +69,25 @@ export function deriveServiceHealth({
|
|||
const freshnessAgeMs = ageMs(freshnessAt, now);
|
||||
const reasons = [];
|
||||
let status = paused ? 'paused' : reachable ? 'healthy' : 'offline';
|
||||
let label = status;
|
||||
|
||||
if (!reachable) {
|
||||
reasons.push('service unreachable');
|
||||
label = 'offline';
|
||||
}
|
||||
|
||||
if (service === 'near-intents-ingest') {
|
||||
const ingestClassification = classifyNearIntentsIngestHealth({
|
||||
state,
|
||||
health,
|
||||
activeAlerts,
|
||||
reachable,
|
||||
now,
|
||||
});
|
||||
status = escalateHealth(status, ingestClassification.status);
|
||||
label = ingestClassification.label;
|
||||
reasons.push(...ingestClassification.reasons);
|
||||
} else {
|
||||
if (health.ok === false && reachable) {
|
||||
status = escalateHealth(status, 'critical');
|
||||
reasons.push(health.reason || 'service health check failed');
|
||||
|
|
@ -86,33 +100,40 @@ export function deriveServiceHealth({
|
|||
status = escalateHealth(status, 'warning');
|
||||
reasons.push(`warning alert active (${activeAlerts[0]?.alert_code || 'runtime'})`);
|
||||
}
|
||||
}
|
||||
|
||||
if (service === 'near-intents-ingest') {
|
||||
if (state.ingest?.connected === false) {
|
||||
status = escalateHealth(status, 'critical');
|
||||
reasons.push('websocket disconnected');
|
||||
if (service !== 'near-intents-ingest' && status === 'healthy') {
|
||||
label = 'healthy';
|
||||
}
|
||||
if (state.ingest?.last_matching_quote_at && state.ingest?.last_published_at) {
|
||||
const matchingAgeMs = ageMs(state.ingest.last_matching_quote_at, now);
|
||||
const publishedAgeMs = ageMs(state.ingest.last_published_at, now);
|
||||
if (matchingAgeMs != null && publishedAgeMs != null && publishedAgeMs > matchingAgeMs + 5_000) {
|
||||
status = escalateHealth(status, 'critical');
|
||||
reasons.push('quote publish path stalled');
|
||||
if (service !== 'near-intents-ingest' && status === 'paused') {
|
||||
label = 'paused';
|
||||
}
|
||||
if (service !== 'near-intents-ingest' && status === 'critical' && label === 'critical') {
|
||||
label = 'critical';
|
||||
}
|
||||
if (service !== 'near-intents-ingest' && status === 'warning' && label === 'warning') {
|
||||
label = 'warning';
|
||||
}
|
||||
if (service !== 'near-intents-ingest' && status === 'offline') {
|
||||
label = 'offline';
|
||||
}
|
||||
|
||||
if (service === 'trade-executor' && state.relay?.connected === false) {
|
||||
status = escalateHealth(status, 'critical');
|
||||
label = 'relay disconnected';
|
||||
if (!reasons.includes('solver relay disconnected')) {
|
||||
reasons.push('solver relay disconnected');
|
||||
}
|
||||
}
|
||||
|
||||
if (service === 'history-writer') {
|
||||
if (state.database_connectivity === false) {
|
||||
status = escalateHealth(status, 'critical');
|
||||
label = 'database disconnected';
|
||||
reasons.push('database connectivity failed');
|
||||
} else if (freshnessAgeMs != null && freshnessAgeMs > 45_000) {
|
||||
status = escalateHealth(status, 'warning');
|
||||
label = 'writer lagging';
|
||||
reasons.push('writer freshness degraded');
|
||||
}
|
||||
}
|
||||
|
|
@ -120,6 +141,7 @@ export function deriveServiceHealth({
|
|||
if (service === 'operator-dashboard') {
|
||||
if ((state.source_error_count || 0) > 0 || (health.source_error_count || 0) > 0) {
|
||||
status = escalateHealth(status, 'warning');
|
||||
label = 'sources degraded';
|
||||
reasons.push('dashboard source degraded');
|
||||
}
|
||||
}
|
||||
|
|
@ -130,23 +152,96 @@ export function deriveServiceHealth({
|
|||
&& hasCriticalTruthAlert(activeAlerts, activePair)
|
||||
) {
|
||||
status = escalateHealth(status, 'critical');
|
||||
if (label === 'healthy' || label === 'warning') {
|
||||
label = 'armed on stale truth';
|
||||
}
|
||||
reasons.push('armed while critical upstream truth is stale');
|
||||
}
|
||||
|
||||
return {
|
||||
service,
|
||||
status,
|
||||
label,
|
||||
reachable,
|
||||
paused,
|
||||
armed: state.armed ?? null,
|
||||
health_ok: status === 'healthy' || status === 'paused',
|
||||
highest_alert_severity: highestAlertSeverity,
|
||||
reasons,
|
||||
reasons: dedupeReasons(reasons),
|
||||
freshness_at: freshnessAt,
|
||||
freshness_age_ms: freshnessAgeMs,
|
||||
};
|
||||
}
|
||||
|
||||
/**
 * Classify near-intents-ingest health into a status, a card label, and reasons.
 *
 * Checks run in priority order and the first match wins:
 *   1. service unreachable                                   -> offline
 *   2. websocket disconnected (flag or disconnect alert)     -> critical
 *   3. publish path stalled (alert or publish lagging quotes) -> critical
 *   4. connected but no recent quotes for the active pair    -> warning
 *   5. otherwise healthy, or warning when `health.ok === false`
 *
 * Reasons only mention an alert when that alert code is actually present in
 * `activeAlerts`; a condition detected purely from state never claims an alert.
 *
 * @param {object} params
 * @param {object} [params.state] - Service state; reads `ingest.connected`,
 *   `ingest.last_matching_quote_at` and `ingest.last_published_at`.
 * @param {object} [params.health] - Health payload; reads `connected`, `ok`, `reason`.
 * @param {Array<object>} [params.activeAlerts] - Active alerts; only `alert_code` is read.
 * @param {boolean} params.reachable - Whether the service responded at all.
 * @param {*} params.now - Reference time, forwarded untouched to `ageMs`.
 * @returns {{status: string, label: string, reasons: string[]}}
 */
function classifyNearIntentsIngestHealth({
  state = {},
  health = {},
  activeAlerts,
  reachable,
  now,
}) {
  // Publishing may trail the last matching quote by this much before it counts as stalled.
  const PUBLISH_LAG_TOLERANCE_MS = 5_000;

  // `alert?.` tolerates null/undefined entries in the alert list.
  const alertCodes = new Set((activeAlerts || []).map((alert) => alert?.alert_code));
  const connected = state.ingest?.connected ?? health.connected ?? null;
  // `ageMs` is a helper defined elsewhere in this module; the null checks below
  // assume it returns null/undefined when the timestamp is unusable.
  const matchingAgeMs = ageMs(state.ingest?.last_matching_quote_at, now);
  const publishedAgeMs = ageMs(state.ingest?.last_published_at, now);

  if (!reachable) {
    return {
      status: 'offline',
      label: 'offline',
      reasons: ['service unreachable'],
    };
  }

  const hasDisconnectAlert = alertCodes.has('near_intents_ingest_disconnected');
  if (connected === false || hasDisconnectAlert) {
    const reasons = ['websocket disconnected'];
    if (hasDisconnectAlert) {
      reasons.push('critical alert active (near_intents_ingest_disconnected)');
    }
    return {
      status: 'critical',
      label: 'disconnected',
      reasons,
    };
  }

  const hasPublishStaleAlert = alertCodes.has('near_intents_publish_stale');
  const publishLagsQuotes = Boolean(
    state.ingest?.last_matching_quote_at
      && state.ingest?.last_published_at
      && matchingAgeMs != null
      && publishedAgeMs != null
      && publishedAgeMs > matchingAgeMs + PUBLISH_LAG_TOLERANCE_MS,
  );
  if (hasPublishStaleAlert || publishLagsQuotes) {
    const reasons = ['quote publish path stalled'];
    if (hasPublishStaleAlert) {
      reasons.push('critical alert active (near_intents_publish_stale)');
    }
    return {
      status: 'critical',
      label: 'publish stalled',
      reasons,
    };
  }

  if (alertCodes.has('near_intents_quotes_stale') || health.reason === 'quote truth stale') {
    // Transport is fine; the feed is simply quiet, so this is a warning only.
    const reasons = ['connected, no recent quotes for active pair'];
    if (matchingAgeMs != null) {
      reasons.push(`last matching quote ${matchingAgeMs}ms ago`);
    }
    return {
      status: 'warning',
      label: 'no recent quotes',
      reasons,
    };
  }

  const degraded = health.ok === false;
  // Never pair a warning status with a 'healthy' label.
  const connectedLabel = degraded ? 'degraded' : 'healthy';
  return {
    status: degraded ? 'warning' : 'healthy',
    label: connected === true ? connectedLabel : 'unknown',
    reasons: degraded ? [health.reason || 'service health degraded'] : [],
  };
}
|
||||
|
||||
/**
 * Drop falsy entries from a reasons list and remove duplicates, keeping the
 * first occurrence of each remaining reason in its original order.
 *
 * @param {Array<*>} [reasons] - Reasons to clean up; a falsy value is treated as an empty list.
 * @returns {Array<*>} A new array of unique, truthy reasons.
 */
function dedupeReasons(reasons) {
  const candidates = reasons || [];
  const truthyReasons = candidates.filter((reason) => Boolean(reason));
  // A Set preserves insertion order, so first occurrences keep their position.
  return Array.from(new Set(truthyReasons));
}
|
||||
|
||||
export function inferServiceFreshnessTimestamp(service, state = {}, health = {}) {
|
||||
switch (service) {
|
||||
case 'near-intents-ingest':
|
||||
|
|
|
|||
|
|
@ -2,7 +2,7 @@ import Pill from './Pill.jsx';
|
|||
import { formatAge, formatBoolean } from '../lib/format.js';
|
||||
|
||||
export default function ServiceCard({ service }) {
|
||||
const healthLabel = service.health_status || (service.health_ok ? 'healthy' : service.reachable ? 'degraded' : 'offline');
|
||||
const healthLabel = service.health_label || service.health_status || (service.health_ok ? 'healthy' : service.reachable ? 'degraded' : 'offline');
|
||||
|
||||
return (
|
||||
<div className="service-card">
|
||||
|
|
|
|||
|
|
@ -446,13 +446,14 @@ test('system service health uses sentinel-derived severity so stale ingest is ne
|
|||
recent_transitions: [],
|
||||
service_health: [{
|
||||
service: 'near-intents-ingest',
|
||||
status: 'critical',
|
||||
status: 'warning',
|
||||
label: 'no recent quotes',
|
||||
reachable: true,
|
||||
paused: false,
|
||||
armed: null,
|
||||
health_ok: false,
|
||||
highest_alert_severity: 'critical',
|
||||
reasons: ['critical alert active (near_intents_publish_stale)'],
|
||||
reasons: ['connected, no recent quotes for active pair'],
|
||||
freshness_at: '2026-04-03T02:12:00.000Z',
|
||||
freshness_age_ms: 110_880_000,
|
||||
}],
|
||||
|
|
@ -463,11 +464,99 @@ test('system service health uses sentinel-derived severity so stale ingest is ne
|
|||
|
||||
const ingest = bootstrap.system.service_health.find((service) => service.service === 'near-intents-ingest');
|
||||
assert.equal(ingest.health_ok, false);
|
||||
assert.equal(ingest.health_status, 'critical');
|
||||
assert.match(ingest.health_reasons.join(' '), /critical alert active/);
|
||||
assert.equal(ingest.health_status, 'warning');
|
||||
assert.equal(ingest.health_label, 'no recent quotes');
|
||||
assert.match(ingest.health_reasons.join(' '), /connected, no recent quotes/);
|
||||
assert.equal(bootstrap.status_bar.highest_alert_severity, 'critical');
|
||||
});
|
||||
|
||||
// Regression: an ingest snapshot reporting `connected: false`, together with a
// raised `near_intents_ingest_disconnected` alert from ops-sentinel, must
// surface on the dashboard as a critical "disconnected" service.
test('ingest disconnected still renders as a critical transport failure', () => {
  const config = buildConfig();

  const operatorAuth = {
    authenticated: true,
    subject: 'local-operator',
    mode: 'stub',
    roles: ['operator'],
  };

  const emptyTradePage = {
    page: 1,
    page_size: 20,
    total: 0,
    total_pages: 1,
    items: [],
  };

  // Ingest itself is reachable but reports its websocket as down.
  const ingestSnapshot = {
    service: 'near-intents-ingest',
    label: 'Intents Ingest',
    base_url: 'http://near-intents-ingest',
    reachable: true,
    health: { ok: false, connected: false, reason: 'websocket disconnected' },
    state: {
      ingest: {
        connected: false,
        last_message_at: '2026-04-04T09:00:00.000Z',
      },
    },
  };

  const disconnectedAlert = {
    alert_code: 'near_intents_ingest_disconnected',
    status: 'raised',
    severity: 'critical',
    reason: 'websocket disconnected',
    service_scope: 'near-intents-ingest',
    pair: config.activePair,
    raised_at: '2026-04-04T09:30:00.000Z',
    first_raised_at: '2026-04-04T09:30:00.000Z',
    cleared_at: null,
    last_evaluated_at: '2026-04-04T09:30:00.000Z',
    details: {},
  };

  // Health entry for ingest as published by ops-sentinel.
  const sentinelIngestHealth = {
    service: 'near-intents-ingest',
    status: 'critical',
    label: 'disconnected',
    reachable: true,
    paused: false,
    armed: null,
    health_ok: false,
    highest_alert_severity: 'critical',
    reasons: ['websocket disconnected', 'critical alert active (near_intents_ingest_disconnected)'],
    freshness_at: '2026-04-04T09:00:00.000Z',
    freshness_age_ms: 60_000,
  };

  const sentinelSnapshot = {
    service: 'ops-sentinel',
    label: 'Ops Sentinel',
    base_url: 'http://ops-sentinel',
    reachable: true,
    health: { ok: true },
    state: {
      active_alerts: [disconnectedAlert],
      recent_transitions: [],
      service_health: [sentinelIngestHealth],
    },
  };

  const bootstrap = buildDashboardBootstrap({
    config,
    auth: operatorAuth,
    portfolioMetric: null,
    inventorySnapshot: null,
    marketPrice: null,
    recentQuotes: [],
    successfulTrades: emptyTradePage,
    successfulTradeSummary: {
      total: 0,
      last_successful_trade_at: null,
    },
    fundingObservations: [],
    recentTradeDecisions: [],
    recentAlertTransitions: [],
    serviceSnapshots: [ingestSnapshot, sentinelSnapshot],
  });

  const ingest = bootstrap.system.service_health.find(
    ({ service }) => service === 'near-intents-ingest',
  );
  assert.equal(ingest.health_status, 'critical');
  assert.equal(ingest.health_label, 'disconnected');
  assert.match(ingest.health_reasons.join(' '), /websocket disconnected/);
});
|
||||
|
||||
test('funding summary includes credited bridge deposits without observer-backed funding observations', () => {
|
||||
const config = buildConfig();
|
||||
const bootstrap = buildDashboardBootstrap({
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue