// Patch summary (commentary only; the patch payload below is unchanged).
//
// Files touched:
//   1. src/core/operator-dashboard.mjs
//        summarizeServiceSnapshot now also emits `health_label`, taken from
//        `derived.label` and falling back to `derived.status`.
//   2. src/core/runtime-health.mjs
//        - deriveServiceHealth keeps a human-readable `label` next to `status`
//          and returns it; `reasons` is passed through dedupeReasons on return.
//        - The near-intents-ingest checks move out of deriveServiceHealth into
//          the new classifyNearIntentsIngestHealth, which returns
//          { status, label, reasons } from the first matching case:
//          unreachable -> disconnected -> publish stalled -> no recent quotes
//          -> fallback based on health.ok / connected.
//        - dedupeReasons drops falsy entries and removes duplicates via a Set.
//   3. src/operator-dashboard/static/components/ServiceCard.jsx
//        healthLabel prefers `service.health_label` before `health_status`.
//
// NOTE(review): for services other than near-intents-ingest, `label` starts as
//   the initial status ('paused' | 'healthy' | 'offline'). The generic
//   `health.ok === false` / alert-severity branch escalates `status` but never
//   assigns `label`, and the follow-up guards
//   `status === 'critical' && label === 'critical'` (and the 'warning' twin)
//   only re-assign the value `label` already holds. Presumably escalateHealth
//   returns the more severe status (its body is not visible here); if so, a
//   service can report status 'critical' with label 'healthy', and ServiceCard
//   shows the label first -- TODO confirm and set label from status there.
// NOTE(review): for near-intents-ingest the generic `health.ok === false` and
//   `highestAlertSeverity` escalation now sits in the `else` branch, so only
//   the three alert codes checked inside classifyNearIntentsIngestHealth affect
//   the ingest status; other active alerts look like they no longer escalate
//   it -- confirm this is intended.
// NOTE(review): the 'disconnected' and 'publish stalled' results always carry a
//   'critical alert active (...)' reason, even when the branch was entered via
//   `connected === false` or the publish/matching age comparison rather than
//   via the alert code being present.
// NOTE(review): `label = ingestClassification.label` and the service-specific
//   labels ('relay disconnected', 'database disconnected', 'writer lagging',
//   'sources degraded') are assigned unconditionally, so they can replace
//   'offline' / 'paused' or a label describing a more severe state.
// NOTE(review): an unreachable ingest service gets 'service unreachable' pushed
//   by both deriveServiceHealth and the classifier; the single entry in the
//   output relies on dedupeReasons.
diff --git a/src/core/operator-dashboard.mjs b/src/core/operator-dashboard.mjs index 04ceb7e..ad3721d 100644 --- a/src/core/operator-dashboard.mjs +++ b/src/core/operator-dashboard.mjs @@ -816,6 +816,7 @@ function summarizeServiceSnapshot(snapshot, { authoritativeHealth = null, active reachable: snapshot.reachable, health_ok: derived.health_ok, health_status: derived.status, + health_label: derived.label || derived.status, health_reasons: derived.reasons || [], highest_alert_severity: derived.highest_alert_severity || null, paused: derived.paused ?? state.paused ?? health.paused ?? null, diff --git a/src/core/runtime-health.mjs b/src/core/runtime-health.mjs index 4fdc475..dcb584e 100644 --- a/src/core/runtime-health.mjs +++ b/src/core/runtime-health.mjs @@ -69,50 +69,71 @@ export function deriveServiceHealth({ const freshnessAgeMs = ageMs(freshnessAt, now); const reasons = []; let status = paused ? 'paused' : reachable ? 'healthy' : 'offline'; + let label = status; if (!reachable) { reasons.push('service unreachable'); - } - - if (health.ok === false && reachable) { - status = escalateHealth(status, 'critical'); - reasons.push(health.reason || 'service health check failed'); - } - - if (highestAlertSeverity === 'critical') { - status = escalateHealth(status, 'critical'); - reasons.push(`critical alert active (${activeAlerts[0]?.alert_code || 'runtime'})`); - } else if (highestAlertSeverity === 'warning') { - status = escalateHealth(status, 'warning'); - reasons.push(`warning alert active (${activeAlerts[0]?.alert_code || 'runtime'})`); + label = 'offline'; } if (service === 'near-intents-ingest') { - if (state.ingest?.connected === false) { + const ingestClassification = classifyNearIntentsIngestHealth({ + state, + health, + activeAlerts, + reachable, + now, + }); + status = escalateHealth(status, ingestClassification.status); + label = ingestClassification.label; + reasons.push(...ingestClassification.reasons); + } else { + if (health.ok === false && reachable) { 
status = escalateHealth(status, 'critical'); - reasons.push('websocket disconnected'); + reasons.push(health.reason || 'service health check failed'); } - if (state.ingest?.last_matching_quote_at && state.ingest?.last_published_at) { - const matchingAgeMs = ageMs(state.ingest.last_matching_quote_at, now); - const publishedAgeMs = ageMs(state.ingest.last_published_at, now); - if (matchingAgeMs != null && publishedAgeMs != null && publishedAgeMs > matchingAgeMs + 5_000) { - status = escalateHealth(status, 'critical'); - reasons.push('quote publish path stalled'); - } + + if (highestAlertSeverity === 'critical') { + status = escalateHealth(status, 'critical'); + reasons.push(`critical alert active (${activeAlerts[0]?.alert_code || 'runtime'})`); + } else if (highestAlertSeverity === 'warning') { + status = escalateHealth(status, 'warning'); + reasons.push(`warning alert active (${activeAlerts[0]?.alert_code || 'runtime'})`); } } + if (service !== 'near-intents-ingest' && status === 'healthy') { + label = 'healthy'; + } + if (service !== 'near-intents-ingest' && status === 'paused') { + label = 'paused'; + } + if (service !== 'near-intents-ingest' && status === 'critical' && label === 'critical') { + label = 'critical'; + } + if (service !== 'near-intents-ingest' && status === 'warning' && label === 'warning') { + label = 'warning'; + } + if (service !== 'near-intents-ingest' && status === 'offline') { + label = 'offline'; + } + if (service === 'trade-executor' && state.relay?.connected === false) { status = escalateHealth(status, 'critical'); - reasons.push('solver relay disconnected'); + label = 'relay disconnected'; + if (!reasons.includes('solver relay disconnected')) { + reasons.push('solver relay disconnected'); + } } if (service === 'history-writer') { if (state.database_connectivity === false) { status = escalateHealth(status, 'critical'); + label = 'database disconnected'; reasons.push('database connectivity failed'); } else if (freshnessAgeMs != null && 
freshnessAgeMs > 45_000) { status = escalateHealth(status, 'warning'); + label = 'writer lagging'; reasons.push('writer freshness degraded'); } } @@ -120,6 +141,7 @@ export function deriveServiceHealth({ if (service === 'operator-dashboard') { if ((state.source_error_count || 0) > 0 || (health.source_error_count || 0) > 0) { status = escalateHealth(status, 'warning'); + label = 'sources degraded'; reasons.push('dashboard source degraded'); } } @@ -130,23 +152,96 @@ export function deriveServiceHealth({ && hasCriticalTruthAlert(activeAlerts, activePair) ) { status = escalateHealth(status, 'critical'); + if (label === 'healthy' || label === 'warning') { + label = 'armed on stale truth'; + } reasons.push('armed while critical upstream truth is stale'); } return { service, status, + label, reachable, paused, armed: state.armed ?? null, health_ok: status === 'healthy' || status === 'paused', highest_alert_severity: highestAlertSeverity, - reasons, + reasons: dedupeReasons(reasons), freshness_at: freshnessAt, freshness_age_ms: freshnessAgeMs, }; } +function classifyNearIntentsIngestHealth({ + state, + health, + activeAlerts, + reachable, + now, +}) { + const reasons = []; + const alertCodes = new Set((activeAlerts || []).map((alert) => alert.alert_code)); + const connected = state.ingest?.connected ?? health.connected ?? 
null; + const matchingAgeMs = ageMs(state.ingest?.last_matching_quote_at, now); + const publishedAgeMs = ageMs(state.ingest?.last_published_at, now); + + if (!reachable) { + return { + status: 'offline', + label: 'offline', + reasons: ['service unreachable'], + }; + } + + if (connected === false || alertCodes.has('near_intents_ingest_disconnected')) { + return { + status: 'critical', + label: 'disconnected', + reasons: ['websocket disconnected', 'critical alert active (near_intents_ingest_disconnected)'], + }; + } + + if ( + alertCodes.has('near_intents_publish_stale') + || ( + state.ingest?.last_matching_quote_at + && state.ingest?.last_published_at + && matchingAgeMs != null + && publishedAgeMs != null + && publishedAgeMs > matchingAgeMs + 5_000 + ) + ) { + return { + status: 'critical', + label: 'publish stalled', + reasons: ['quote publish path stalled', 'critical alert active (near_intents_publish_stale)'], + }; + } + + if (alertCodes.has('near_intents_quotes_stale') || health.reason === 'quote truth stale') { + reasons.push('connected, no recent quotes for active pair'); + if (matchingAgeMs != null) { + reasons.push(`last matching quote ${matchingAgeMs}ms ago`); + } + return { + status: 'warning', + label: 'no recent quotes', + reasons, + }; + } + + return { + status: health.ok === false ? 'warning' : 'healthy', + label: connected === true ? 'healthy' : 'unknown', + reasons: health.ok === false ? 
[health.reason || 'service health degraded'] : [], + }; +} + +function dedupeReasons(reasons) { + return [...new Set((reasons || []).filter(Boolean))]; +} + export function inferServiceFreshnessTimestamp(service, state = {}, health = {}) { switch (service) { case 'near-intents-ingest': diff --git a/src/operator-dashboard/static/components/ServiceCard.jsx b/src/operator-dashboard/static/components/ServiceCard.jsx index 9bd9bc9..93c9f07 100644 --- a/src/operator-dashboard/static/components/ServiceCard.jsx +++ b/src/operator-dashboard/static/components/ServiceCard.jsx @@ -2,7 +2,7 @@ import Pill from './Pill.jsx'; import { formatAge, formatBoolean } from '../lib/format.js'; export default function ServiceCard({ service }) { - const healthLabel = service.health_status || (service.health_ok ? 'healthy' : service.reachable ? 'degraded' : 'offline'); + const healthLabel = service.health_label || service.health_status || (service.health_ok ? 'healthy' : service.reachable ? 'degraded' : 'offline'); return (