Refine ingest dashboard health semantics
All checks were successful
deploy / deploy (push) Successful in 36s

Proof: Connected ingest without recent quotes now renders as a warning-level no-recent-quotes state instead of a generic critical service failure, while disconnected and publish-stalled cases remain critical.

Assumptions: Operators need transport failure and quote-quiet conditions separated on the service card; stale quote truth should still block trust without pretending the websocket is broken.

Still fake: External alert delivery remains unconfigured; live alerts are still routed only through the generic webhook path, and only when that webhook is configured.
This commit is contained in:
philipp 2026-04-08 20:36:11 +02:00
parent 903287ec21
commit 69be378784
4 changed files with 214 additions and 29 deletions

View file

@ -816,6 +816,7 @@ function summarizeServiceSnapshot(snapshot, { authoritativeHealth = null, active
reachable: snapshot.reachable,
health_ok: derived.health_ok,
health_status: derived.status,
health_label: derived.label || derived.status,
health_reasons: derived.reasons || [],
highest_alert_severity: derived.highest_alert_severity || null,
paused: derived.paused ?? state.paused ?? health.paused ?? null,

View file

@ -69,11 +69,25 @@ export function deriveServiceHealth({
const freshnessAgeMs = ageMs(freshnessAt, now);
const reasons = [];
let status = paused ? 'paused' : reachable ? 'healthy' : 'offline';
let label = status;
if (!reachable) {
reasons.push('service unreachable');
label = 'offline';
}
if (service === 'near-intents-ingest') {
const ingestClassification = classifyNearIntentsIngestHealth({
state,
health,
activeAlerts,
reachable,
now,
});
status = escalateHealth(status, ingestClassification.status);
label = ingestClassification.label;
reasons.push(...ingestClassification.reasons);
} else {
if (health.ok === false && reachable) {
status = escalateHealth(status, 'critical');
reasons.push(health.reason || 'service health check failed');
@ -86,33 +100,40 @@ export function deriveServiceHealth({
status = escalateHealth(status, 'warning');
reasons.push(`warning alert active (${activeAlerts[0]?.alert_code || 'runtime'})`);
}
}
if (service === 'near-intents-ingest') {
if (state.ingest?.connected === false) {
status = escalateHealth(status, 'critical');
reasons.push('websocket disconnected');
if (service !== 'near-intents-ingest' && status === 'healthy') {
label = 'healthy';
}
if (state.ingest?.last_matching_quote_at && state.ingest?.last_published_at) {
const matchingAgeMs = ageMs(state.ingest.last_matching_quote_at, now);
const publishedAgeMs = ageMs(state.ingest.last_published_at, now);
if (matchingAgeMs != null && publishedAgeMs != null && publishedAgeMs > matchingAgeMs + 5_000) {
status = escalateHealth(status, 'critical');
reasons.push('quote publish path stalled');
if (service !== 'near-intents-ingest' && status === 'paused') {
label = 'paused';
}
if (service !== 'near-intents-ingest' && status === 'critical' && label === 'critical') {
label = 'critical';
}
if (service !== 'near-intents-ingest' && status === 'warning' && label === 'warning') {
label = 'warning';
}
if (service !== 'near-intents-ingest' && status === 'offline') {
label = 'offline';
}
if (service === 'trade-executor' && state.relay?.connected === false) {
status = escalateHealth(status, 'critical');
label = 'relay disconnected';
if (!reasons.includes('solver relay disconnected')) {
reasons.push('solver relay disconnected');
}
}
if (service === 'history-writer') {
if (state.database_connectivity === false) {
status = escalateHealth(status, 'critical');
label = 'database disconnected';
reasons.push('database connectivity failed');
} else if (freshnessAgeMs != null && freshnessAgeMs > 45_000) {
status = escalateHealth(status, 'warning');
label = 'writer lagging';
reasons.push('writer freshness degraded');
}
}
@ -120,6 +141,7 @@ export function deriveServiceHealth({
if (service === 'operator-dashboard') {
if ((state.source_error_count || 0) > 0 || (health.source_error_count || 0) > 0) {
status = escalateHealth(status, 'warning');
label = 'sources degraded';
reasons.push('dashboard source degraded');
}
}
@ -130,23 +152,96 @@ export function deriveServiceHealth({
&& hasCriticalTruthAlert(activeAlerts, activePair)
) {
status = escalateHealth(status, 'critical');
if (label === 'healthy' || label === 'warning') {
label = 'armed on stale truth';
}
reasons.push('armed while critical upstream truth is stale');
}
return {
service,
status,
label,
reachable,
paused,
armed: state.armed ?? null,
health_ok: status === 'healthy' || status === 'paused',
highest_alert_severity: highestAlertSeverity,
reasons,
reasons: dedupeReasons(reasons),
freshness_at: freshnessAt,
freshness_age_ms: freshnessAgeMs,
};
}
/**
 * Classifies near-intents-ingest health into a status, an operator-facing
 * label, and the reasons behind them.
 *
 * Checks run in priority order and the first match wins:
 *   1. service unreachable        -> offline  / "offline"
 *   2. websocket disconnected     -> critical / "disconnected"
 *   3. publish path stalled       -> critical / "publish stalled"
 *   4. connected, quotes quiet    -> warning  / "no recent quotes"
 *   5. otherwise                  -> `health.ok` decides healthy vs. warning
 *
 * A "critical alert active (...)" reason is only reported when the matching
 * alert code is actually present in `activeAlerts`.
 *
 * @param {object} params
 * @param {object} [params.state] - Service state; only `state.ingest` is read.
 * @param {object} [params.health] - Health payload; reads `ok`, `connected`, `reason`.
 * @param {Array<object>} [params.activeAlerts] - Active alerts; only `alert_code` is read.
 * @param {boolean} params.reachable - Whether the service could be reached.
 * @param {*} params.now - Reference time, forwarded unchanged to `ageMs`.
 * @returns {{status: string, label: string, reasons: string[]}} Classification result.
 */
function classifyNearIntentsIngestHealth({
  state,
  health,
  activeAlerts,
  reachable,
  now,
}) {
  if (!reachable) {
    return {
      status: 'offline',
      label: 'offline',
      reasons: ['service unreachable'],
    };
  }

  // Tolerate missing/null inputs so a partial snapshot cannot throw here.
  const ingest = state?.ingest ?? {};
  const healthInfo = health ?? {};
  const alertCodes = new Set((activeAlerts || []).map((alert) => alert?.alert_code));
  const connected = ingest.connected ?? healthInfo.connected ?? null;
  const matchingAgeMs = ageMs(ingest.last_matching_quote_at, now);
  const publishedAgeMs = ageMs(ingest.last_published_at, now);

  const disconnectAlertActive = alertCodes.has('near_intents_ingest_disconnected');
  if (connected === false || disconnectAlertActive) {
    const reasons = ['websocket disconnected'];
    if (disconnectAlertActive) {
      reasons.push('critical alert active (near_intents_ingest_disconnected)');
    }
    return {
      status: 'critical',
      label: 'disconnected',
      reasons,
    };
  }

  // Publishing is considered stalled when the last publish is more than this
  // many milliseconds older than the last matching quote.
  const publishLagToleranceMs = 5_000;
  const publishAlertActive = alertCodes.has('near_intents_publish_stale');
  const publishLagging = Boolean(
    ingest.last_matching_quote_at
    && ingest.last_published_at
    && matchingAgeMs != null
    && publishedAgeMs != null
    && publishedAgeMs > matchingAgeMs + publishLagToleranceMs
  );
  if (publishAlertActive || publishLagging) {
    const reasons = ['quote publish path stalled'];
    if (publishAlertActive) {
      reasons.push('critical alert active (near_intents_publish_stale)');
    }
    return {
      status: 'critical',
      label: 'publish stalled',
      reasons,
    };
  }

  if (alertCodes.has('near_intents_quotes_stale') || healthInfo.reason === 'quote truth stale') {
    const reasons = ['connected, no recent quotes for active pair'];
    if (matchingAgeMs != null) {
      reasons.push(`last matching quote ${matchingAgeMs}ms ago`);
    }
    return {
      status: 'warning',
      label: 'no recent quotes',
      reasons,
    };
  }

  // Keep the label in step with the status: a degraded-but-connected service
  // must not be labelled "healthy" while its status is "warning".
  const degraded = healthInfo.ok === false;
  let label = 'unknown';
  if (connected === true) {
    label = degraded ? 'warning' : 'healthy';
  }
  return {
    status: degraded ? 'warning' : 'healthy',
    label,
    reasons: degraded ? [healthInfo.reason || 'service health degraded'] : [],
  };
}
/**
 * Removes falsy entries and duplicates from a list of reasons.
 *
 * The first occurrence of each reason keeps its position. A falsy `reasons`
 * argument is treated as an empty list.
 *
 * @param {Array<*>} reasons - Candidate reasons, possibly with blanks or repeats.
 * @returns {Array<*>} Unique truthy reasons in first-seen order.
 */
function dedupeReasons(reasons) {
  const candidates = reasons || [];
  const present = candidates.filter((reason) => Boolean(reason));
  const unique = new Set(present);
  return Array.from(unique);
}
export function inferServiceFreshnessTimestamp(service, state = {}, health = {}) {
switch (service) {
case 'near-intents-ingest':

View file

@ -2,7 +2,7 @@ import Pill from './Pill.jsx';
import { formatAge, formatBoolean } from '../lib/format.js';
export default function ServiceCard({ service }) {
const healthLabel = service.health_status || (service.health_ok ? 'healthy' : service.reachable ? 'degraded' : 'offline');
const healthLabel = service.health_label || service.health_status || (service.health_ok ? 'healthy' : service.reachable ? 'degraded' : 'offline');
return (
<div className="service-card">

View file

@ -446,13 +446,14 @@ test('system service health uses sentinel-derived severity so stale ingest is ne
recent_transitions: [],
service_health: [{
service: 'near-intents-ingest',
status: 'critical',
status: 'warning',
label: 'no recent quotes',
reachable: true,
paused: false,
armed: null,
health_ok: false,
highest_alert_severity: 'critical',
reasons: ['critical alert active (near_intents_publish_stale)'],
reasons: ['connected, no recent quotes for active pair'],
freshness_at: '2026-04-03T02:12:00.000Z',
freshness_age_ms: 110_880_000,
}],
@ -463,11 +464,99 @@ test('system service health uses sentinel-derived severity so stale ingest is ne
const ingest = bootstrap.system.service_health.find((service) => service.service === 'near-intents-ingest');
assert.equal(ingest.health_ok, false);
assert.equal(ingest.health_status, 'critical');
assert.match(ingest.health_reasons.join(' '), /critical alert active/);
assert.equal(ingest.health_status, 'warning');
assert.equal(ingest.health_label, 'no recent quotes');
assert.match(ingest.health_reasons.join(' '), /connected, no recent quotes/);
assert.equal(bootstrap.status_bar.highest_alert_severity, 'critical');
});
// Verifies that a reachable ingest service whose websocket is down, with the
// matching sentinel alert raised, is surfaced as critical / "disconnected".
test('ingest disconnected still renders as a critical transport failure', () => {
  const config = buildConfig();

  // Authenticated stub operator used for the bootstrap call.
  const operatorAuth = {
    authenticated: true,
    subject: 'local-operator',
    mode: 'stub',
    roles: ['operator'],
  };

  // Empty trade history: a single empty page and no last-success timestamp.
  const emptyTradesPage = {
    page: 1,
    page_size: 20,
    total: 0,
    total_pages: 1,
    items: [],
  };
  const emptyTradeSummary = {
    total: 0,
    last_successful_trade_at: null,
  };

  // Ingest snapshot: reachable over HTTP, but reporting a dropped websocket.
  const ingestSnapshot = {
    service: 'near-intents-ingest',
    label: 'Intents Ingest',
    base_url: 'http://near-intents-ingest',
    reachable: true,
    health: { ok: false, connected: false, reason: 'websocket disconnected' },
    state: {
      ingest: {
        connected: false,
        last_message_at: '2026-04-04T09:00:00.000Z',
      },
    },
  };

  // Sentinel view of the same incident: a raised critical alert plus the
  // sentinel's own health entry for the ingest service.
  const disconnectedAlert = {
    alert_code: 'near_intents_ingest_disconnected',
    status: 'raised',
    severity: 'critical',
    reason: 'websocket disconnected',
    service_scope: 'near-intents-ingest',
    pair: config.activePair,
    raised_at: '2026-04-04T09:30:00.000Z',
    first_raised_at: '2026-04-04T09:30:00.000Z',
    cleared_at: null,
    last_evaluated_at: '2026-04-04T09:30:00.000Z',
    details: {},
  };
  const sentinelIngestHealth = {
    service: 'near-intents-ingest',
    status: 'critical',
    label: 'disconnected',
    reachable: true,
    paused: false,
    armed: null,
    health_ok: false,
    highest_alert_severity: 'critical',
    reasons: ['websocket disconnected', 'critical alert active (near_intents_ingest_disconnected)'],
    freshness_at: '2026-04-04T09:00:00.000Z',
    freshness_age_ms: 60_000,
  };
  const sentinelSnapshot = {
    service: 'ops-sentinel',
    label: 'Ops Sentinel',
    base_url: 'http://ops-sentinel',
    reachable: true,
    health: { ok: true },
    state: {
      active_alerts: [disconnectedAlert],
      recent_transitions: [],
      service_health: [sentinelIngestHealth],
    },
  };

  const bootstrap = buildDashboardBootstrap({
    config,
    auth: operatorAuth,
    portfolioMetric: null,
    inventorySnapshot: null,
    marketPrice: null,
    recentQuotes: [],
    successfulTrades: emptyTradesPage,
    successfulTradeSummary: emptyTradeSummary,
    fundingObservations: [],
    recentTradeDecisions: [],
    recentAlertTransitions: [],
    serviceSnapshots: [ingestSnapshot, sentinelSnapshot],
  });

  const ingest = bootstrap.system.service_health.find(
    (entry) => entry.service === 'near-intents-ingest',
  );
  assert.equal(ingest.health_status, 'critical');
  assert.equal(ingest.health_label, 'disconnected');
  assert.match(ingest.health_reasons.join(' '), /websocket disconnected/);
});
test('funding summary includes credited bridge deposits without observer-backed funding observations', () => {
const config = buildConfig();
const bootstrap = buildDashboardBootstrap({