Refine ingest dashboard health semantics
All checks were successful
deploy / deploy (push) Successful in 36s
All checks were successful
deploy / deploy (push) Successful in 36s
Proof: A connected ingest without recent quotes now renders as a warning-level "no recent quotes" state instead of a generic critical service failure, while the disconnected and publish-stalled cases remain critical. Assumptions: Operators need transport failures and quote-quiet conditions separated on the service card; stale quote truth should still block trust without pretending the websocket is broken. Still fake: External alert delivery remains unconfigured; live alert routing still goes through the generic webhook path, and only when that webhook is configured.
This commit is contained in:
parent
903287ec21
commit
69be378784
4 changed files with 214 additions and 29 deletions
|
|
@ -816,6 +816,7 @@ function summarizeServiceSnapshot(snapshot, { authoritativeHealth = null, active
|
||||||
reachable: snapshot.reachable,
|
reachable: snapshot.reachable,
|
||||||
health_ok: derived.health_ok,
|
health_ok: derived.health_ok,
|
||||||
health_status: derived.status,
|
health_status: derived.status,
|
||||||
|
health_label: derived.label || derived.status,
|
||||||
health_reasons: derived.reasons || [],
|
health_reasons: derived.reasons || [],
|
||||||
highest_alert_severity: derived.highest_alert_severity || null,
|
highest_alert_severity: derived.highest_alert_severity || null,
|
||||||
paused: derived.paused ?? state.paused ?? health.paused ?? null,
|
paused: derived.paused ?? state.paused ?? health.paused ?? null,
|
||||||
|
|
|
||||||
|
|
@ -69,50 +69,71 @@ export function deriveServiceHealth({
|
||||||
const freshnessAgeMs = ageMs(freshnessAt, now);
|
const freshnessAgeMs = ageMs(freshnessAt, now);
|
||||||
const reasons = [];
|
const reasons = [];
|
||||||
let status = paused ? 'paused' : reachable ? 'healthy' : 'offline';
|
let status = paused ? 'paused' : reachable ? 'healthy' : 'offline';
|
||||||
|
let label = status;
|
||||||
|
|
||||||
if (!reachable) {
|
if (!reachable) {
|
||||||
reasons.push('service unreachable');
|
reasons.push('service unreachable');
|
||||||
}
|
label = 'offline';
|
||||||
|
|
||||||
if (health.ok === false && reachable) {
|
|
||||||
status = escalateHealth(status, 'critical');
|
|
||||||
reasons.push(health.reason || 'service health check failed');
|
|
||||||
}
|
|
||||||
|
|
||||||
if (highestAlertSeverity === 'critical') {
|
|
||||||
status = escalateHealth(status, 'critical');
|
|
||||||
reasons.push(`critical alert active (${activeAlerts[0]?.alert_code || 'runtime'})`);
|
|
||||||
} else if (highestAlertSeverity === 'warning') {
|
|
||||||
status = escalateHealth(status, 'warning');
|
|
||||||
reasons.push(`warning alert active (${activeAlerts[0]?.alert_code || 'runtime'})`);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (service === 'near-intents-ingest') {
|
if (service === 'near-intents-ingest') {
|
||||||
if (state.ingest?.connected === false) {
|
const ingestClassification = classifyNearIntentsIngestHealth({
|
||||||
|
state,
|
||||||
|
health,
|
||||||
|
activeAlerts,
|
||||||
|
reachable,
|
||||||
|
now,
|
||||||
|
});
|
||||||
|
status = escalateHealth(status, ingestClassification.status);
|
||||||
|
label = ingestClassification.label;
|
||||||
|
reasons.push(...ingestClassification.reasons);
|
||||||
|
} else {
|
||||||
|
if (health.ok === false && reachable) {
|
||||||
status = escalateHealth(status, 'critical');
|
status = escalateHealth(status, 'critical');
|
||||||
reasons.push('websocket disconnected');
|
reasons.push(health.reason || 'service health check failed');
|
||||||
}
|
}
|
||||||
if (state.ingest?.last_matching_quote_at && state.ingest?.last_published_at) {
|
|
||||||
const matchingAgeMs = ageMs(state.ingest.last_matching_quote_at, now);
|
if (highestAlertSeverity === 'critical') {
|
||||||
const publishedAgeMs = ageMs(state.ingest.last_published_at, now);
|
status = escalateHealth(status, 'critical');
|
||||||
if (matchingAgeMs != null && publishedAgeMs != null && publishedAgeMs > matchingAgeMs + 5_000) {
|
reasons.push(`critical alert active (${activeAlerts[0]?.alert_code || 'runtime'})`);
|
||||||
status = escalateHealth(status, 'critical');
|
} else if (highestAlertSeverity === 'warning') {
|
||||||
reasons.push('quote publish path stalled');
|
status = escalateHealth(status, 'warning');
|
||||||
}
|
reasons.push(`warning alert active (${activeAlerts[0]?.alert_code || 'runtime'})`);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (service !== 'near-intents-ingest' && status === 'healthy') {
|
||||||
|
label = 'healthy';
|
||||||
|
}
|
||||||
|
if (service !== 'near-intents-ingest' && status === 'paused') {
|
||||||
|
label = 'paused';
|
||||||
|
}
|
||||||
|
if (service !== 'near-intents-ingest' && status === 'critical' && label === 'critical') {
|
||||||
|
label = 'critical';
|
||||||
|
}
|
||||||
|
if (service !== 'near-intents-ingest' && status === 'warning' && label === 'warning') {
|
||||||
|
label = 'warning';
|
||||||
|
}
|
||||||
|
if (service !== 'near-intents-ingest' && status === 'offline') {
|
||||||
|
label = 'offline';
|
||||||
|
}
|
||||||
|
|
||||||
if (service === 'trade-executor' && state.relay?.connected === false) {
|
if (service === 'trade-executor' && state.relay?.connected === false) {
|
||||||
status = escalateHealth(status, 'critical');
|
status = escalateHealth(status, 'critical');
|
||||||
reasons.push('solver relay disconnected');
|
label = 'relay disconnected';
|
||||||
|
if (!reasons.includes('solver relay disconnected')) {
|
||||||
|
reasons.push('solver relay disconnected');
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (service === 'history-writer') {
|
if (service === 'history-writer') {
|
||||||
if (state.database_connectivity === false) {
|
if (state.database_connectivity === false) {
|
||||||
status = escalateHealth(status, 'critical');
|
status = escalateHealth(status, 'critical');
|
||||||
|
label = 'database disconnected';
|
||||||
reasons.push('database connectivity failed');
|
reasons.push('database connectivity failed');
|
||||||
} else if (freshnessAgeMs != null && freshnessAgeMs > 45_000) {
|
} else if (freshnessAgeMs != null && freshnessAgeMs > 45_000) {
|
||||||
status = escalateHealth(status, 'warning');
|
status = escalateHealth(status, 'warning');
|
||||||
|
label = 'writer lagging';
|
||||||
reasons.push('writer freshness degraded');
|
reasons.push('writer freshness degraded');
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -120,6 +141,7 @@ export function deriveServiceHealth({
|
||||||
if (service === 'operator-dashboard') {
|
if (service === 'operator-dashboard') {
|
||||||
if ((state.source_error_count || 0) > 0 || (health.source_error_count || 0) > 0) {
|
if ((state.source_error_count || 0) > 0 || (health.source_error_count || 0) > 0) {
|
||||||
status = escalateHealth(status, 'warning');
|
status = escalateHealth(status, 'warning');
|
||||||
|
label = 'sources degraded';
|
||||||
reasons.push('dashboard source degraded');
|
reasons.push('dashboard source degraded');
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -130,23 +152,96 @@ export function deriveServiceHealth({
|
||||||
&& hasCriticalTruthAlert(activeAlerts, activePair)
|
&& hasCriticalTruthAlert(activeAlerts, activePair)
|
||||||
) {
|
) {
|
||||||
status = escalateHealth(status, 'critical');
|
status = escalateHealth(status, 'critical');
|
||||||
|
if (label === 'healthy' || label === 'warning') {
|
||||||
|
label = 'armed on stale truth';
|
||||||
|
}
|
||||||
reasons.push('armed while critical upstream truth is stale');
|
reasons.push('armed while critical upstream truth is stale');
|
||||||
}
|
}
|
||||||
|
|
||||||
return {
|
return {
|
||||||
service,
|
service,
|
||||||
status,
|
status,
|
||||||
|
label,
|
||||||
reachable,
|
reachable,
|
||||||
paused,
|
paused,
|
||||||
armed: state.armed ?? null,
|
armed: state.armed ?? null,
|
||||||
health_ok: status === 'healthy' || status === 'paused',
|
health_ok: status === 'healthy' || status === 'paused',
|
||||||
highest_alert_severity: highestAlertSeverity,
|
highest_alert_severity: highestAlertSeverity,
|
||||||
reasons,
|
reasons: dedupeReasons(reasons),
|
||||||
freshness_at: freshnessAt,
|
freshness_at: freshnessAt,
|
||||||
freshness_age_ms: freshnessAgeMs,
|
freshness_age_ms: freshnessAgeMs,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Classify near-intents-ingest health into a status, a short operator-facing
 * label, and the reasons that explain them.
 *
 * Branches are evaluated in priority order and the first match wins:
 *   1. service unreachable             -> offline  / "offline"
 *   2. websocket disconnected          -> critical / "disconnected"
 *   3. quote publish path stalled      -> critical / "publish stalled"
 *   4. connected, but quotes are stale -> warning  / "no recent quotes"
 *   5. otherwise                       -> healthy, or warning / "degraded"
 *                                         when `health.ok === false`
 *
 * A "critical alert active (...)" reason is only reported when that alert
 * code is actually present in `activeAlerts`; a branch entered purely from
 * local state (e.g. `connected === false`) no longer claims an alert exists.
 *
 * @param {object} params
 * @param {object} params.state - Service state payload; only `state.ingest`
 *   (`connected`, `last_matching_quote_at`, `last_published_at`) is read.
 * @param {object} params.health - Health payload; `ok`, `reason` and
 *   `connected` are read.
 * @param {Array<object>} [params.activeAlerts] - Active alerts; only
 *   `alert_code` is read. Nullish entries are tolerated.
 * @param {boolean} params.reachable - Whether the service was reachable.
 * @param {*} params.now - Reference time, handed to `ageMs` unchanged.
 * @returns {{status: string, label: string, reasons: string[]}}
 */
function classifyNearIntentsIngestHealth({
  state,
  health,
  activeAlerts,
  reachable,
  now,
}) {
  // Publishing may trail the last matching quote by this much before the
  // publish path is considered stalled.
  const PUBLISH_LAG_TOLERANCE_MS = 5_000;

  const alertCodes = new Set((activeAlerts || []).map((alert) => alert?.alert_code));
  // Prefer the state payload; fall back to the health payload; null = unknown.
  const connected = state.ingest?.connected ?? health.connected ?? null;
  const matchingAgeMs = ageMs(state.ingest?.last_matching_quote_at, now);
  const publishedAgeMs = ageMs(state.ingest?.last_published_at, now);

  if (!reachable) {
    return {
      status: 'offline',
      label: 'offline',
      reasons: ['service unreachable'],
    };
  }

  const disconnectAlertActive = alertCodes.has('near_intents_ingest_disconnected');
  if (connected === false || disconnectAlertActive) {
    const reasons = ['websocket disconnected'];
    if (disconnectAlertActive) {
      reasons.push('critical alert active (near_intents_ingest_disconnected)');
    }
    return {
      status: 'critical',
      label: 'disconnected',
      reasons,
    };
  }

  const publishStaleAlertActive = alertCodes.has('near_intents_publish_stale');
  // Stalled when the last publish is older than the last matching quote by
  // more than the tolerance, i.e. quotes arrive but are not being published.
  const publishLagging = Boolean(
    state.ingest?.last_matching_quote_at
    && state.ingest?.last_published_at
    && matchingAgeMs != null
    && publishedAgeMs != null
    && publishedAgeMs > matchingAgeMs + PUBLISH_LAG_TOLERANCE_MS,
  );
  if (publishStaleAlertActive || publishLagging) {
    const reasons = ['quote publish path stalled'];
    if (publishStaleAlertActive) {
      reasons.push('critical alert active (near_intents_publish_stale)');
    }
    return {
      status: 'critical',
      label: 'publish stalled',
      reasons,
    };
  }

  if (alertCodes.has('near_intents_quotes_stale') || health.reason === 'quote truth stale') {
    // Transport is fine; the feed is merely quiet for the active pair.
    const reasons = ['connected, no recent quotes for active pair'];
    if (matchingAgeMs != null) {
      reasons.push(`last matching quote ${matchingAgeMs}ms ago`);
    }
    return {
      status: 'warning',
      label: 'no recent quotes',
      reasons,
    };
  }

  if (health.ok === false) {
    // Keep the label in step with the warning status so the service card
    // never shows "healthy" for a failing health check.
    return {
      status: 'warning',
      label: 'degraded',
      reasons: [health.reason || 'service health degraded'],
    };
  }

  return {
    status: 'healthy',
    label: connected === true ? 'healthy' : 'unknown',
    reasons: [],
  };
}
|
||||||
|
|
||||||
|
/**
 * Drop falsy entries and duplicates from a list of reason strings,
 * keeping the first occurrence of each in its original position.
 *
 * @param {Array<string>} [reasons] - Reasons to clean up; nullish is treated as empty.
 * @returns {Array<string>} A new array of unique, truthy reasons.
 */
function dedupeReasons(reasons) {
  const unique = new Set();
  for (const reason of reasons || []) {
    if (reason) {
      unique.add(reason);
    }
  }
  return Array.from(unique);
}
|
||||||
|
|
||||||
export function inferServiceFreshnessTimestamp(service, state = {}, health = {}) {
|
export function inferServiceFreshnessTimestamp(service, state = {}, health = {}) {
|
||||||
switch (service) {
|
switch (service) {
|
||||||
case 'near-intents-ingest':
|
case 'near-intents-ingest':
|
||||||
|
|
|
||||||
|
|
@ -2,7 +2,7 @@ import Pill from './Pill.jsx';
|
||||||
import { formatAge, formatBoolean } from '../lib/format.js';
|
import { formatAge, formatBoolean } from '../lib/format.js';
|
||||||
|
|
||||||
export default function ServiceCard({ service }) {
|
export default function ServiceCard({ service }) {
|
||||||
const healthLabel = service.health_status || (service.health_ok ? 'healthy' : service.reachable ? 'degraded' : 'offline');
|
const healthLabel = service.health_label || service.health_status || (service.health_ok ? 'healthy' : service.reachable ? 'degraded' : 'offline');
|
||||||
|
|
||||||
return (
|
return (
|
||||||
<div className="service-card">
|
<div className="service-card">
|
||||||
|
|
|
||||||
|
|
@ -446,13 +446,14 @@ test('system service health uses sentinel-derived severity so stale ingest is ne
|
||||||
recent_transitions: [],
|
recent_transitions: [],
|
||||||
service_health: [{
|
service_health: [{
|
||||||
service: 'near-intents-ingest',
|
service: 'near-intents-ingest',
|
||||||
status: 'critical',
|
status: 'warning',
|
||||||
|
label: 'no recent quotes',
|
||||||
reachable: true,
|
reachable: true,
|
||||||
paused: false,
|
paused: false,
|
||||||
armed: null,
|
armed: null,
|
||||||
health_ok: false,
|
health_ok: false,
|
||||||
highest_alert_severity: 'critical',
|
highest_alert_severity: 'critical',
|
||||||
reasons: ['critical alert active (near_intents_publish_stale)'],
|
reasons: ['connected, no recent quotes for active pair'],
|
||||||
freshness_at: '2026-04-03T02:12:00.000Z',
|
freshness_at: '2026-04-03T02:12:00.000Z',
|
||||||
freshness_age_ms: 110_880_000,
|
freshness_age_ms: 110_880_000,
|
||||||
}],
|
}],
|
||||||
|
|
@ -463,11 +464,99 @@ test('system service health uses sentinel-derived severity so stale ingest is ne
|
||||||
|
|
||||||
const ingest = bootstrap.system.service_health.find((service) => service.service === 'near-intents-ingest');
|
const ingest = bootstrap.system.service_health.find((service) => service.service === 'near-intents-ingest');
|
||||||
assert.equal(ingest.health_ok, false);
|
assert.equal(ingest.health_ok, false);
|
||||||
assert.equal(ingest.health_status, 'critical');
|
assert.equal(ingest.health_status, 'warning');
|
||||||
assert.match(ingest.health_reasons.join(' '), /critical alert active/);
|
assert.equal(ingest.health_label, 'no recent quotes');
|
||||||
|
assert.match(ingest.health_reasons.join(' '), /connected, no recent quotes/);
|
||||||
assert.equal(bootstrap.status_bar.highest_alert_severity, 'critical');
|
assert.equal(bootstrap.status_bar.highest_alert_severity, 'critical');
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// A reachable ingest whose websocket is down, with the matching sentinel alert
// raised, must surface as critical with the specific "disconnected" label.
test('ingest disconnected still renders as a critical transport failure', () => {
  const config = buildConfig();

  // Ingest service snapshot: reachable over HTTP, but the websocket is down.
  const ingestSnapshot = {
    service: 'near-intents-ingest',
    label: 'Intents Ingest',
    base_url: 'http://near-intents-ingest',
    reachable: true,
    health: { ok: false, connected: false, reason: 'websocket disconnected' },
    state: {
      ingest: {
        connected: false,
        last_message_at: '2026-04-04T09:00:00.000Z',
      },
    },
  };

  // Alert raised by the sentinel for the disconnected ingest.
  const disconnectedAlert = {
    alert_code: 'near_intents_ingest_disconnected',
    status: 'raised',
    severity: 'critical',
    reason: 'websocket disconnected',
    service_scope: 'near-intents-ingest',
    pair: config.activePair,
    raised_at: '2026-04-04T09:30:00.000Z',
    first_raised_at: '2026-04-04T09:30:00.000Z',
    cleared_at: null,
    last_evaluated_at: '2026-04-04T09:30:00.000Z',
    details: {},
  };

  // The sentinel's own view of ingest health.
  const sentinelIngestHealth = {
    service: 'near-intents-ingest',
    status: 'critical',
    label: 'disconnected',
    reachable: true,
    paused: false,
    armed: null,
    health_ok: false,
    highest_alert_severity: 'critical',
    reasons: ['websocket disconnected', 'critical alert active (near_intents_ingest_disconnected)'],
    freshness_at: '2026-04-04T09:00:00.000Z',
    freshness_age_ms: 60_000,
  };

  const sentinelSnapshot = {
    service: 'ops-sentinel',
    label: 'Ops Sentinel',
    base_url: 'http://ops-sentinel',
    reachable: true,
    health: { ok: true },
    state: {
      active_alerts: [disconnectedAlert],
      recent_transitions: [],
      service_health: [sentinelIngestHealth],
    },
  };

  const bootstrap = buildDashboardBootstrap({
    config,
    auth: {
      authenticated: true,
      subject: 'local-operator',
      mode: 'stub',
      roles: ['operator'],
    },
    portfolioMetric: null,
    inventorySnapshot: null,
    marketPrice: null,
    recentQuotes: [],
    successfulTrades: {
      page: 1,
      page_size: 20,
      total: 0,
      total_pages: 1,
      items: [],
    },
    successfulTradeSummary: {
      total: 0,
      last_successful_trade_at: null,
    },
    fundingObservations: [],
    recentTradeDecisions: [],
    recentAlertTransitions: [],
    serviceSnapshots: [ingestSnapshot, sentinelSnapshot],
  });

  const ingest = bootstrap.system.service_health.find(
    (entry) => entry.service === 'near-intents-ingest',
  );
  assert.equal(ingest.health_status, 'critical');
  assert.equal(ingest.health_label, 'disconnected');
  assert.match(ingest.health_reasons.join(' '), /websocket disconnected/);
});
|
||||||
|
|
||||||
test('funding summary includes credited bridge deposits without observer-backed funding observations', () => {
|
test('funding summary includes credited bridge deposits without observer-backed funding observations', () => {
|
||||||
const config = buildConfig();
|
const config = buildConfig();
|
||||||
const bootstrap = buildDashboardBootstrap({
|
const bootstrap = buildDashboardBootstrap({
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue