#!/usr/bin/env bash
#
# Read-only bounded observation of the Kubernetes websocket recorder canary.
#
# Flow:
#   1. Wait for the REST collector and websocket recorder rollouts.
#   2. Sample the recorder manifest and raw files inside the websocket pod.
#   3. Sleep for the observation window, then sample again.
#   4. Compare both samples locally, write a JSON evidence file and print a
#      short verdict.
#
# Environment overrides (each has a matching command-line option):
#   ORDERBOOKS_K8S_NAMESPACE, ORDERBOOKS_WS_DEPLOYMENT,
#   ORDERBOOKS_REST_DEPLOYMENT, ORDERBOOKS_K8S_WS_RELIABILITY_WAIT_SECONDS
#
# Exit status: 0 when the gate passes, 1 when it is blocked or a command
# fails, 2 on usage errors or missing local tools.
set -euo pipefail

NAMESPACE="${ORDERBOOKS_K8S_NAMESPACE:-orderbooks}"
WS_DEPLOYMENT="${ORDERBOOKS_WS_DEPLOYMENT:-orderbooks-ws-recorder}"
REST_DEPLOYMENT="${ORDERBOOKS_REST_DEPLOYMENT:-orderbooks-collector}"
WAIT_SECONDS="${ORDERBOOKS_K8S_WS_RELIABILITY_WAIT_SECONDS:-1800}"
OUTPUT_PATH=""
RAW_DIR="/var/lib/orderbooks/raw_orderbooks"
MANIFEST_PATH="/var/lib/orderbooks/manifests/polymarket_ws_recorder_latest.json"

# Print the command-line help text to stdout.
usage() {
  cat <<'EOF'
Usage: scripts/k8s_ws_reliability_check.sh [options]

Read-only bounded observation for the Kubernetes websocket recorder canary.
It writes compact local JSON evidence and does not print secret contents.

Options:
  --namespace NAME          Namespace. Default: orderbooks.
  --deployment NAME         Websocket recorder Deployment. Default: orderbooks-ws-recorder.
  --rest-deployment NAME    REST collector Deployment to prove unchanged. Default: orderbooks-collector.
  --wait-seconds N          Observation window. Default: 1800.
  --output PATH             Local evidence JSON path.
  --raw-dir PATH            In-pod raw root. Default: /var/lib/orderbooks/raw_orderbooks.
  --manifest-path PATH      In-pod websocket manifest path.
  --help                    Show help.
EOF
}

# Abort with a usage error when an option is the last argument and therefore
# has no value. Without this check "set -u" would stop the script with a bare
# "unbound variable" message.
# Arguments: the remaining command-line arguments ("$@").
need_value() {
  if [[ $# -lt 2 ]]; then
    echo "missing value for $1" >&2
    usage >&2
    exit 2
  fi
}

while [[ $# -gt 0 ]]; do
  case "$1" in
    --namespace)
      need_value "$@"
      NAMESPACE="$2"
      shift 2
      ;;
    --deployment)
      need_value "$@"
      WS_DEPLOYMENT="$2"
      shift 2
      ;;
    --rest-deployment)
      need_value "$@"
      REST_DEPLOYMENT="$2"
      shift 2
      ;;
    --wait-seconds)
      need_value "$@"
      WAIT_SECONDS="$2"
      shift 2
      ;;
    --output)
      need_value "$@"
      OUTPUT_PATH="$2"
      shift 2
      ;;
    --raw-dir)
      need_value "$@"
      RAW_DIR="$2"
      shift 2
      ;;
    --manifest-path)
      need_value "$@"
      MANIFEST_PATH="$2"
      shift 2
      ;;
    --help)
      usage
      exit 0
      ;;
    *)
      echo "unknown argument: $1" >&2
      usage >&2
      exit 2
      ;;
  esac
done

command -v kubectl >/dev/null 2>&1 || { echo "kubectl is required" >&2; exit 2; }
command -v python3 >/dev/null 2>&1 || { echo "python3 is required" >&2; exit 2; }

# Fail fast on a bad window. The evidence writer applies int(float(...)) to
# this value only after the wait, so a value that conversion rejects would
# otherwise be discovered after the whole observation window has elapsed.
if ! python3 -c 'import sys; v = float(sys.argv[1]); int(v); sys.exit(0 if v >= 0 else 1)' \
  "$WAIT_SECONDS" 2>/dev/null; then
  echo "--wait-seconds must be a non-negative number: $WAIT_SECONDS" >&2
  exit 2
fi

RUN_ID="$(date -u +%Y%m%dT%H%M%SZ)"
if [[ -z "$OUTPUT_PATH" ]]; then
  OUTPUT_PATH="data/manifests/ws_reliability_observation_${RUN_ID}.json"
fi
mkdir -p -- "$(dirname -- "$OUTPUT_PATH")"

# Private scratch directory. Deliberately not called TMPDIR: that name is
# consulted by mktemp and Python's tempfile, and reassigning an exported
# TMPDIR would change it for every child process.
WORK_DIR="$(mktemp -d)"
trap 'rm -rf -- "$WORK_DIR"' EXIT

# Print the name of one Running pod selected by a Deployment's matchLabels.
# Globals:   NAMESPACE (read)
# Arguments: $1 - Deployment name
# Outputs:   first Running pod name on stdout; nothing when none is Running
# Returns:   non-zero when the Deployment cannot be read, has no matchLabels,
#            or the pod listing fails
pod_for_deployment() {
  local deployment="$1"
  local selector
  selector="$(
    kubectl -n "$NAMESPACE" get deployment "$deployment" -o json \
      | python3 -c 'import json, sys; labels=json.load(sys.stdin)["spec"]["selector"].get("matchLabels") or {}; print(",".join(f"{k}={v}" for k,v in sorted(labels.items())))'
  )" || return 1
  if [[ -z "$selector" ]]; then
    echo "deployment ${deployment} has no matchLabels selector" >&2
    return 1
  fi
  kubectl -n "$NAMESPACE" get pod -l "$selector" \
    -o jsonpath='{.items[?(@.status.phase=="Running")].metadata.name}' \
    | awk '{print $1}'
}

kubectl -n "$NAMESPACE" rollout status "deployment/${REST_DEPLOYMENT}" --timeout=120s >/dev/null
kubectl -n "$NAMESPACE" rollout status "deployment/${WS_DEPLOYMENT}" --timeout=180s >/dev/null

# Snapshot the REST collector so the evidence can show it was left unchanged.
REST_IMAGE_BEFORE="$(kubectl -n "$NAMESPACE" get deployment "$REST_DEPLOYMENT" \
  -o jsonpath='{.spec.template.spec.containers[0].image}')"
REST_READY_BEFORE="$(kubectl -n "$NAMESPACE" get deployment "$REST_DEPLOYMENT" \
  -o jsonpath='{.status.readyReplicas}/{.spec.replicas}')"
WS_IMAGE="$(kubectl -n "$NAMESPACE" get deployment "$WS_DEPLOYMENT" \
  -o jsonpath='{.spec.template.spec.containers[0].image}')"

WS_POD="$(pod_for_deployment "$WS_DEPLOYMENT")"
[[ -n "$WS_POD" ]] || { echo "missing running websocket pod" >&2; exit 1; }

# Sampler executed inside the websocket pod. It reads RAW_DIR and
# MANIFEST_PATH from its environment and prints one single-line JSON object.
SUMMARY_PY="$WORK_DIR/reliability-summary.py"
cat >"$SUMMARY_PY" <<'PY_SUMMARY'
import json, os, time
from pathlib import Path

raw=Path(os.environ['RAW_DIR'])
manifest_path=Path(os.environ['MANIFEST_PATH'])
o=json.loads(manifest_path.read_text())

def file_summary(path):
    # Describe one file: existence, size and UTC mtime.
    if not path:
        return None
    p=Path(path)
    if not p.exists():
        return {'path': str(p), 'exists': False}
    st=p.stat()
    return {'path': str(p), 'exists': True, 'bytes': st.st_size, 'mtime_utc': time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime(st.st_mtime))}

open_files=o.get('open_files') or []
ws_open=next((x for x in open_files if 'ws_raw' in str(x.get('path'))), None)
rest_open=next((x for x in open_files if 'rest_checkpoints' in str(x.get('path'))), None)
# Sorted oldest to newest by mtime, so [-1] is the most recently modified file.
ws_files=sorted(raw.glob('polymarket/ws_raw/**/*.jsonl.gz'), key=lambda p:p.stat().st_mtime)
rest_files=sorted(raw.glob('polymarket/rest_checkpoints/**/*.jsonl.gz'), key=lambda p:p.stat().st_mtime)
print(json.dumps({
    'sampled_at_utc': time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime()),
    'manifest_path': str(manifest_path),
    'manifest_mtime_utc': time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime(manifest_path.stat().st_mtime)),
    'run_id': o.get('run_id'),
    'status': o.get('status'),
    'gate_status': o.get('gate_status'),
    'updated_at_utc': o.get('updated_at_utc'),
    'seconds_since_last_ws_message': o.get('seconds_since_last_ws_message'),
    'last_successful_ws_message_at_utc': o.get('last_successful_ws_message_at_utc'),
    'current_subscription_token_ids': o.get('current_subscription_token_ids'),
    'current_tracked_market_end_times': o.get('current_tracked_market_end_times'),
    'current_session': o.get('current_session'),
    'recent_sessions': (o.get('recent_sessions') or [])[-5:],
    'counters': o.get('counters') or {},
    'ws_open_file': ws_open,
    'rest_open_file': rest_open,
    'latest_closed_ws': file_summary(ws_files[-1]) if ws_files else None,
    'latest_closed_rest': file_summary(rest_files[-1]) if rest_files else None,
}, sort_keys=True))
PY_SUMMARY

# Run the sampler inside the current websocket pod.
# Globals: NAMESPACE, WS_POD, RAW_DIR, MANIFEST_PATH, SUMMARY_PY (read)
# Outputs: one line of JSON on stdout
sample_pod() {
  kubectl -n "$NAMESPACE" exec "$WS_POD" -- \
    env RAW_DIR="$RAW_DIR" MANIFEST_PATH="$MANIFEST_PATH" \
    python3 -c "$(<"$SUMMARY_PY")"
}

START_JSON="$(sample_pod)"
sleep "$WAIT_SECONDS"

kubectl -n "$NAMESPACE" rollout status "deployment/${WS_DEPLOYMENT}" --timeout=180s >/dev/null

# The pod may have been replaced during the window; sample whichever pod is
# Running now.
WS_POD="$(pod_for_deployment "$WS_DEPLOYMENT")"
[[ -n "$WS_POD" ]] || { echo "missing running websocket pod after observation window" >&2; exit 1; }
END_JSON="$(sample_pod)"

REST_IMAGE_AFTER="$(kubectl -n "$NAMESPACE" get deployment "$REST_DEPLOYMENT" \
  -o jsonpath='{.spec.template.spec.containers[0].image}')"
REST_READY_AFTER="$(kubectl -n "$NAMESPACE" get deployment "$REST_DEPLOYMENT" \
  -o jsonpath='{.status.readyReplicas}/{.spec.replicas}')"

# Local evaluator: reads the start and end samples (one JSON document per
# line) from stdin, writes the evidence file and exits 0 only when the gate
# passes.
WRITE_PY="$WORK_DIR/write.py"
cat >"$WRITE_PY" <<'PY_WRITE'
import datetime as dt, json, sys
from pathlib import Path

(output_path, namespace, ws_deployment, rest_deployment, wait_seconds, ws_image,
 rest_image_before, rest_ready_before, rest_image_after, rest_ready_after)=sys.argv[1:11]
start=json.loads(sys.stdin.readline())
end=json.loads(sys.stdin.readline())

def c(obj, key):
    # Look up a counter in a sample; None when absent.
    return (obj.get('counters') or {}).get(key)

def num(value):
    # Treat a missing counter as zero.
    return 0 if value is None else value

ws_delta=num(c(end,'websocket_message_count'))-num(c(start,'websocket_message_count'))
rest_delta=num(c(end,'rest_success_count'))-num(c(start,'rest_success_count'))
stale_delta=num(c(end,'stale_feed_count'))-num(c(start,'stale_feed_count'))
reconnect_delta=num(c(end,'reconnect_count'))-num(c(start,'reconnect_count'))
# Absolute value from the end sample, not a delta.
parse_errors=num(c(end,'websocket_parse_error_count'))
threshold=float(((end.get('counters') or {}).get('stale_feed_threshold_seconds') or 90))
seconds_since=end.get('seconds_since_last_ws_message')
active_tokens=len(end.get('current_subscription_token_ids') or [])
start_ws_open=start.get('ws_open_file') or {}
end_ws_open=end.get('ws_open_file') or {}
file_advanced=False
if start_ws_open and end_ws_open:
    # Advanced means: a different open file, more bytes, or a different mtime.
    file_advanced = (
        end_ws_open.get('path') != start_ws_open.get('path')
        or num(end_ws_open.get('bytes')) > num(start_ws_open.get('bytes'))
        or end_ws_open.get('mtime_utc') != start_ws_open.get('mtime_utc')
    )

reasons=[]
if active_tokens and ws_delta <= 0:
    reasons.append('websocket_message_count did not increase while active tokens existed')
if active_tokens and not file_advanced:
    reasons.append('websocket open file did not advance while active tokens existed')
if rest_delta <= 0:
    reasons.append('REST checkpoint success count did not increase')
if parse_errors != 0:
    reasons.append('websocket_parse_error_count is nonzero')
if active_tokens and seconds_since is not None and seconds_since > max(threshold * 2, 180):
    reasons.append('seconds_since_last_ws_message exceeded reliability threshold')
if reconnect_delta > 3 or stale_delta > 3:
    reasons.append('reconnect/stale counters grew rapidly during observation')
if rest_image_before != rest_image_after or rest_ready_before != rest_ready_after:
    reasons.append('REST collector image/readiness changed')

gate='WS_RELIABILITY_OBSERVATION_PASS' if not reasons else 'BLOCKED_WS_STALE_LOOP'
manifest={
    'schema_name':'ws_reliability_observation',
    'schema_version':1,
    # dt.timezone.utc rather than dt.UTC: the alias only exists on Python
    # 3.11+, and failing here would discard the whole observation window.
    'written_at_utc':dt.datetime.now(dt.timezone.utc).replace(microsecond=0).isoformat().replace('+00:00','Z'),
    'gate_status':gate,
    'namespace':namespace,
    'ws_deployment':ws_deployment,
    'rest_deployment':rest_deployment,
    'wait_seconds':int(float(wait_seconds)),
    'ws_image':ws_image,
    'rest_collector':{
        'image_before':rest_image_before,
        'ready_before':rest_ready_before,
        'image_after':rest_image_after,
        'ready_after':rest_ready_after,
        'unchanged':rest_image_before==rest_image_after and rest_ready_before==rest_ready_after,
    },
    'start':start,
    'end':end,
    'deltas':{
        'websocket_message_count':ws_delta,
        'rest_success_count':rest_delta,
        'stale_feed_count':stale_delta,
        'reconnect_count':reconnect_delta,
    },
    'file_advanced':file_advanced,
    'reasons':reasons,
    'production_ready':False,
}
Path(output_path).write_text(json.dumps(manifest, indent=2, sort_keys=True)+'\n')
print(json.dumps({'gate_status':gate,'evidence_path':output_path,'deltas':manifest['deltas'],'reasons':reasons}, indent=2, sort_keys=True))
raise SystemExit(0 if gate == 'WS_RELIABILITY_OBSERVATION_PASS' else 1)
PY_WRITE

# Each sample is single-line JSON, so the evaluator can read them one line
# at a time. The evaluator's exit status becomes the script's exit status.
printf '%s\n%s\n' "$START_JSON" "$END_JSON" \
  | python3 "$WRITE_PY" \
    "$OUTPUT_PATH" "$NAMESPACE" "$WS_DEPLOYMENT" "$REST_DEPLOYMENT" "$WAIT_SECONDS" \
    "$WS_IMAGE" "$REST_IMAGE_BEFORE" "$REST_READY_BEFORE" "$REST_IMAGE_AFTER" "$REST_READY_AFTER"