orderbooks/scripts/k8s_ws_reliability_check.sh

190 lines
9.6 KiB
Bash
Executable file

#!/usr/bin/env bash
# Strict mode: abort on a failing command, on use of an unset variable, and
# when any stage of a pipeline fails.
set -o errexit -o nounset -o pipefail

# Cluster targets. Each default can be preset through the named environment
# variable; the option parser further down may overwrite them.
NAMESPACE=${ORDERBOOKS_K8S_NAMESPACE:-orderbooks}
WS_DEPLOYMENT=${ORDERBOOKS_WS_DEPLOYMENT:-orderbooks-ws-recorder}
REST_DEPLOYMENT=${ORDERBOOKS_REST_DEPLOYMENT:-orderbooks-collector}

# Length of the observation window, in seconds.
WAIT_SECONDS=${ORDERBOOKS_K8S_WS_RELIABILITY_WAIT_SECONDS:-1800}

# Local evidence file; an empty value means "derive a default later".
OUTPUT_PATH=

# Paths as seen from inside the websocket recorder pod.
RAW_DIR=/var/lib/orderbooks/raw_orderbooks
MANIFEST_PATH=/var/lib/orderbooks/manifests/polymarket_ws_recorder_latest.json
#######################################
# Print the command-line help text.
# Arguments: none
# Outputs:   the help text on stdout (callers redirect it when needed)
#######################################
usage() {
  local -a help_lines=(
    'Usage: scripts/k8s_ws_reliability_check.sh [options]'
    'Read-only bounded observation for the Kubernetes websocket recorder canary. It'
    'writes compact local JSON evidence and does not print secret contents.'
    'Options:'
    '--namespace NAME Namespace. Default: orderbooks.'
    '--deployment NAME Websocket recorder Deployment. Default: orderbooks-ws-recorder.'
    '--rest-deployment NAME REST collector Deployment to prove unchanged. Default: orderbooks-collector.'
    '--wait-seconds N Observation window. Default: 1800.'
    '--output PATH Local evidence JSON path.'
    '--raw-dir PATH In-pod raw root. Default: /var/lib/orderbooks/raw_orderbooks.'
    '--manifest-path PATH In-pod websocket manifest path.'
    '--help Show help.'
  )
  # One line per array element, each terminated by a newline.
  printf '%s\n' "${help_lines[@]}"
}
#######################################
# Ensure an option that takes a value is actually followed by one.
# Without this check a trailing option (e.g. `--output` as the last word)
# would die on the `$2` expansion with an "unbound variable" error.
# Arguments: the remaining command-line words, starting with the option name
# Outputs:   an error message on stderr when the value is missing
# Returns:   0 when a value is present; exits the shell with status 2 otherwise
#######################################
require_option_value() {
  if [[ $# -lt 2 ]]; then
    echo "missing value for $1" >&2
    exit 2
  fi
}

# Consume command-line options; each recognised option overwrites the
# corresponding default. Unknown words print the usage text and exit 2.
while [[ $# -gt 0 ]]; do
  case "$1" in
    --namespace)
      require_option_value "$@"
      NAMESPACE="$2"
      shift 2
      ;;
    --deployment)
      require_option_value "$@"
      WS_DEPLOYMENT="$2"
      shift 2
      ;;
    --rest-deployment)
      require_option_value "$@"
      REST_DEPLOYMENT="$2"
      shift 2
      ;;
    --wait-seconds)
      require_option_value "$@"
      WAIT_SECONDS="$2"
      shift 2
      ;;
    --output)
      require_option_value "$@"
      OUTPUT_PATH="$2"
      shift 2
      ;;
    --raw-dir)
      require_option_value "$@"
      RAW_DIR="$2"
      shift 2
      ;;
    --manifest-path)
      require_option_value "$@"
      MANIFEST_PATH="$2"
      shift 2
      ;;
    --help)
      usage
      exit 0
      ;;
    *)
      echo "unknown argument: $1" >&2
      usage >&2
      exit 2
      ;;
  esac
done
# Preflight: fail fast with exit status 2 before anything touches the cluster.
for required_tool in kubectl python3; do
  command -v "$required_tool" >/dev/null 2>&1 || { echo "${required_tool} is required" >&2; exit 2; }
done

# WAIT_SECONDS is passed to `sleep` and, only after the whole observation
# window, to Python's float(). A value such as "30m" is accepted by GNU sleep
# but rejected by float(), which would discard the finished observation, so
# anything that is not a plain non-negative decimal number is refused here.
wait_seconds_pattern='^[0-9]+([.][0-9]+)?$'
if [[ ! "$WAIT_SECONDS" =~ $wait_seconds_pattern ]]; then
  echo "invalid wait seconds (expected a non-negative number): ${WAIT_SECONDS}" >&2
  exit 2
fi
# Compact UTC timestamp identifying this observation run.
RUN_ID="$(date -u +%Y%m%dT%H%M%SZ)"

# Fall back to a run-specific evidence file when no output path was supplied
# (the := form assigns when the variable is unset or empty).
: "${OUTPUT_PATH:=data/manifests/ws_reliability_observation_${RUN_ID}.json}"
output_parent="$(dirname "$OUTPUT_PATH")"
mkdir -p "$output_parent"

# Scratch directory for the generated Python helpers; removed on every exit.
TMPDIR="$(mktemp -d)"
remove_scratch_dir() {
  rm -rf "$TMPDIR"
}
trap remove_scratch_dir EXIT
#######################################
# Print the name of one Running pod selected by a Deployment's matchLabels.
# Globals:   NAMESPACE (read)
# Arguments: $1 - Deployment name
# Outputs:   the first Running pod name on stdout; nothing when the pod
#            listing is empty
# Returns:   1 when no label selector could be derived, otherwise the status
#            of the pod-listing pipeline
#######################################
pod_for_deployment() {
  local deployment="$1"
  local selector
  # Turns the Deployment JSON on stdin into "k1=v1,k2=v2", sorted by key.
  local -r labels_to_selector='
import json
import sys

match_labels = json.load(sys.stdin)["spec"]["selector"]["matchLabels"]
pairs = [f"{key}={value}" for key, value in sorted(match_labels.items())]
print(",".join(pairs))
'
  local -r running_pod_names='{.items[?(@.status.phase=="Running")].metadata.name}'

  selector="$(
    kubectl -n "$NAMESPACE" get deployment "$deployment" -o json \
      | python3 -c "$labels_to_selector"
  )"
  if [[ -z "$selector" ]]; then
    return 1
  fi

  # awk keeps only the first whitespace-separated name of the listing.
  kubectl -n "$NAMESPACE" get pod -l "$selector" -o jsonpath="$running_pod_names" \
    | awk '{print $1}'
}
# Print one jsonpath expression evaluated against a Deployment.
# Arguments: $1 - Deployment name, $2 - jsonpath template
deployment_jsonpath() {
  kubectl -n "$NAMESPACE" get deployment "$1" -o "jsonpath=$2"
}

# Both Deployments must report a completed rollout before the baseline is
# taken; the rollout output itself is discarded.
kubectl -n "$NAMESPACE" rollout status "deployment/${REST_DEPLOYMENT}" --timeout=120s >/dev/null
kubectl -n "$NAMESPACE" rollout status "deployment/${WS_DEPLOYMENT}" --timeout=180s >/dev/null

# REST collector state before the window (first container's image and
# ready/desired replica counts), compared again after the window.
REST_IMAGE_BEFORE="$(deployment_jsonpath "$REST_DEPLOYMENT" '{.spec.template.spec.containers[0].image}')"
REST_READY_BEFORE="$(deployment_jsonpath "$REST_DEPLOYMENT" '{.status.readyReplicas}/{.spec.replicas}')"
WS_IMAGE="$(deployment_jsonpath "$WS_DEPLOYMENT" '{.spec.template.spec.containers[0].image}')"

# The websocket pod that will be sampled.
WS_POD="$(pod_for_deployment "$WS_DEPLOYMENT")"
if [[ -z "$WS_POD" ]]; then
  echo "missing running websocket pod" >&2
  exit 1
fi
# Generate the sampler script. It is executed with python3 inside the
# websocket pod and prints one JSON line describing the recorder manifest and
# the newest raw files under RAW_DIR.
SUMMARY_PY="$TMPDIR/reliability-summary.py"
cat >"$SUMMARY_PY" <<'PY_SUMMARY'
import json
import os
import time
from pathlib import Path

raw = Path(os.environ['RAW_DIR'])
manifest_path = Path(os.environ['MANIFEST_PATH'])
o = json.loads(manifest_path.read_text())


def utc_stamp(epoch_seconds=None):
    """Format an epoch timestamp (default: now) as YYYY-MM-DDTHH:MM:SSZ."""
    return time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime(epoch_seconds))


def file_summary(path):
    """Describe a file by path, existence, size and mtime; None for no path."""
    if not path:
        return None
    p = Path(path)
    try:
        # A single stat() call: the file may vanish between an exists()
        # check and a later stat(), so treat "gone" as "does not exist".
        st = p.stat()
    except (FileNotFoundError, NotADirectoryError):
        return {'path': str(p), 'exists': False}
    return {'path': str(p), 'exists': True, 'bytes': st.st_size, 'mtime_utc': utc_stamp(st.st_mtime)}


def latest_file(pattern):
    """Return the most recently modified match under RAW_DIR, or None.

    Files that disappear while the directory is being scanned are skipped
    instead of aborting the whole sample. On equal mtimes the later match in
    glob order wins, which is what sorting by mtime and taking the last
    element would select.
    """
    newest = None
    newest_mtime = None
    for candidate in raw.glob(pattern):
        try:
            mtime = candidate.stat().st_mtime
        except FileNotFoundError:
            continue
        if newest_mtime is None or mtime >= newest_mtime:
            newest, newest_mtime = candidate, mtime
    return newest


open_files = o.get('open_files') or []
# First open-file entry whose path mentions each stream, if any.
ws_open = next((x for x in open_files if 'ws_raw' in str(x.get('path'))), None)
rest_open = next((x for x in open_files if 'rest_checkpoints' in str(x.get('path'))), None)

print(json.dumps({
    'sampled_at_utc': utc_stamp(),
    'manifest_path': str(manifest_path),
    'manifest_mtime_utc': utc_stamp(manifest_path.stat().st_mtime),
    'run_id': o.get('run_id'),
    'status': o.get('status'),
    'gate_status': o.get('gate_status'),
    'updated_at_utc': o.get('updated_at_utc'),
    'seconds_since_last_ws_message': o.get('seconds_since_last_ws_message'),
    'last_successful_ws_message_at_utc': o.get('last_successful_ws_message_at_utc'),
    'current_subscription_token_ids': o.get('current_subscription_token_ids'),
    'current_tracked_market_end_times': o.get('current_tracked_market_end_times'),
    'current_session': o.get('current_session'),
    'recent_sessions': (o.get('recent_sessions') or [])[-5:],
    'counters': o.get('counters') or {},
    'ws_open_file': ws_open,
    'rest_open_file': rest_open,
    'latest_closed_ws': file_summary(latest_file('polymarket/ws_raw/**/*.jsonl.gz')),
    'latest_closed_rest': file_summary(latest_file('polymarket/rest_checkpoints/**/*.jsonl.gz')),
}, sort_keys=True))
PY_SUMMARY
#######################################
# Run the generated summary script with python3 inside the websocket pod.
# Globals:   NAMESPACE, WS_POD, RAW_DIR, MANIFEST_PATH, SUMMARY_PY (all read)
# Arguments: none
# Outputs:   whatever the in-pod command prints
# Returns:   the exit status of `kubectl exec`
#######################################
sample_pod() {
  # The script text travels as the argument of `python3 -c`; RAW_DIR and
  # MANIFEST_PATH reach it through `env` inside the pod.
  kubectl -n "$NAMESPACE" exec "$WS_POD" -- \
    env "RAW_DIR=${RAW_DIR}" "MANIFEST_PATH=${MANIFEST_PATH}" \
    python3 -c "$(<"$SUMMARY_PY")"
}
# Baseline sample, then wait out the observation window.
START_JSON="$(sample_pod)"
sleep "$WAIT_SECONDS"

# The websocket Deployment must still report a completed rollout afterwards.
kubectl -n "$NAMESPACE" rollout status "deployment/${WS_DEPLOYMENT}" --timeout=180s >/dev/null

# Re-resolve the pod: it may have been replaced during the window. An empty
# result is rejected here, mirroring the check made before the baseline, so
# the closing sample never runs `kubectl exec` against an empty pod name.
WS_POD_AFTER="$(pod_for_deployment "$WS_DEPLOYMENT")"
if [[ -z "$WS_POD_AFTER" ]]; then
  echo "missing running websocket pod after observation window" >&2
  exit 1
fi
if [[ "$WS_POD_AFTER" != "$WS_POD" ]]; then
  # Diagnostics go to stderr; stdout is reserved for the final JSON result.
  echo "websocket pod changed during observation: ${WS_POD} -> ${WS_POD_AFTER}" >&2
  WS_POD="$WS_POD_AFTER"
fi

# Closing sample plus the REST collector state to compare with the baseline.
END_JSON="$(sample_pod)"
REST_IMAGE_AFTER="$(kubectl -n "$NAMESPACE" get deployment "$REST_DEPLOYMENT" -o jsonpath='{.spec.template.spec.containers[0].image}')"
REST_READY_AFTER="$(kubectl -n "$NAMESPACE" get deployment "$REST_DEPLOYMENT" -o jsonpath='{.status.readyReplicas}/{.spec.replicas}')"
# Generate the local evidence writer. It reads the start and end samples (one
# JSON document per line) from stdin, takes ten positional arguments, writes
# the evidence manifest to the output path, prints a short JSON result, and
# exits 0 when the gate passes or 1 when any blocking reason was recorded.
WRITE_PY="$TMPDIR/write.py"
cat >"$WRITE_PY" <<'PY_WRITE'
import datetime as dt
import json
import sys
from pathlib import Path

(output_path, namespace, ws_deployment, rest_deployment, wait_seconds,
 ws_image, rest_image_before, rest_ready_before, rest_image_after,
 rest_ready_after) = sys.argv[1:11]
start = json.loads(sys.stdin.readline())
end = json.loads(sys.stdin.readline())


def c(obj, key):
    """Look up one counter in a sample; None when it is absent."""
    return (obj.get('counters') or {}).get(key)


def num(value):
    """Treat a missing (None) value as zero."""
    return 0 if value is None else value


# Counter growth across the observation window.
ws_delta = num(c(end, 'websocket_message_count')) - num(c(start, 'websocket_message_count'))
rest_delta = num(c(end, 'rest_success_count')) - num(c(start, 'rest_success_count'))
stale_delta = num(c(end, 'stale_feed_count')) - num(c(start, 'stale_feed_count'))
reconnect_delta = num(c(end, 'reconnect_count')) - num(c(start, 'reconnect_count'))
parse_errors = num(c(end, 'websocket_parse_error_count'))
# Stale-feed threshold from the end sample's counters; 90 when absent.
threshold = float(((end.get('counters') or {}).get('stale_feed_threshold_seconds') or 90))
seconds_since = end.get('seconds_since_last_ws_message')
active_tokens = len(end.get('current_subscription_token_ids') or [])

start_ws_open = start.get('ws_open_file') or {}
end_ws_open = end.get('ws_open_file') or {}
file_advanced = False
if start_ws_open and end_ws_open:
    # Advanced means: a different file, a larger file, or a changed mtime.
    file_advanced = (
        end_ws_open.get('path') != start_ws_open.get('path')
        or num(end_ws_open.get('bytes')) > num(start_ws_open.get('bytes'))
        or end_ws_open.get('mtime_utc') != start_ws_open.get('mtime_utc')
    )

# Every entry appended here blocks the gate.
reasons = []
if active_tokens and ws_delta <= 0:
    reasons.append('websocket_message_count did not increase while active tokens existed')
if active_tokens and not file_advanced:
    reasons.append('websocket open file did not advance while active tokens existed')
if rest_delta <= 0:
    reasons.append('REST checkpoint success count did not increase')
if parse_errors != 0:
    reasons.append('websocket_parse_error_count is nonzero')
if active_tokens and seconds_since is not None and seconds_since > max(threshold * 2, 180):
    reasons.append('seconds_since_last_ws_message exceeded reliability threshold')
if reconnect_delta > 3 or stale_delta > 3:
    reasons.append('reconnect/stale counters grew rapidly during observation')
if rest_image_before != rest_image_after or rest_ready_before != rest_ready_after:
    reasons.append('REST collector image/readiness changed')

gate = 'WS_RELIABILITY_OBSERVATION_PASS' if not reasons else 'BLOCKED_WS_STALE_LOOP'
manifest = {
    'schema_name': 'ws_reliability_observation',
    'schema_version': 1,
    # dt.timezone.utc rather than dt.UTC: the alias only exists on Python
    # 3.11+, and a failure here would discard a finished observation.
    'written_at_utc': dt.datetime.now(dt.timezone.utc).replace(microsecond=0).isoformat().replace('+00:00', 'Z'),
    'gate_status': gate,
    'namespace': namespace,
    'ws_deployment': ws_deployment,
    'rest_deployment': rest_deployment,
    'wait_seconds': int(float(wait_seconds)),
    'ws_image': ws_image,
    'rest_collector': {
        'image_before': rest_image_before,
        'ready_before': rest_ready_before,
        'image_after': rest_image_after,
        'ready_after': rest_ready_after,
        'unchanged': rest_image_before == rest_image_after and rest_ready_before == rest_ready_after,
    },
    'start': start,
    'end': end,
    'deltas': {
        'websocket_message_count': ws_delta,
        'rest_success_count': rest_delta,
        'stale_feed_count': stale_delta,
        'reconnect_count': reconnect_delta,
    },
    'file_advanced': file_advanced,
    'reasons': reasons,
    'production_ready': False,
}
Path(output_path).write_text(json.dumps(manifest, indent=2, sort_keys=True) + '\n')
print(json.dumps({'gate_status': gate, 'evidence_path': output_path, 'deltas': manifest['deltas'], 'reasons': reasons}, indent=2, sort_keys=True))
raise SystemExit(0 if gate == 'WS_RELIABILITY_OBSERVATION_PASS' else 1)
PY_WRITE
# Positional arguments for the evidence writer, in the order it unpacks them.
writer_args=(
  "$OUTPUT_PATH"
  "$NAMESPACE"
  "$WS_DEPLOYMENT"
  "$REST_DEPLOYMENT"
  "$WAIT_SECONDS"
  "$WS_IMAGE"
  "$REST_IMAGE_BEFORE"
  "$REST_READY_BEFORE"
  "$REST_IMAGE_AFTER"
  "$REST_READY_AFTER"
)
# Feed the start and end samples, one per line, to the writer. As the final
# command, this pipeline's status is the script's exit status.
printf '%s\n' "$START_JSON" "$END_JSON" | python3 "$WRITE_PY" "${writer_args[@]}"