#!/usr/bin/env bash
#
# Read-only bounded observation for the Kubernetes websocket recorder canary.
# Samples the recorder's manifest inside the pod at the start and end of an
# observation window and writes a local JSON evidence file.
#
# Environment overrides (each also has a command-line flag):
#   ORDERBOOKS_K8S_NAMESPACE                    namespace
#   ORDERBOOKS_WS_DEPLOYMENT                    websocket recorder Deployment
#   ORDERBOOKS_REST_DEPLOYMENT                  REST collector Deployment
#   ORDERBOOKS_K8S_WS_RELIABILITY_WAIT_SECONDS  observation window in seconds
set -euo pipefail

# Cluster objects to observe.
NAMESPACE="${ORDERBOOKS_K8S_NAMESPACE:-orderbooks}"
WS_DEPLOYMENT="${ORDERBOOKS_WS_DEPLOYMENT:-orderbooks-ws-recorder}"
REST_DEPLOYMENT="${ORDERBOOKS_REST_DEPLOYMENT:-orderbooks-collector}"

# Length of the observation window, in seconds.
WAIT_SECONDS="${ORDERBOOKS_K8S_WS_RELIABILITY_WAIT_SECONDS:-1800}"

# Local evidence path; empty means "derive a timestamped default later".
OUTPUT_PATH=""

# Paths as seen from inside the websocket recorder pod.
RAW_DIR="/var/lib/orderbooks/raw_orderbooks"
MANIFEST_PATH="/var/lib/orderbooks/manifests/polymarket_ws_recorder_latest.json"

#######################################
# Print the command-line help text.
# Arguments: none
# Outputs:   help text on stdout (callers redirect to stderr on errors)
#######################################
usage() {
  # Quoted delimiter: the help text is emitted literally, with no expansion.
  cat <<'EOF'
Usage: scripts/k8s_ws_reliability_check.sh [options]

Read-only bounded observation for the Kubernetes websocket recorder canary. It
writes compact local JSON evidence and does not print secret contents.

Options:
  --namespace NAME        Namespace. Default: orderbooks.
  --deployment NAME       Websocket recorder Deployment. Default: orderbooks-ws-recorder.
  --rest-deployment NAME  REST collector Deployment to prove unchanged. Default: orderbooks-collector.
  --wait-seconds N        Observation window. Default: 1800.
  --output PATH           Local evidence JSON path.
  --raw-dir PATH          In-pod raw root. Default: /var/lib/orderbooks/raw_orderbooks.
  --manifest-path PATH    In-pod websocket manifest path.
  --help                  Show help.
EOF
}

#######################################
# Parse command-line options into the configuration globals.
# Globals:   NAMESPACE, WS_DEPLOYMENT, REST_DEPLOYMENT, WAIT_SECONDS,
#            OUTPUT_PATH, RAW_DIR, MANIFEST_PATH (written)
# Arguments: the script's command-line arguments
# Outputs:   diagnostics on stderr; help text on stdout for --help
# Returns:   0 on success; exits 0 after --help; exits 2 on a usage error
#######################################
parse_args() {
  local wait_re='^[0-9]+([.][0-9]+)?$'
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --help)
        usage
        exit 0
        ;;
      --namespace | --deployment | --rest-deployment | --wait-seconds | \
        --output | --raw-dir | --manifest-path)
        # Without this guard a trailing flag dies on "$2" under `set -u` with
        # an "unbound variable" message that does not name the flag.
        if [[ $# -lt 2 ]]; then
          echo "missing value for $1" >&2
          exit 2
        fi
        case "$1" in
          --namespace) NAMESPACE="$2" ;;
          --deployment) WS_DEPLOYMENT="$2" ;;
          --rest-deployment) REST_DEPLOYMENT="$2" ;;
          --wait-seconds) WAIT_SECONDS="$2" ;;
          --output) OUTPUT_PATH="$2" ;;
          --raw-dir) RAW_DIR="$2" ;;
          --manifest-path) MANIFEST_PATH="$2" ;;
        esac
        shift 2
        ;;
      *)
        echo "unknown argument: $1" >&2
        usage >&2
        exit 2
        ;;
    esac
  done

  # Reject a bad window before the wait starts: the value is handed to `sleep`
  # and later to Python's float(), and a value only `sleep` accepts (such as
  # "30m") would otherwise fail after the whole window has elapsed. Checked
  # after the loop so a value from the environment is covered too; the
  # fallback applies only if the variable was never initialised.
  if [[ ! "${WAIT_SECONDS-1800}" =~ $wait_re ]]; then
    echo "invalid wait seconds (expected a non-negative number): ${WAIT_SECONDS-}" >&2
    exit 2
  fi
}

parse_args "$@"

# Fail fast with a usage-style exit code when a required local tool is absent.
command -v kubectl >/dev/null 2>&1 || { echo "kubectl is required" >&2; exit 2; }
command -v python3 >/dev/null 2>&1 || { echo "python3 is required" >&2; exit 2; }

# UTC timestamp used to name the default evidence file.
RUN_ID="$(date -u +%Y%m%dT%H%M%SZ)"
if [[ -z "$OUTPUT_PATH" ]]; then
  OUTPUT_PATH="data/manifests/ws_reliability_observation_${RUN_ID}.json"
fi
# `--` keeps an output path that begins with "-" from being read as an option.
mkdir -p -- "$(dirname -- "$OUTPUT_PATH")"

# Scratch directory for the generated helper scripts; removed on every exit
# path. NOTE(review): this reuses the conventional TMPDIR variable name, so
# child processes see the scratch directory as their temp root if TMPDIR was
# already exported — confirm that is intended.
TMPDIR="$(mktemp -d)"
# ${TMPDIR:?} aborts the cleanup rather than running `rm -rf` on an empty path.
trap 'rm -rf -- "${TMPDIR:?}"' EXIT

#######################################
# Print the name of one Running pod selected by a Deployment.
# Globals:   NAMESPACE (read)
# Arguments: $1 - Deployment name
# Outputs:   first Running pod name on stdout (nothing if none is Running);
#            diagnostics on stderr
# Returns:   non-zero when the label selector cannot be determined or the pod
#            query fails
#######################################
pod_for_deployment() {
  local deployment="$1"
  local selector

  # Render spec.selector.matchLabels as "k1=v1,k2=v2" for `kubectl -l`.
  # Only matchLabels is honoured; matchExpressions is not translated.
  selector="$(
    kubectl -n "$NAMESPACE" get deployment "$deployment" -o json \
      | python3 -c '
import json, sys
spec = json.load(sys.stdin).get("spec") or {}
labels = (spec.get("selector") or {}).get("matchLabels") or {}
print(",".join(f"{k}={v}" for k, v in sorted(labels.items())))
'
  )" || return 1

  if [[ -z "$selector" ]]; then
    echo "no matchLabels selector found for deployment ${deployment}" >&2
    return 1
  fi

  # The jsonpath prints Running pod names separated by spaces; keep the first.
  kubectl -n "$NAMESPACE" get pod -l "$selector" \
    -o jsonpath='{.items[?(@.status.phase=="Running")].metadata.name}' \
    | awk '{print $1}'
}

# Wait for both rollouts to settle so the baseline is taken from a steady state.
kubectl -n "$NAMESPACE" rollout status "deployment/${REST_DEPLOYMENT}" --timeout=120s >/dev/null
kubectl -n "$NAMESPACE" rollout status "deployment/${WS_DEPLOYMENT}" --timeout=180s >/dev/null

# Baseline of the REST collector; compared with the end-of-window values.
REST_IMAGE_BEFORE="$(kubectl -n "$NAMESPACE" get deployment "$REST_DEPLOYMENT" \
  -o jsonpath='{.spec.template.spec.containers[0].image}')"
REST_READY_BEFORE="$(kubectl -n "$NAMESPACE" get deployment "$REST_DEPLOYMENT" \
  -o jsonpath='{.status.readyReplicas}/{.spec.replicas}')"
WS_IMAGE="$(kubectl -n "$NAMESPACE" get deployment "$WS_DEPLOYMENT" \
  -o jsonpath='{.spec.template.spec.containers[0].image}')"

# A failing lookup must not abort at the assignment (errexit would exit here
# silently); fall through so the diagnostic below is actually printed.
WS_POD="$(pod_for_deployment "$WS_DEPLOYMENT")" || WS_POD=""
[[ -n "$WS_POD" ]] || { echo "missing running websocket pod" >&2; exit 1; }

# Python helper executed inside the websocket pod. It reads the recorder
# manifest named by MANIFEST_PATH and the raw tree under RAW_DIR, then prints a
# single-line JSON summary on stdout. The quoted heredoc delimiter keeps the
# shell from expanding anything inside the script.
SUMMARY_PY="$TMPDIR/reliability-summary.py"
cat >"$SUMMARY_PY" <<'PY_SUMMARY'
import json, os, time
from pathlib import Path

TS_FORMAT = '%Y-%m-%dT%H:%M:%SZ'

raw = Path(os.environ['RAW_DIR'])
manifest_path = Path(os.environ['MANIFEST_PATH'])
o = json.loads(manifest_path.read_text())


def utc_stamp(epoch=None):
    """Format an epoch timestamp (default: now) as a UTC string."""
    return time.strftime(TS_FORMAT, time.gmtime(epoch))


def file_summary(path):
    """Describe a file as a dict; None for a falsy path."""
    if not path:
        return None
    p = Path(path)
    try:
        st = p.stat()
    except (FileNotFoundError, NotADirectoryError):
        # Covers both "never existed" and "removed just before the stat".
        return {'path': str(p), 'exists': False}
    return {'path': str(p), 'exists': True, 'bytes': st.st_size, 'mtime_utc': utc_stamp(st.st_mtime)}


def newest(pattern):
    """Return the most recently modified match under the raw root, or None."""
    best_path, best_mtime = None, None
    for candidate in raw.glob(pattern):
        try:
            mtime = candidate.stat().st_mtime
        except OSError:
            # The file disappeared between the directory listing and the stat.
            continue
        # ">=" keeps the last match among equal mtimes.
        if best_mtime is None or mtime >= best_mtime:
            best_path, best_mtime = candidate, mtime
    return best_path


# Open-file entries are passed through exactly as the manifest reports them.
open_files = o.get('open_files') or []
ws_open = next((x for x in open_files if 'ws_raw' in str(x.get('path'))), None)
rest_open = next((x for x in open_files if 'rest_checkpoints' in str(x.get('path'))), None)

print(json.dumps({
    'sampled_at_utc': utc_stamp(),
    'manifest_path': str(manifest_path),
    'manifest_mtime_utc': utc_stamp(manifest_path.stat().st_mtime),
    'run_id': o.get('run_id'),
    'status': o.get('status'),
    'gate_status': o.get('gate_status'),
    'updated_at_utc': o.get('updated_at_utc'),
    'seconds_since_last_ws_message': o.get('seconds_since_last_ws_message'),
    'last_successful_ws_message_at_utc': o.get('last_successful_ws_message_at_utc'),
    'current_subscription_token_ids': o.get('current_subscription_token_ids'),
    'current_tracked_market_end_times': o.get('current_tracked_market_end_times'),
    'current_session': o.get('current_session'),
    # Only the five most recent sessions are kept to bound the evidence size.
    'recent_sessions': (o.get('recent_sessions') or [])[-5:],
    'counters': o.get('counters') or {},
    'ws_open_file': ws_open,
    'rest_open_file': rest_open,
    'latest_closed_ws': file_summary(newest('polymarket/ws_raw/**/*.jsonl.gz')),
    'latest_closed_rest': file_summary(newest('polymarket/rest_checkpoints/**/*.jsonl.gz')),
}, sort_keys=True))
PY_SUMMARY

#######################################
# Run the summary helper inside the websocket pod.
# Globals:   NAMESPACE, WS_POD, RAW_DIR, MANIFEST_PATH, SUMMARY_PY (read)
# Arguments: none
# Outputs:   whatever the helper prints on stdout
# Returns:   exit status of `kubectl exec`
#######################################
sample_pod() {
  # The helper source travels as the `python3 -c` argument, so nothing is
  # copied into the pod; $(<file) reads it without spawning `cat`.
  kubectl -n "$NAMESPACE" exec "$WS_POD" -- \
    env RAW_DIR="$RAW_DIR" MANIFEST_PATH="$MANIFEST_PATH" \
    python3 -c "$(<"$SUMMARY_PY")"
}

# Sample, wait out the observation window, then sample again.
START_JSON="$(sample_pod)"
sleep "$WAIT_SECONDS"
kubectl -n "$NAMESPACE" rollout status "deployment/${WS_DEPLOYMENT}" --timeout=180s >/dev/null

# Re-resolve the pod: it may have been replaced while waiting. An empty result
# is rejected here instead of surfacing as an opaque `kubectl exec` failure.
WS_POD_AFTER="$(pod_for_deployment "$WS_DEPLOYMENT")" || WS_POD_AFTER=""
[[ -n "$WS_POD_AFTER" ]] || {
  echo "missing running websocket pod after observation window" >&2
  exit 1
}
if [[ "$WS_POD_AFTER" != "$WS_POD" ]]; then
  # The two samples then come from different pods, which affects how the
  # deltas should be read, so say so instead of switching silently.
  echo "websocket pod changed during observation: ${WS_POD} -> ${WS_POD_AFTER}" >&2
  WS_POD="$WS_POD_AFTER"
fi
END_JSON="$(sample_pod)"

# End-of-window state of the REST collector, compared with the baseline.
REST_IMAGE_AFTER="$(kubectl -n "$NAMESPACE" get deployment "$REST_DEPLOYMENT" \
  -o jsonpath='{.spec.template.spec.containers[0].image}')"
REST_READY_AFTER="$(kubectl -n "$NAMESPACE" get deployment "$REST_DEPLOYMENT" \
  -o jsonpath='{.status.readyReplicas}/{.spec.replicas}')"

# Local Python helper: reads the start and end samples (one JSON document per
# line on stdin), evaluates the reliability gate, writes the evidence file and
# prints a short result. It exits 0 only when the gate passes.
WRITE_PY="$TMPDIR/write.py"
cat >"$WRITE_PY" <<'PY_WRITE'
import datetime as dt, json, sys
from pathlib import Path

(output_path, namespace, ws_deployment, rest_deployment, wait_seconds, ws_image,
 rest_image_before, rest_ready_before, rest_image_after, rest_ready_after) = sys.argv[1:11]
start = json.loads(sys.stdin.readline())
end = json.loads(sys.stdin.readline())


def c(obj, key):
    """Look up one counter in a sample; None when absent."""
    return (obj.get('counters') or {}).get(key)


def num(value):
    """Treat a missing value as zero."""
    return 0 if value is None else value


def delta(key):
    """Growth of a counter between the start and end samples."""
    return num(c(end, key)) - num(c(start, key))


ws_delta = delta('websocket_message_count')
rest_delta = delta('rest_success_count')
stale_delta = delta('stale_feed_count')
reconnect_delta = delta('reconnect_count')
# Parse errors are judged on the end sample's absolute value, not on growth.
parse_errors = num(c(end, 'websocket_parse_error_count'))
threshold = float(((end.get('counters') or {}).get('stale_feed_threshold_seconds') or 90))
seconds_since = end.get('seconds_since_last_ws_message')
active_tokens = len(end.get('current_subscription_token_ids') or [])

start_ws_open = start.get('ws_open_file') or {}
end_ws_open = end.get('ws_open_file') or {}
file_advanced = False
if start_ws_open and end_ws_open:
    # Any of: a different file, a larger file, or a changed mtime.
    file_advanced = (
        end_ws_open.get('path') != start_ws_open.get('path')
        or num(end_ws_open.get('bytes')) > num(start_ws_open.get('bytes'))
        or end_ws_open.get('mtime_utc') != start_ws_open.get('mtime_utc')
    )

rest_unchanged = rest_image_before == rest_image_after and rest_ready_before == rest_ready_after

reasons = []
if active_tokens and ws_delta <= 0:
    reasons.append('websocket_message_count did not increase while active tokens existed')
if active_tokens and not file_advanced:
    reasons.append('websocket open file did not advance while active tokens existed')
if rest_delta <= 0:
    reasons.append('REST checkpoint success count did not increase')
if parse_errors != 0:
    reasons.append('websocket_parse_error_count is nonzero')
if active_tokens and seconds_since is not None and seconds_since > max(threshold * 2, 180):
    reasons.append('seconds_since_last_ws_message exceeded reliability threshold')
if reconnect_delta > 3 or stale_delta > 3:
    reasons.append('reconnect/stale counters grew rapidly during observation')
if not rest_unchanged:
    reasons.append('REST collector image/readiness changed')

gate = 'WS_RELIABILITY_OBSERVATION_PASS' if not reasons else 'BLOCKED_WS_STALE_LOOP'

# dt.timezone.utc rather than dt.UTC: the alias only exists on Python 3.11+,
# and failing here would discard an observation that has already completed.
written_at = dt.datetime.now(dt.timezone.utc).replace(microsecond=0).isoformat().replace('+00:00', 'Z')

manifest = {
    'schema_name': 'ws_reliability_observation',
    'schema_version': 1,
    'written_at_utc': written_at,
    'gate_status': gate,
    'namespace': namespace,
    'ws_deployment': ws_deployment,
    'rest_deployment': rest_deployment,
    'wait_seconds': int(float(wait_seconds)),
    'ws_image': ws_image,
    'rest_collector': {
        'image_before': rest_image_before,
        'ready_before': rest_ready_before,
        'image_after': rest_image_after,
        'ready_after': rest_ready_after,
        'unchanged': rest_unchanged,
    },
    'start': start,
    'end': end,
    'deltas': {
        'websocket_message_count': ws_delta,
        'rest_success_count': rest_delta,
        'stale_feed_count': stale_delta,
        'reconnect_count': reconnect_delta,
    },
    'file_advanced': file_advanced,
    'reasons': reasons,
    'production_ready': False,
}
Path(output_path).write_text(json.dumps(manifest, indent=2, sort_keys=True) + '\n')
print(json.dumps({'gate_status': gate, 'evidence_path': output_path, 'deltas': manifest['deltas'], 'reasons': reasons}, indent=2, sort_keys=True))
raise SystemExit(0 if gate == 'WS_RELIABILITY_OBSERVATION_PASS' else 1)
PY_WRITE

# Each sample is a single-line JSON document, so one line per sample on stdin.
printf '%s\n%s\n' "$START_JSON" "$END_JSON" \
  | python3 "$WRITE_PY" \
    "$OUTPUT_PATH" "$NAMESPACE" "$WS_DEPLOYMENT" "$REST_DEPLOYMENT" "$WAIT_SECONDS" \
    "$WS_IMAGE" "$REST_IMAGE_BEFORE" "$REST_READY_BEFORE" "$REST_IMAGE_AFTER" "$REST_READY_AFTER"
|