#!/usr/bin/env bash set -euo pipefail NAMESPACE="${ORDERBOOKS_K8S_NAMESPACE:-orderbooks}" WS_DEPLOYMENT="${ORDERBOOKS_WS_DEPLOYMENT:-orderbooks-ws-recorder}" REST_DEPLOYMENT="${ORDERBOOKS_REST_DEPLOYMENT:-orderbooks-collector}" UPLOADER_CRONJOB="${ORDERBOOKS_UPLOADER_CRONJOB:-orderbooks-uploader}" WAIT_SECONDS="${ORDERBOOKS_K8S_WS_SMOKE_WAIT_SECONDS:-900}" OUTPUT_PATH="" RAW_DIR="/var/lib/orderbooks/raw_orderbooks" MANIFEST_DIR="/var/lib/orderbooks/manifests" UPLOAD_MIN_AGE_SECONDS="600" SMOKE_UPLOAD_RETENTION_DAYS="3" usage() { cat <<'EOF' Usage: scripts/k8s_ws_runtime_smoke_check.sh [options] Verifies the Kubernetes websocket recorder canary and writes compact local JSON evidence. The script does not print secret contents. Options: --namespace NAME Namespace. Default: orderbooks. --deployment NAME Websocket recorder Deployment. Default: orderbooks-ws-recorder. --rest-deployment NAME Existing REST Deployment. Default: orderbooks-collector. --cronjob NAME Production uploader CronJob. Default: orderbooks-uploader. --wait-seconds N Max wait for runtime evidence. Default: 900. --output PATH Local smoke evidence path. --raw-dir PATH In-pod raw root. Default: /var/lib/orderbooks/raw_orderbooks. --manifest-dir PATH In-pod manifest dir. Default: /var/lib/orderbooks/manifests. --upload-min-age-seconds N Production upload min age to record. Default: 600. --smoke-upload-retention-days N Retention used by one-off smoke upload Job. Default: 3. --help Show help. EOF } while [[ $# -gt 0 ]]; do case "$1" in --namespace) NAMESPACE="$2"; shift 2 ;; --deployment) WS_DEPLOYMENT="$2"; shift 2 ;; --rest-deployment) REST_DEPLOYMENT="$2"; shift 2 ;; --cronjob) UPLOADER_CRONJOB="$2"; shift 2 ;; --wait-seconds) WAIT_SECONDS="$2"; shift 2 ;; --output) OUTPUT_PATH="$2"; shift 2 ;; --raw-dir) RAW_DIR="$2"; shift 2 ;; --manifest-dir) MANIFEST_DIR="$2"; shift 2 ;; --upload-min-age-seconds) UPLOAD_MIN_AGE_SECONDS="$2"; shift 2 ;; --smoke-upload-retention-days) SMOKE_UPLOAD_RETENTION_DAYS="$2"; shift 2 ;; --help) usage; exit 0 ;; *) echo "unknown argument: $1" >&2; usage >&2; exit 2 ;; esac done command -v kubectl >/dev/null 2>&1 || { echo "kubectl is required" >&2; exit 2; } RUN_ID="$(date -u +%Y%m%dT%H%M%SZ)" if [[ -z "$OUTPUT_PATH" ]]; then OUTPUT_PATH="data/manifests/k8s_ws_runtime_smoke_${RUN_ID}.json" fi mkdir -p "$(dirname "$OUTPUT_PATH")" TMPDIR="$(mktemp -d)" trap 'rm -rf "$TMPDIR"' EXIT write_blocked() { local gate="$1" local reason="$2" python3 - "$OUTPUT_PATH" "$gate" "$reason" <<'PY_BLOCKED' import datetime as dt import json import sys from pathlib import Path path=Path(sys.argv[1]) manifest={ "schema_name":"k8s_ws_runtime_smoke", "schema_version":1, "written_at_utc":dt.datetime.now(dt.UTC).replace(microsecond=0).isoformat().replace('+00:00','Z'), "gate_status":sys.argv[2], "reason":sys.argv[3], "production_ready":False, } path.write_text(json.dumps(manifest,indent=2,sort_keys=True)+'\n') PY_BLOCKED } pod_for_deployment() { local deployment="$1" local selector selector="$(kubectl -n "$NAMESPACE" get deployment "$deployment" -o jsonpath='{range $k,$v:=.spec.selector.matchLabels}{$k}{"="}{$v}{","}{end}' | sed 's/,$//')" kubectl -n "$NAMESPACE" get pod -l "$selector" -o jsonpath='{.items[?(@.status.phase=="Running")].metadata.name}' | awk '{print $1}' } REST_IMAGE_BEFORE="$(kubectl -n "$NAMESPACE" get deployment "$REST_DEPLOYMENT" -o jsonpath='{.spec.template.spec.containers[0].image}')" REST_READY_BEFORE="$(kubectl -n "$NAMESPACE" get deployment "$REST_DEPLOYMENT" -o jsonpath='{.status.readyReplicas}/{.spec.replicas}')" kubectl -n "$NAMESPACE" rollout status "deployment/${REST_DEPLOYMENT}" --timeout=120s >/dev/null kubectl -n "$NAMESPACE" rollout status "deployment/${WS_DEPLOYMENT}" --timeout=300s >/dev/null REST_POD="$(pod_for_deployment "$REST_DEPLOYMENT")" WS_POD="$(pod_for_deployment "$WS_DEPLOYMENT")" if [[ -z "$REST_POD" || -z "$WS_POD" ]]; then write_blocked "BLOCKED_K8S_RUNTIME_FAILURE" "missing running REST or websocket pod" exit 1 fi SUMMARY_PY="${TMPDIR}/summary.py" cat >"${SUMMARY_PY}" <<'PY_SUMMARY' import gzip, hashlib, json, os from pathlib import Path raw_root=Path(os.environ.get('RAW_DIR','/var/lib/orderbooks/raw_orderbooks')) manifest_dir=Path(os.environ.get('MANIFEST_DIR','/var/lib/orderbooks/manifests')) check_path=os.environ.get('CHECK_PATH') or '' def sha(path): h=hashlib.sha256() with path.open('rb') as f: for chunk in iter(lambda:f.read(1024*1024), b''): h.update(chunk) return h.hexdigest() def count_gz(path): rows=0 first=None with gzip.open(path,'rt',encoding='utf-8') as f: for line in f: if line.strip(): obj=json.loads(line) if first is None: first=obj rows+=1 return rows, first def summarize_gz(path): rows, first = count_gz(path) return {'path':str(path),'rows':rows,'bytes':path.stat().st_size,'sha256':sha(path),'first_schema':first.get('schema_name') if isinstance(first,dict) else None} def load_json(path): return json.loads(path.read_text()) ws_files=sorted(raw_root.glob('polymarket/ws_raw/**/*.jsonl.gz')) rest_files=sorted(raw_root.glob('polymarket/rest_checkpoints/**/*.jsonl.gz')) open_files=sorted([p for p in raw_root.glob('polymarket/**/*') if p.is_file() and (p.name.startswith('.') or p.name.endswith(('.open','.tmp','.partial')))]) recorder_manifests=sorted(manifest_dir.glob('polymarket_ws_recorder_*.json'), key=lambda p: p.stat().st_mtime) upload_manifests=sorted(manifest_dir.glob('upload_archive_*.json'), key=lambda p: p.stat().st_mtime) latest_manifest=load_json(recorder_manifests[-1]) if recorder_manifests else None latest_upload=load_json(upload_manifests[-1]) if upload_manifests else None result={ 'ws_file_count':len(ws_files), 'rest_file_count':len(rest_files), 'open_or_temp_files':[str(p) for p in open_files[:20]], 'open_or_temp_file_count':len(open_files), 'recorder_manifest_count':len(recorder_manifests), 'upload_manifest_count':len(upload_manifests), 'latest_manifest_path':str(recorder_manifests[-1]) if recorder_manifests else None, 'latest_upload_manifest_path':str(upload_manifests[-1]) if upload_manifests else None, 'latest_manifest':latest_manifest, 'latest_upload_manifest':latest_upload, } if ws_files: result['latest_ws']=summarize_gz(ws_files[-1]) if rest_files: result['latest_rest']=summarize_gz(rest_files[-1]) if check_path: p=Path(check_path) result['specific_file']={'path':check_path,'exists':p.exists()} if p.exists(): result['specific_file'].update(summarize_gz(p)) print(json.dumps(result, sort_keys=True)) PY_SUMMARY summarize_pod() { local pod="$1" local check_path="${2:-}" kubectl -n "$NAMESPACE" exec "$pod" -- env RAW_DIR="$RAW_DIR" MANIFEST_DIR="$MANIFEST_DIR" CHECK_PATH="$check_path" python3 -c "$(cat "$SUMMARY_PY")" } WAIT_PY="${TMPDIR}/wait_condition.py" cat >"${WAIT_PY}" <<'PY_WAIT' import json, sys mode=sys.argv[1] old_run=sys.argv[2] if len(sys.argv) > 2 else '' o=json.loads(sys.stdin.read()) manifest=o.get('latest_manifest') or {} counters=manifest.get('counters') or {} if mode == 'initial': if counters.get('websocket_message_count',0) > 0 and counters.get('rest_success_count',0) > 0: raise SystemExit(0) elif mode == 'post_restart': if manifest.get('run_id') and manifest.get('run_id') != old_run and counters.get('websocket_message_count',0) > 0: raise SystemExit(0) raise SystemExit(1) PY_WAIT initial_json="" end=$((SECONDS + WAIT_SECONDS)) while [[ $SECONDS -lt $end ]]; do initial_json="$(summarize_pod "$WS_POD")" if python3 "$WAIT_PY" initial "" <<<"$initial_json"; then break fi sleep 15 done if ! python3 "$WAIT_PY" initial "" <<<"$initial_json"; then write_blocked "BLOCKED_WS_RECORDER_RUNTIME" "websocket recorder did not expose initial websocket and REST counters before timeout" exit 1 fi old_run_id="$(python3 -c 'import json,sys; o=json.loads(sys.stdin.read()); print((o.get("latest_manifest") or {}).get("run_id") or "")' <<<"$initial_json")" old_ws_path="$(python3 -c 'import json,sys; o=json.loads(sys.stdin.read()); print((o.get("latest_ws") or {}).get("path") or "")' <<<"$initial_json")" old_ws_sha="$(python3 -c 'import json,sys; o=json.loads(sys.stdin.read()); print((o.get("latest_ws") or {}).get("sha256") or "")' <<<"$initial_json")" old_ws_rows="$(python3 -c 'import json,sys; o=json.loads(sys.stdin.read()); print((o.get("latest_ws") or {}).get("rows") or 0)' <<<"$initial_json")" kubectl -n "$NAMESPACE" delete pod "$WS_POD" --wait=true >/dev/null kubectl -n "$NAMESPACE" rollout status "deployment/${WS_DEPLOYMENT}" --timeout=300s >/dev/null NEW_WS_POD="$(pod_for_deployment "$WS_DEPLOYMENT")" if [[ -z "$NEW_WS_POD" ]]; then write_blocked "BLOCKED_WS_RECORDER_RUNTIME" "websocket pod did not return after restart" exit 1 fi restart_json="$(summarize_pod "$NEW_WS_POD" "$old_ws_path")" post_json="" end=$((SECONDS + WAIT_SECONDS)) while [[ $SECONDS -lt $end ]]; do post_json="$(summarize_pod "$NEW_WS_POD" "$old_ws_path")" if python3 "$WAIT_PY" post_restart "$old_run_id" <<<"$post_json"; then break fi sleep 15 done if ! python3 "$WAIT_PY" post_restart "$old_run_id" <<<"$post_json"; then write_blocked "BLOCKED_WS_RECORDER_RUNTIME" "new websocket pod did not write post-restart manifest evidence before timeout" exit 1 fi UPLOADER_IMAGE="$(kubectl -n "$NAMESPACE" get cronjob "$UPLOADER_CRONJOB" -o jsonpath='{.spec.jobTemplate.spec.template.spec.containers[0].image}')" JOB_NAME="orderbooks-ws-smoke-upload-${RUN_ID,,}" JOB_NAME="${JOB_NAME//_/-}" cat >"${TMPDIR}/upload-job.yaml" </dev/null kubectl -n "$NAMESPACE" wait --for=condition=Complete --timeout=900s "job/${JOB_NAME}" >/dev/null || true JOB_STATUS="$(kubectl -n "$NAMESPACE" get job "$JOB_NAME" -o jsonpath='{.status.conditions[-1:].type}' 2>/dev/null || true)" JOB_LOG_TAIL="$(kubectl -n "$NAMESPACE" logs "job/${JOB_NAME}" --tail=80 2>/dev/null || true)" upload_json="$(summarize_pod "$NEW_WS_POD" "$old_ws_path")" REST_IMAGE_AFTER="$(kubectl -n "$NAMESPACE" get deployment "$REST_DEPLOYMENT" -o jsonpath='{.spec.template.spec.containers[0].image}')" REST_READY_AFTER="$(kubectl -n "$NAMESPACE" get deployment "$REST_DEPLOYMENT" -o jsonpath='{.status.readyReplicas}/{.spec.replicas}')" WRITE_PY="${TMPDIR}/write_evidence.py" cat >"${WRITE_PY}" <<'PY_WRITE' import datetime as dt, json, sys from pathlib import Path (output_path, namespace, ws_deployment, rest_deployment, uploader_cronjob, rest_image_before, rest_ready_before, rest_image_after, rest_ready_after, ws_pod_before, ws_pod_after, old_ws_path, old_ws_sha, old_ws_rows, job_name, job_status, production_min_age, smoke_retention_days, uploader_image) = sys.argv[1:20] text=sys.stdin.read() parts=text.split('\n---PART---\n') initial=json.loads(parts[0]) restart=json.loads(parts[1]) post=json.loads(parts[2]) upload=json.loads(parts[3]) job_log_tail=parts[4] reasons=[] old_ws_rows=int(old_ws_rows or 0) old_file_preexisting=bool(old_ws_path) if old_file_preexisting: specific=(post.get('specific_file') or {}) if not specific.get('exists') or specific.get('sha256') != old_ws_sha or int(specific.get('rows') or 0) != old_ws_rows: reasons.append('pre-existing closed websocket file changed or failed parse after restart') else: latest_after=(restart.get('latest_ws') or post.get('latest_ws') or {}) if int(latest_after.get('rows') or 0) <= 0: reasons.append('SIGTERM did not produce a parseable closed websocket archive') post_manifest=post.get('latest_manifest') or {} initial_manifest=initial.get('latest_manifest') or {} post_counters=post_manifest.get('counters') or {} if not post_manifest.get('run_id') or post_manifest.get('run_id') == initial_manifest.get('run_id'): reasons.append('post-restart recorder manifest did not come from a new run') if int(post_counters.get('websocket_message_count') or 0) <= 0: reasons.append('post-restart recorder did not write websocket message evidence') if not upload.get('latest_ws') or int((upload.get('latest_ws') or {}).get('rows') or 0) <= 0: reasons.append('websocket gzip evidence missing or empty') if not upload.get('latest_rest') or int((upload.get('latest_rest') or {}).get('rows') or 0) <= 0: reasons.append('REST checkpoint gzip evidence missing or empty') if rest_image_before != rest_image_after: reasons.append('REST collector image changed') if rest_ready_after != rest_ready_before: reasons.append('REST collector readiness changed') if job_status != 'Complete': reasons.append('smoke uploader job did not complete') upload_manifest=upload.get('latest_upload_manifest') or {} verified_files=upload_manifest.get('verified_files') or [] skipped_files=upload_manifest.get('skipped_files') or [] deleted_files=upload_manifest.get('deleted_local_files') or [] retained_files=upload_manifest.get('retained_local_files') or [] verified_paths={item.get('relative_path') for item in verified_files} verified_ws_or_rest=[item for item in verified_files if 'polymarket/ws_raw/' in str(item.get('relative_path')) or 'polymarket/rest_checkpoints/' in str(item.get('relative_path'))] open_count=max(initial.get('open_or_temp_file_count') or 0, post.get('open_or_temp_file_count') or 0, upload.get('open_or_temp_file_count') or 0) skipped_open=[item for item in skipped_files if item.get('reason') == 'open_or_temporary_file'] unsafe_deletes=[item for item in deleted_files if item.get('relative_path') not in verified_paths] if upload_manifest.get('gate_status') != 'PASS' or upload_manifest.get('operation_status') != 'UPLOAD_VERIFIED': reasons.append('upload manifest did not prove verified upload') if int((upload_manifest.get('counts') or {}).get('verified') or 0) <= 0: reasons.append('upload manifest verified count was zero') if not verified_ws_or_rest: reasons.append('upload manifest did not verify websocket recorder raw/checkpoint files') if open_count > 0 and not skipped_open: reasons.append('open/temp files existed but were not recorded as skipped') if unsafe_deletes: reasons.append('cleanup deleted files not present in verified set') gate='WS_RECORDER_K8S_SMOKE_PASS' if not reasons else ('BLOCKED_WS_RECORDER_UPLOAD_OR_RETENTION' if any('upload' in r or 'cleanup' in r or 'open/temp' in r for r in reasons) else 'BLOCKED_WS_RECORDER_RUNTIME') manifest={ 'schema_name':'k8s_ws_runtime_smoke', 'schema_version':1, 'written_at_utc':dt.datetime.now(dt.UTC).replace(microsecond=0).isoformat().replace('+00:00','Z'), 'gate_status':gate, 'namespace':namespace, 'deployments':{'ws':ws_deployment,'rest':rest_deployment}, 'uploader_cronjob':uploader_cronjob, 'uploader_image':uploader_image, 'pods':{'ws_before':ws_pod_before,'ws_after':ws_pod_after}, 'rest_collector':{'image_before':rest_image_before,'ready_before':rest_ready_before,'image_after':rest_image_after,'ready_after':rest_ready_after,'unchanged':rest_image_before == rest_image_after and rest_ready_before == rest_ready_after}, 'restart_check':{'old_file_preexisting':old_file_preexisting,'old_ws_file':{'path':old_ws_path,'sha256':old_ws_sha,'rows':old_ws_rows},'restart_summary':restart.get('specific_file')}, 'initial':initial, 'post_restart':post, 'upload':upload, 'uploader_job':{'name':job_name,'status':job_status,'log_tail':job_log_tail[-4000:],'production_min_age_seconds':int(production_min_age),'smoke_min_age_seconds':0,'smoke_retention_days':int(smoke_retention_days)}, 'upload_manifest_summary':{'path':upload.get('latest_upload_manifest_path'),'gate_status':upload_manifest.get('gate_status'),'operation_status':upload_manifest.get('operation_status'),'counts':upload_manifest.get('counts'),'verified_ws_or_rest_count':len(verified_ws_or_rest),'skipped_open_or_temp_count':len(skipped_open),'deleted_count':len(deleted_files),'retained_count':len(retained_files),'unsafe_delete_count':len(unsafe_deletes)}, 'reasons':reasons, 'production_ready':False, } path=Path(output_path) path.write_text(json.dumps(manifest, indent=2, sort_keys=True)+'\n') print(json.dumps({'gate_status':gate,'evidence_path':str(path),'reasons':reasons}, indent=2, sort_keys=True)) raise SystemExit(0 if gate == 'WS_RECORDER_K8S_SMOKE_PASS' else 1) PY_WRITE printf '%s\n---PART---\n%s\n---PART---\n%s\n---PART---\n%s\n---PART---\n%s' "$initial_json" "$restart_json" "$post_json" "$upload_json" "$JOB_LOG_TAIL" | python3 "$WRITE_PY" "$OUTPUT_PATH" "$NAMESPACE" "$WS_DEPLOYMENT" "$REST_DEPLOYMENT" "$UPLOADER_CRONJOB" "$REST_IMAGE_BEFORE" "$REST_READY_BEFORE" "$REST_IMAGE_AFTER" "$REST_READY_AFTER" "$WS_POD" "$NEW_WS_POD" "$old_ws_path" "$old_ws_sha" "$old_ws_rows" "$JOB_NAME" "$JOB_STATUS" "$UPLOAD_MIN_AGE_SECONDS" "$SMOKE_UPLOAD_RETENTION_DAYS" "$UPLOADER_IMAGE"