From 0d86f56514223c1832601da6f19392d1f1ab1b5a Mon Sep 17 00:00:00 2001 From: philipp Date: Sun, 19 Apr 2026 19:17:56 +0200 Subject: [PATCH] Add websocket recorder canary deployment --- .forgejo/workflows/deploy.yml | 8 +- config/polymarket_ws_collector.example.yaml | 38 + config/polymarket_ws_sample.example.yaml | 20 + deploy/k8s/base/configmap.yaml | 30 + deploy/k8s/base/cronjob-uploader.yaml | 3 +- deploy/k8s/base/deployment-ws-recorder.yaml | 78 ++ deploy/k8s/base/kustomization.yaml | 1 + docs/BOOK_RECONSTRUCTION.md | 30 + docs/KUBERNETES_DEPLOYMENT.md | 114 +- docs/POLYMARKET_WEBSOCKET_RECORDER.md | 121 ++ docs/POLYMARKET_WEBSOCKET_SCHEMA.md | 49 + scripts/analyze_polymarket_ws_divergences.py | 523 ++++++++ scripts/collect_polymarket_ws_orderbooks.py | 1157 ++++++++++++++++++ scripts/deploy/bootstrap_orderbooks_k8s.sh | 4 +- scripts/deploy/deploy_ws_canary_kaniko.sh | 218 ++++ scripts/k8s_ws_runtime_smoke_check.sh | 421 +++++++ scripts/reconstruct_polymarket_ws_books.py | 685 +++++++++++ scripts/record_polymarket_ws_sample.py | 912 ++++++++++++++ scripts/run_polymarket_ws_recorder_loop.sh | 20 + scripts/upload_archive_rclone.sh | 11 + 20 files changed, 4428 insertions(+), 15 deletions(-) create mode 100644 config/polymarket_ws_collector.example.yaml create mode 100644 config/polymarket_ws_sample.example.yaml create mode 100644 deploy/k8s/base/deployment-ws-recorder.yaml create mode 100644 docs/BOOK_RECONSTRUCTION.md create mode 100644 docs/POLYMARKET_WEBSOCKET_RECORDER.md create mode 100644 docs/POLYMARKET_WEBSOCKET_SCHEMA.md create mode 100755 scripts/analyze_polymarket_ws_divergences.py create mode 100755 scripts/collect_polymarket_ws_orderbooks.py create mode 100755 scripts/deploy/deploy_ws_canary_kaniko.sh create mode 100755 scripts/k8s_ws_runtime_smoke_check.sh create mode 100755 scripts/reconstruct_polymarket_ws_books.py create mode 100755 scripts/record_polymarket_ws_sample.py create mode 100755 scripts/run_polymarket_ws_recorder_loop.sh diff --git 
a/.forgejo/workflows/deploy.yml b/.forgejo/workflows/deploy.yml index 67006ca..16137c3 100644 --- a/.forgejo/workflows/deploy.yml +++ b/.forgejo/workflows/deploy.yml @@ -1,9 +1,9 @@ name: deploy +# Pushes are intentionally non-deploying for the websocket canary work. +# Use workflow_dispatch for the broad/full deploy path, or +# scripts/deploy/deploy_ws_canary_kaniko.sh for the canary-only path. on: - push: - branches: - - main workflow_dispatch: jobs: @@ -14,7 +14,7 @@ jobs: REGISTRY_HOST: ${{ vars.REGISTRY_HOST }} PROJECT_NAME: ${{ vars.PROJECT_NAME || 'orderbooks' }} PROJECT_NAMESPACE: ${{ vars.PROJECT_NAMESPACE || 'orderbooks' }} - PROJECT_DEPLOYMENTS: ${{ vars.PROJECT_DEPLOYMENTS || 'orderbooks-collector' }} + PROJECT_DEPLOYMENTS: ${{ vars.PROJECT_DEPLOYMENTS || 'orderbooks-collector,orderbooks-ws-recorder' }} PROJECT_REGISTRY_SECRET_NAME: ${{ vars.PROJECT_REGISTRY_SECRET_NAME || 'orderbooks-registry-creds' }} REPO_CLONE_URL: ${{ github.server_url }}/${{ github.repository }}.git steps: diff --git a/config/polymarket_ws_collector.example.yaml b/config/polymarket_ws_collector.example.yaml new file mode 100644 index 0000000..cb000e9 --- /dev/null +++ b/config/polymarket_ws_collector.example.yaml @@ -0,0 +1,38 @@ +# Example config for the long-running Polymarket BTC websocket recorder. +# Public market data only. No API keys, private keys, wallets, or trading. + +# Discovery may be refreshed by running the existing public discovery script. +discovery_path: /var/lib/orderbooks/discovery/polymarket_btc_markets_latest.json +discovery_dir: /var/lib/orderbooks/discovery +discovery_script_path: scripts/discover_polymarket_btc_markets.py +discovery_execute: true +discovery_refresh_interval_seconds: 600 +discovery_max_pages: 3 +discovery_page_limit: 100 + +# Runtime output. Raw websocket and REST checkpoint archives intentionally live +# under raw_orderbooks so the existing uploader can offload them. 
+raw_output_root: /var/lib/orderbooks/raw_orderbooks +manifest_dir: /var/lib/orderbooks/manifests +manifest_path: /var/lib/orderbooks/manifests/polymarket_ws_recorder_latest.json + +websocket_url: wss://ws-subscriptions-clob.polymarket.com/ws/market +clob_books_url: https://clob.polymarket.com/books + +# 0 means all active BTC Up/Down markets. Use a positive value only for bounded +# local smoke tests or emergency load reduction. +market_limit: 0 +market_end_safety_seconds: 420 + +rest_checkpoint_interval_seconds: 60 +rest_batch_size: 50 +top_n: 10 +stale_feed_threshold_seconds: 30 +request_timeout_seconds: 15 +websocket_timeout_seconds: 10 +reconnect_backoff_seconds: 3 +max_reconnect_backoff_seconds: 60 +manifest_write_interval_seconds: 300 + +# Continuous by default. Set via CLI or env ORDERBOOKS_WS_DURATION_SECONDS for smoke tests. +duration_seconds: null diff --git a/config/polymarket_ws_sample.example.yaml b/config/polymarket_ws_sample.example.yaml new file mode 100644 index 0000000..0745d25 --- /dev/null +++ b/config/polymarket_ws_sample.example.yaml @@ -0,0 +1,20 @@ +# Example config for a bounded Polymarket BTC websocket sample. +# Flat YAML only; no secrets are required. + +discovery_path: data/discovery/polymarket_btc_markets_latest.json +output_root: data/ws_sample +manifest_path: data/manifests/checkpoint_010b_ws_raw_sample.json +report_path: reports/checkpoints/checkpoint_010b_ws_raw_sample.md + +# Keep the default sample conservative; CLI can raise this to all discovered markets. 
+market_limit: 2 +duration_seconds: 150 +rest_checkpoint_interval_seconds: 30 +request_timeout_seconds: 15 +websocket_timeout_seconds: 15 +max_reconnects: 2 +reconnect_backoff_seconds: 3 +market_end_safety_seconds: 420 + +websocket_url: wss://ws-subscriptions-clob.polymarket.com/ws/market +clob_books_url: https://clob.polymarket.com/books diff --git a/deploy/k8s/base/configmap.yaml b/deploy/k8s/base/configmap.yaml index 8b33cf3..1244dd2 100644 --- a/deploy/k8s/base/configmap.yaml +++ b/deploy/k8s/base/configmap.yaml @@ -23,3 +23,33 @@ data: request_timeout_seconds: 15 max_retries: 2 backoff_seconds: 2 + polymarket_ws_collector.yaml: | + discovery_path: /var/lib/orderbooks/discovery/polymarket_btc_markets_latest.json + discovery_dir: /var/lib/orderbooks/discovery + discovery_script_path: scripts/discover_polymarket_btc_markets.py + discovery_execute: true + discovery_refresh_interval_seconds: 600 + discovery_max_pages: 3 + discovery_page_limit: 100 + + raw_output_root: /var/lib/orderbooks/raw_orderbooks + manifest_dir: /var/lib/orderbooks/manifests + manifest_path: /var/lib/orderbooks/manifests/polymarket_ws_recorder_latest.json + + websocket_url: wss://ws-subscriptions-clob.polymarket.com/ws/market + clob_books_url: https://clob.polymarket.com/books + + # Canary safety cap. The recorder script and example config default to 0/all. + # Remove this cap only after PVC sizing and upload cleanup are reviewed. 
+ market_limit: 2 + market_end_safety_seconds: 420 + rest_checkpoint_interval_seconds: 60 + rest_batch_size: 50 + top_n: 10 + stale_feed_threshold_seconds: 30 + request_timeout_seconds: 15 + websocket_timeout_seconds: 10 + reconnect_backoff_seconds: 3 + max_reconnect_backoff_seconds: 60 + manifest_write_interval_seconds: 60 + duration_seconds: null diff --git a/deploy/k8s/base/cronjob-uploader.yaml b/deploy/k8s/base/cronjob-uploader.yaml index 2a11c4a..d2fb6e6 100644 --- a/deploy/k8s/base/cronjob-uploader.yaml +++ b/deploy/k8s/base/cronjob-uploader.yaml @@ -40,6 +40,7 @@ spec: - /bin/bash - /app/scripts/upload_archive_rclone.sh - --execute + - --cleanup-after-verify env: - name: ORDERBOOKS_DATA_DIR value: /var/lib/orderbooks @@ -54,7 +55,7 @@ spec: - name: ORDERBOOKS_UPLOAD_MIN_AGE_SECONDS value: "600" - name: ORDERBOOKS_UPLOAD_RETENTION_DAYS - value: "7" + value: "3" - name: ORDERBOOKS_RCLONE_BIN value: /usr/bin/rclone - name: ORDERBOOKS_RCLONE_DEST diff --git a/deploy/k8s/base/deployment-ws-recorder.yaml b/deploy/k8s/base/deployment-ws-recorder.yaml new file mode 100644 index 0000000..94dcf3e --- /dev/null +++ b/deploy/k8s/base/deployment-ws-recorder.yaml @@ -0,0 +1,78 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: orderbooks-ws-recorder + namespace: orderbooks + labels: + app.kubernetes.io/name: orderbooks + app.kubernetes.io/part-of: orderbooks + app.kubernetes.io/component: ws-recorder +spec: + replicas: 1 + strategy: + type: Recreate + selector: + matchLabels: + app.kubernetes.io/name: orderbooks + app.kubernetes.io/component: ws-recorder + template: + metadata: + labels: + app.kubernetes.io/name: orderbooks + app.kubernetes.io/part-of: orderbooks + app.kubernetes.io/component: ws-recorder + spec: + terminationGracePeriodSeconds: 180 + imagePullSecrets: + - name: orderbooks-registry-creds + securityContext: + runAsNonRoot: true + runAsUser: 10001 + runAsGroup: 10001 + fsGroup: 10001 + fsGroupChangePolicy: OnRootMismatch + containers: + - name: 
ws-recorder + image: registry.doran.133011.xyz/orderbooks:bootstrap + imagePullPolicy: IfNotPresent + command: + - /bin/bash + - /app/scripts/run_polymarket_ws_recorder_loop.sh + env: + - name: ORDERBOOKS_APP_DIR + value: /app + - name: ORDERBOOKS_PYTHON + value: python3 + - name: ORDERBOOKS_DATA_DIR + value: /var/lib/orderbooks + - name: ORDERBOOKS_WS_COLLECTOR_CONFIG + value: /etc/orderbooks/polymarket_ws_collector.yaml + volumeMounts: + - name: orderbooks-data + mountPath: /var/lib/orderbooks + - name: collector-config + mountPath: /etc/orderbooks/polymarket_ws_collector.yaml + subPath: polymarket_ws_collector.yaml + readOnly: true + resources: + requests: + cpu: 100m + memory: 256Mi + limits: + cpu: "1" + memory: 1Gi + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + volumes: + - name: orderbooks-data + persistentVolumeClaim: + claimName: orderbooks-data + - name: collector-config + configMap: + name: orderbooks-collector-config + items: + - key: polymarket_ws_collector.yaml + path: polymarket_ws_collector.yaml diff --git a/deploy/k8s/base/kustomization.yaml b/deploy/k8s/base/kustomization.yaml index 010b89a..f7d6b7e 100644 --- a/deploy/k8s/base/kustomization.yaml +++ b/deploy/k8s/base/kustomization.yaml @@ -6,4 +6,5 @@ resources: - configmap.yaml - pvc.yaml - deployment-collector.yaml + - deployment-ws-recorder.yaml - cronjob-uploader.yaml diff --git a/docs/BOOK_RECONSTRUCTION.md b/docs/BOOK_RECONSTRUCTION.md new file mode 100644 index 0000000..d3205c4 --- /dev/null +++ b/docs/BOOK_RECONSTRUCTION.md @@ -0,0 +1,30 @@ +# Book Reconstruction Method + +Checkpoint 10C reconstructs order-book state from raw Polymarket market websocket messages captured in Checkpoint 10B. + +## Source Of Truth + +Raw websocket and REST checkpoint gzip JSONL files are immutable source evidence. Reconstruction outputs are derived and reference the input file paths, line numbers, websocket message sequence spans, and REST checkpoint sequences. 
+ +## Applied Events + +- `book` and `book_without_event_type` messages initialize or replace the full per-token bid/ask maps. +- `price_change` messages are applied after initialization. Observed `side=BUY` updates bids and `side=SELL` updates asks. +- Observed `size=0` is treated as level removal. Non-zero size replaces the level size at that price. +- `best_bid_ask`, `last_trade_price`, and unrelated `new_market` messages are preserved and counted but do not mutate the book map. + +## Comparison + +For each REST checkpoint, the reconstructor compares REST `/books` payloads with local websocket state after applying all websocket messages received at or before the REST checkpoint receive time. The comparison includes best bid, best ask, spread, bid/ask level counts, and top 10 levels by default. + +## Limits + +The sample is short and network timing can produce REST-vs-websocket divergences. Divergence rows include raw websocket and REST references so follow-up can inspect whether differences are timing, feed semantics, or reconstruction defects. + +## Checkpoint 10C Divergence Result + +The accepted 10C sample produced 20 REST comparison rows: 8 exact top-10 matches and 12 divergent rows. In every divergent row, best bid, best ask, spread, level counts, and top-N price membership matched. The observed divergences were size-only deltas within shared top-N price levels. + +Size-only divergence still matters. It can change depth, fillability assumptions, queue-size estimates, and any later answer about whether a hypothetical trade was observable and reproducible from the archived feed. + +This result is useful evidence for the websocket path, but it is not production readiness. The sample is bounded, the timing relationship between REST checkpoints and websocket delivery is imperfect, and long-running reconnect, stale-feed, rotation, upload, and alert behavior still need their own checkpoint before deployment. 
diff --git a/docs/KUBERNETES_DEPLOYMENT.md b/docs/KUBERNETES_DEPLOYMENT.md index 2a0a697..1af44f9 100644 --- a/docs/KUBERNETES_DEPLOYMENT.md +++ b/docs/KUBERNETES_DEPLOYMENT.md @@ -33,15 +33,25 @@ manifests: /var/lib/orderbooks/manifests discovery: /var/lib/orderbooks/discovery ``` -The collector uses one Deployment with one replica. The container runs -`/app/scripts/run_polymarket_collector_loop.sh`, which repeatedly executes the -existing bounded collector cycle and records loop failure/interruption manifests -instead of relying on Kubernetes crash loops for normal operation. +The REST snapshot collector uses one Deployment with one replica. The container +runs `/app/scripts/run_polymarket_collector_loop.sh`, which repeatedly executes +the existing bounded collector cycle and records loop failure/interruption +manifests instead of relying on Kubernetes crash loops for normal operation. + +The websocket recorder canary uses a separate Deployment named +`orderbooks-ws-recorder`. It runs `/app/scripts/run_polymarket_ws_recorder_loop.sh` +and does not replace or stop `orderbooks-collector`. It writes raw websocket +archives under `/var/lib/orderbooks/raw_orderbooks/polymarket/ws_raw/`, REST +checkpoint archives under `/var/lib/orderbooks/raw_orderbooks/polymarket/rest_checkpoints/`, +and runtime manifests under `/var/lib/orderbooks/manifests/`. The uploader uses one CronJob. It runs the existing rclone uploader in execute mode, mounts the same PVC, mounts `orderbooks-rclone-config` read-only at -`/etc/rclone/rclone.conf`, sets `RCLONE_CONFIG` to that file, and uploads only -closed/aged files. +`/etc/rclone/rclone.conf`, sets `RCLONE_CONFIG` to that file, uploads only +closed/aged files, skips `.open`/temporary writer files, and uses +`--cleanup-after-verify`. Local cleanup is allowed only after rclone copy and +check succeed. The Kubernetes retention setting is 3 days because websocket raw +capture is materially larger than REST snapshots and the current PVC is 10Gi. 
## Bootstrap This App Repo @@ -73,7 +83,7 @@ runner pattern: 3. create an in-cluster Kaniko Job; 4. build and push `REGISTRY_HOST/orderbooks:`; 5. apply `deploy/k8s/base` with the built image; -6. wait for `deployment/orderbooks-collector` rollout. +6. wait for `deployment/orderbooks-collector` and `deployment/orderbooks-ws-recorder` rollout. Required Forgejo repo secret: @@ -92,20 +102,88 @@ Project defaults used by the workflow: ```text PROJECT_NAME=orderbooks PROJECT_NAMESPACE=orderbooks -PROJECT_DEPLOYMENTS=orderbooks-collector +PROJECT_DEPLOYMENTS=orderbooks-collector,orderbooks-ws-recorder PROJECT_REGISTRY_SECRET_NAME=orderbooks-registry-creds ``` The registry pull/build secret `orderbooks-registry-creds` must exist in the `orderbooks` namespace before the workflow builds and deploys. +Pushes to `main` are intentionally non-deploying during the websocket canary +work. `workflow_dispatch` remains the broad release path and may roll both +Deployments listed in `PROJECT_DEPLOYMENTS`. Do not use that broad workflow for +websocket-only canary evidence. + +## Websocket Canary-Only Deploy Path + +Checkpoint 10D1 uses `scripts/deploy/deploy_ws_canary_kaniko.sh` for the +websocket canary. The helper builds an image from the committed Forgejo `main` +SHA with an in-cluster Kaniko Job, then applies only: + +```text +namespace.yaml +configmap.yaml +pvc.yaml +cronjob-uploader.yaml +deployment-ws-recorder.yaml +``` + +It does not apply `deployment-collector.yaml`, does not set the +`orderbooks-collector` image, and waits only for +`deployment/orderbooks-ws-recorder`. 
Validate the scoped apply set first: + +```sh +KUBECONFIG=../nuri/unrip3/.state/hetzner/kubeconfig.yaml \ + scripts/deploy/deploy_ws_canary_kaniko.sh --server-dry-run +``` + +After a clean source-only commit has been pushed to Forgejo `main`, deploy the +canary with: + +```sh +KUBECONFIG=../nuri/unrip3/.state/hetzner/kubeconfig.yaml \ + scripts/deploy/deploy_ws_canary_kaniko.sh --git-ref "$(git rev-parse HEAD)" +``` + +The helper writes compact deploy evidence under +`data/manifests/ws_canary_deploy_.json`. + + +## Websocket Recorder Canary + +Checkpoint 10D adds the websocket recorder as a canary, not as a replacement for +the REST snapshot collector. The canary subscribes to public Polymarket market +websocket messages for active BTC Up/Down token IDs, preserves every websocket +text payload exactly in `raw_text`, and keeps periodic REST `/books` checkpoints +for recovery and divergence evidence. + +The script and example config default to `market_limit: 0`, which means all +discovered active BTC Up/Down markets. The Kubernetes canary config currently +sets `market_limit: 2` and `manifest_write_interval_seconds: 60` as explicit +smoke/safety settings. The 10D local bounded run +wrote about 3.35 MB of compressed websocket data in two minutes for two markets; +running all active BTC markets on the current 10Gi PVC needs a separate sizing +or retention decision before removing the cap. Do not use a cap silently in +production evidence. + +Raw/current file safety: + +- completed archives end in `.jsonl.gz`; +- the recorder writes current gzip files with a hidden `.open` name and renames + them only after close; +- the uploader skips `.open`, `.tmp`, and `.partial` files; +- verified cleanup deletes local files only after rclone verification succeeds. 
+ ## Pre-Deploy Validation From this repository: ```sh bash -n scripts/run_polymarket_collector_loop.sh +bash -n scripts/run_polymarket_ws_recorder_loop.sh bash -n scripts/k8s_runtime_smoke_check.sh +bash -n scripts/k8s_ws_runtime_smoke_check.sh +python -m py_compile scripts/collect_polymarket_ws_orderbooks.py kubectl kustomize deploy/k8s/base KUBECONFIG=../nuri/unrip3/.state/hetzner/kubeconfig.yaml kubectl apply -k deploy/k8s/base --dry-run=server KUBECONFIG=../nuri/unrip3/.state/hetzner/kubeconfig.yaml kubectl -n orderbooks get secret orderbooks-rclone-config -o go-template='{{if index .data "rclone.conf"}}rclone_secret_key_present{{else}}rclone_secret_key_missing{{end}}{{"\n"}}' @@ -146,3 +224,23 @@ manifests, raw files, upload manifests, and pod logs for review. - No dashboard, database, strategy, backtest, or second-market connector. - No websocket rewrite. - No rclone config contents in this repository. + +## Websocket Canary Smoke Gate + +After the canary image is deployed and has run long enough to close at least one +websocket and REST checkpoint archive, run: + +```sh +KUBECONFIG=../nuri/unrip3/.state/hetzner/kubeconfig.yaml scripts/k8s_ws_runtime_smoke_check.sh --namespace orderbooks --deployment orderbooks-ws-recorder --rest-deployment orderbooks-collector --cronjob orderbooks-uploader --wait-seconds 900 --upload-min-age-seconds 600 +``` + +The smoke gate verifies the websocket pod is running, raw websocket gzip JSONL +parses, REST checkpoint gzip JSONL parses, manifests expose reconnect/stale and +divergence counters, pod deletion/restart does not corrupt the prior closed raw +file or produces a SIGTERM-closed archive when no prior closed file exists, a +later pod writes new data, and the existing REST collector remains healthy. 
For +upload evidence it creates a one-off uploader Job from the deployed image and +same PVC/secret with `ORDERBOOKS_UPLOAD_MIN_AGE_SECONDS=0`, then verifies the +upload manifest has `UPLOAD_VERIFIED`, `gate_status: PASS`, and at least one +verified websocket recorder raw or REST checkpoint file. Production CronJob +upload min age remains 600 seconds. diff --git a/docs/POLYMARKET_WEBSOCKET_RECORDER.md b/docs/POLYMARKET_WEBSOCKET_RECORDER.md new file mode 100644 index 0000000..64ff4fe --- /dev/null +++ b/docs/POLYMARKET_WEBSOCKET_RECORDER.md @@ -0,0 +1,121 @@ +# Polymarket Websocket Sample Recorder + +This document describes the bounded Checkpoint 10B sample path. It is separate from the live Kubernetes REST collector and does not replace it. + +## Scope + +The recorder captures public Polymarket market websocket messages for active BTC up/down outcome tokens and writes REST `/books` checkpoints during the same run. It does not trade, sign requests, use private keys, require API keys, or handle private account data. + +## Discovery + +Run the existing discovery first so token IDs are current: + +```bash +python scripts/discover_polymarket_btc_markets.py +``` + +The recorder reads `data/discovery/polymarket_btc_markets_latest.json`, selects active BTC up/down markets, and preserves `market_slug`, `condition_id`, `token_id`, `outcome`, and `end_time_utc` in every raw websocket envelope. 
+ +## Sample Run + +Default bounded run: + +```bash +python scripts/record_polymarket_ws_sample.py --config config/polymarket_ws_sample.example.yaml +``` + +Useful overrides: + +```bash +python scripts/record_polymarket_ws_sample.py --market-limit 2 --duration-seconds 150 --rest-checkpoint-interval-seconds 30 +``` + +The default endpoint is: + +```text +wss://ws-subscriptions-clob.polymarket.com/ws/market +``` + +The subscription body is: + +```json +{"assets_ids":[""],"type":"market","custom_feature_enabled":true} +``` + +For multiple tokens, `assets_ids` contains all selected Up/Down token IDs. + +## Raw Websocket Output + +Websocket text messages are written as gzip JSONL under: + +```text +data/ws_sample/polymarket/ws_raw//polymarket_ws_raw_.jsonl.gz +``` + +Each row preserves the raw text payload in `raw_text`, plus parsed JSON in `json` when parsing succeeds. Unknown message shapes are retained and counted in the manifest. + +Important envelope fields include: + +- `received_at_utc` +- `session_id` +- `connection_sequence` +- `message_sequence` +- `global_message_sequence` +- `websocket.url` +- `subscription.assets_ids` +- `tokens_tracked` +- `opcode` +- `payload_length_bytes` +- `payload_sha256` +- `raw_text` +- `json` +- `json_error` +- `classified_event_types` + +## REST Checkpoints + +REST checkpoints are written as gzip JSONL under: + +```text +data/ws_sample/polymarket/rest_checkpoints//polymarket_rest_checkpoints_.jsonl.gz +``` + +Each row records one POST to: + +```text +https://clob.polymarket.com/books +``` + +The request body contains the same token IDs as the websocket subscription. The response JSON is preserved in `response.raw_response_json`, with safe response headers only. Secret-bearing headers are not recorded. 
+ +## Manifest And Gate + +The checkpoint manifest is: + +```text +data/manifests/checkpoint_010b_ws_raw_sample.json +``` + +The report is: + +```text +reports/checkpoints/checkpoint_010b_ws_raw_sample.md +``` + +`WS_RAW_SAMPLE_PASS` requires at least one selected BTC market with both outcome tokens, at least one parseable websocket text message, at least two successful REST checkpoints, parseable gzip JSONL outputs, and checksum summaries. + +If the websocket connects but no market messages arrive, the recorder must gate as `WS_RAW_SAMPLE_NEEDS_REVIEW` rather than pretending the websocket path is proven. + +## Checkpoint 10D Runtime Direction + +The long-running runtime recorder is `scripts/collect_polymarket_ws_orderbooks.py`. +It is separate from the bounded 10B sample script. The runtime recorder is +intended to run as `orderbooks-ws-recorder` beside the existing REST collector. +It preserves raw websocket messages under `raw_orderbooks/polymarket/ws_raw/`, +keeps REST `/books` checkpoints under `raw_orderbooks/polymarket/rest_checkpoints/`, +rotates closed gzip archives hourly, writes manifests under `/var/lib/orderbooks/manifests`, +and records reconnect, stale-feed, REST failure, parser, and divergence counters. + +Current gzip files use hidden `.open` names until closed. The uploader skips +open/temporary files and deletes local archives only when `--cleanup-after-verify` +is used after rclone verification succeeds. diff --git a/docs/POLYMARKET_WEBSOCKET_SCHEMA.md b/docs/POLYMARKET_WEBSOCKET_SCHEMA.md new file mode 100644 index 0000000..0196816 --- /dev/null +++ b/docs/POLYMARKET_WEBSOCKET_SCHEMA.md @@ -0,0 +1,49 @@ +# Polymarket Websocket Schema Observed In Checkpoint 10B + +This document summarizes observed public market websocket message shapes from the bounded 10B BTC sample. It does not include full raw payload dumps; raw payloads remain in the gzip JSONL sample files. 
+ +## Observed Event Types + +### best_bid_ask + +Count: `338` + +Observed top-level fields: `asset_id, best_ask, best_bid, event_type, market, spread, timestamp` + +Best quote summary; counted but not applied to level maps. + +### book + +Count: `314` + +Observed top-level fields: `asks, asset_id, bids, event_type, hash, last_trade_price, market, tick_size, timestamp` + +Nested level/change fields: `price, size` + +Full per-token book snapshot used to initialize or replace local state. + +### last_trade_price + +Count: `155` + +Observed top-level fields: `asset_id, event_type, fee_rate_bps, market, price, side, size, timestamp, transaction_hash` + +Trade print summary; counted but not applied to level maps. + +### new_market + +Count: `1` + +Observed top-level fields: `active, assets_ids, clob_token_ids, condition_id, description, event_message, event_type, fee_schedule, fees_enabled, game_start_time, group_item_title, id, line, market, order_price_min_tick_size, outcomes, question, slug, sports_market_type, tags, taker_base_fee, timestamp` + +Market metadata broadcast; preserved and counted but unrelated to selected BTC token state in this sample. + +### price_change + +Count: `7771` + +Observed top-level fields: `event_type, market, price_changes, timestamp` + +Nested level/change fields: `asset_id, best_ask, best_bid, hash, price, side, size` + +Incremental price/size updates applied after a token has an initialized book. diff --git a/scripts/analyze_polymarket_ws_divergences.py b/scripts/analyze_polymarket_ws_divergences.py new file mode 100755 index 0000000..a3f28bb --- /dev/null +++ b/scripts/analyze_polymarket_ws_divergences.py @@ -0,0 +1,523 @@ +#!/usr/bin/env python3 +"""Analyze Checkpoint 10C REST-vs-websocket divergence rows. + +This is an offline evidence tool for Checkpoint 10D0. It reads existing raw +websocket, REST checkpoint, and comparison artifacts. It does not contact +Kubernetes or Polymarket and does not modify raw inputs. 
+""" + +from __future__ import annotations + +import argparse +import datetime as dt +import gzip +import hashlib +import json +from bisect import bisect_right +from collections import Counter +from pathlib import Path +from typing import Any + + +ANALYZER_NAME = "polymarket_ws_divergence_analyzer" +ANALYZER_VERSION = "0.1.0" +DEFAULT_10B_MANIFEST = Path("data/manifests/checkpoint_010b_ws_raw_sample.json") +DEFAULT_10C_MANIFEST = Path("data/manifests/checkpoint_010c_book_reconstruction_sample.json") +DEFAULT_10BC_MANIFEST = Path("data/manifests/checkpoint_010bc_full_fidelity_sample_and_reconstruction.json") +DEFAULT_ORCHESTRATOR_REVIEW = Path("data/manifests/checkpoint_010bc_orchestrator_review.json") +DEFAULT_OUTPUT_MANIFEST = Path("data/manifests/checkpoint_010d0_ws_divergence_analysis.json") +DEFAULT_OUTPUT_REPORT = Path("reports/checkpoints/checkpoint_010d0_ws_divergence_analysis.md") + + +def utc_now() -> dt.datetime: + return dt.datetime.now(dt.UTC) + + +def iso_z(value: dt.datetime | None = None) -> str: + value = value or utc_now() + return value.astimezone(dt.UTC).replace(microsecond=0).isoformat().replace("+00:00", "Z") + + +def parse_iso(value: str | None) -> dt.datetime | None: + if not value: + return None + text = value[:-1] + "+00:00" if value.endswith("Z") else value + try: + parsed = dt.datetime.fromisoformat(text) + except ValueError: + return None + if parsed.tzinfo is None: + parsed = parsed.replace(tzinfo=dt.UTC) + return parsed.astimezone(dt.UTC) + + +def sha256_file(path: Path) -> str: + digest = hashlib.sha256() + with path.open("rb") as handle: + for chunk in iter(lambda: handle.read(1024 * 1024), b""): + digest.update(chunk) + return digest.hexdigest() + + +def read_json(path: Path) -> dict[str, Any]: + return json.loads(path.read_text(encoding="utf-8")) + + +def read_gzip_jsonl(path: Path) -> list[tuple[int, dict[str, Any]]]: + rows: list[tuple[int, dict[str, Any]]] = [] + with gzip.open(path, "rt", encoding="utf-8") as handle: + for 
def summarize_input(path: Path, kind: str) -> dict[str, Any]:
    """Describe an input artifact by POSIX path, kind label, byte size, and SHA-256."""
    summary: dict[str, Any] = {"path": path.as_posix(), "kind": kind}
    summary["bytes"] = path.stat().st_size
    summary["sha256"] = sha256_file(path)
    return summary


def raw_items(row: dict[str, Any]) -> list[dict[str, Any]]:
    """Return the dict payloads stored under ``row["json"]``.

    The payload may be a single object or a list of objects; anything that
    is not a dict (including a missing payload) is dropped.
    """
    payload = row.get("json")
    if isinstance(payload, list):
        return [entry for entry in payload if isinstance(entry, dict)]
    if isinstance(payload, dict):
        return [payload]
    return []


def classify_event(item: dict[str, Any]) -> str:
    """Name the event type of one websocket payload object.

    An explicit, truthy ``event_type`` wins. Otherwise an object carrying
    ``asset_id``, ``bids`` and ``asks`` is treated as a ``book`` snapshot,
    and everything else is ``unknown_object``.
    """
    declared = item.get("event_type")
    if declared:
        return str(declared)
    looks_like_book = all(key in item for key in ("asset_id", "bids", "asks"))
    return "book" if looks_like_book else "unknown_object"


def compact_token_events(row: dict[str, Any], token_id: str) -> list[dict[str, Any]]:
    """Summarize the events in one raw websocket row that concern *token_id*.

    ``price_change`` payloads are expanded into one entry per matching
    change. Payloads whose ``asset_id`` equals the token yield one compact
    entry each. ``new_market`` payloads are kept only when the token appears
    in ``assets_ids`` (or, failing that, ``clob_token_ids``).
    """
    # Event types whose summary is just a fixed set of copied fields.
    copied_fields = {
        "best_bid_ask": ("best_bid", "best_ask", "spread", "timestamp"),
        "last_trade_price": ("side", "price", "size", "timestamp"),
    }
    compacted: list[dict[str, Any]] = []
    for item in raw_items(row):
        kind = classify_event(item)

        if kind == "price_change":
            for change in item.get("price_changes") or []:
                if isinstance(change, dict) and str(change.get("asset_id")) == token_id:
                    price = change.get("price")
                    size = change.get("size")
                    compacted.append({
                        "event_type": "price_change",
                        "side": change.get("side"),
                        "price": None if price is None else str(price),
                        "size": None if size is None else str(size),
                        "best_bid": change.get("best_bid"),
                        "best_ask": change.get("best_ask"),
                        "hash": change.get("hash"),
                    })
            continue

        if str(item.get("asset_id")) == token_id:
            if kind == "book":
                summary = {
                    "event_type": "book",
                    "bid_level_count": len(item.get("bids") or []),
                    "ask_level_count": len(item.get("asks") or []),
                    "hash": item.get("hash"),
                    "timestamp": item.get("timestamp"),
                }
            else:
                summary = {"event_type": kind}
                for field in copied_fields.get(kind, ()):
                    summary[field] = item.get(field)
            compacted.append(summary)
            continue

        if kind == "new_market":
            listed = item.get("assets_ids") or item.get("clob_token_ids") or []
            if any(str(value) == token_id for value in listed):
                compacted.append({
                    "event_type": "new_market",
                    "market": item.get("market"),
                    "timestamp": item.get("timestamp"),
                })
    return compacted


def build_token_index(ws_rows: list[tuple[int, dict[str, Any]]], token_ids: set[str]) -> dict[str, list[dict[str, Any]]]:
    """Group compacted websocket events by token, preserving row order.

    Every requested token gets a (possibly empty) list. Each entry records
    the source line number, sequence, receive time (raw and as epoch
    seconds), the sorted distinct event types, and the compact events.
    """
    index: dict[str, list[dict[str, Any]]] = {}
    for token_id in token_ids:
        index[token_id] = []
    for line_number, row in ws_rows:
        for token_id, bucket in index.items():
            compacted = compact_token_events(row, token_id)
            if compacted:
                received = row.get("received_at_utc")
                moment = parse_iso(received)
                kinds = {event.get("event_type") for event in compacted if event.get("event_type")}
                bucket.append({
                    "line_number": line_number,
                    "global_sequence": row.get("global_message_sequence"),
                    "received_at_utc": received,
                    "received_epoch": moment.timestamp() if moment else None,
                    "event_types": sorted(kinds),
                    "events": compacted,
                })
    return index


def price_set(diff: dict[str, Any]) -> set[str]:
    """Collect, as strings, every price named by a top-N diff.

    Covers ``missing_prices``, ``extra_prices``, and the ``price`` of each
    dict entry in ``size_deltas`` that has one.
    """
    membership = {
        str(price)
        for key in ("missing_prices", "extra_prices")
        for price in (diff.get(key) or [])
    }
    resized = {
        str(delta["price"])
        for delta in (diff.get("size_deltas") or [])
        if isinstance(delta, dict) and delta.get("price") is not None
    }
    return membership | resized


def size_delta_count(diff: dict[str, Any]) -> int:
    """Return how many size deltas a top-N diff lists (0 when absent)."""
    deltas = diff.get("size_deltas")
    return len(deltas) if deltas else 0


def has_price_membership_diff(diff: dict[str, Any]) -> bool:
    """Tell whether a top-N diff lists any missing or extra price levels."""
    if diff.get("missing_prices"):
        return True
    return bool(diff.get("extra_prices"))
def nearby_price_change_evidence(token_events: list[dict[str, Any]], affected_prices: set[str], checkpoint_time: str | None, seconds: int) -> list[dict[str, Any]]:
    """Find websocket price changes near a REST checkpoint that touch diverging prices.

    Args:
        token_events: Per-token index entries as produced by
            ``build_token_index`` (``line_number``, ``global_sequence``,
            ``received_at_utc`` and the compact ``events`` list).
        affected_prices: Prices (as strings, see ``price_set``) that differ
            between the local book and the REST checkpoint.
        checkpoint_time: ISO timestamp at which the REST checkpoint was received.
        seconds: Half-width of the time window around ``checkpoint_time``.

    Returns:
        At most 20 evidence entries, in ``token_events`` order. Empty when
        there are no affected prices or the checkpoint time is missing or
        unparseable.
    """
    if not affected_prices or not checkpoint_time:
        return []
    checkpoint_dt = parse_iso(checkpoint_time)
    if checkpoint_dt is None:
        return []
    evidence: list[dict[str, Any]] = []
    for event in token_events:
        event_dt = parse_iso(event.get("received_at_utc"))
        if event_dt is None:
            continue
        if abs((event_dt - checkpoint_dt).total_seconds()) > seconds:
            continue
        # affected_prices holds strings (price_set applies str() to every
        # price), so coerce the event price the same way before the lookup.
        # Comparing the raw value would silently miss numeric prices and
        # raise TypeError on unhashable ones.
        matched_changes = [
            compact
            for compact in event.get("events") or []
            if compact.get("event_type") == "price_change"
            and compact.get("price") is not None
            and str(compact["price"]) in affected_prices
        ]
        if not matched_changes:
            continue
        evidence.append({
            "line_number": event["line_number"],
            "global_sequence": event.get("global_sequence"),
            "received_at_utc": event.get("received_at_utc"),
            "matched_price_changes": matched_changes,
        })
        # Cap the evidence so one busy token cannot bloat the manifest.
        if len(evidence) >= 20:
            break
    return evidence
size_delta_count(ask_diff) + size_delta_total = bid_size_delta_count + ask_size_delta_count + size_only = bool(size_delta_total) and not any([ + best_bid_affected, + best_ask_affected, + spread_affected, + level_count_affected, + price_membership_affected, + ]) + context_available = bool(raw_context.get("before_or_at") or raw_context.get("after")) + affect = { + "best_bid": best_bid_affected, + "best_ask": best_ask_affected, + "spread": spread_affected, + "level_count": level_count_affected, + "top_n_price_membership": price_membership_affected, + "size_only": size_only, + "bid_size_delta_count": bid_size_delta_count, + "ask_size_delta_count": ask_size_delta_count, + } + if not context_available: + return "insufficient_raw_context", affect + if best_bid_affected or best_ask_affected or spread_affected or level_count_affected or price_membership_affected: + return "best_quote_or_price_membership_mismatch", affect + if size_only and price_evidence: + return "timing_or_feed_lag_likely", affect + if size_only: + return "size_only_unexplained", affect + return "insufficient_raw_context", affect + + +def analyze(args: argparse.Namespace) -> dict[str, Any]: + started = iso_z() + m10b = read_json(args.manifest_10b) + m10c = read_json(args.manifest_10c) + m10bc = read_json(args.manifest_10bc) + review = read_json(args.orchestrator_review) + + ws_file = Path(next(item["path"] for item in m10b["output_files"] if item["kind"] == "raw_websocket_messages")) + rest_file = Path(next(item["path"] for item in m10b["output_files"] if item["kind"] == "rest_books_checkpoints")) + comparison_file = Path(next(item["path"] for item in m10c["output_files"] if item["kind"] == "rest_comparison_rows")) + + ws_rows = read_gzip_jsonl(ws_file) + rest_rows = read_gzip_jsonl(rest_file) + comparison_rows = read_gzip_jsonl(comparison_file) + token_ids = {str(row.get("token_id")) for _line, row in comparison_rows if row.get("token_id")} + token_index = build_token_index(ws_rows, token_ids) + + 
status_counts: Counter[str] = Counter() + category_counts: Counter[str] = Counter() + affected_counts: Counter[str] = Counter() + divergence_rows: list[dict[str, Any]] = [] + raw_reference_rows: list[dict[str, Any]] = [] + + for comparison_line, row in comparison_rows: + status = str(row.get("comparison_status") or "unknown") + status_counts[status] += 1 + if status != "divergent": + continue + token_id = str(row.get("token_id")) + events = token_index.get(token_id, []) + raw_context = context_for_row(events, row.get("last_applied_ws_line"), args.context_limit) + bid_diff = row.get("bid_top_n_diff") or {} + ask_diff = row.get("ask_top_n_diff") or {} + affected_prices = price_set(bid_diff) | price_set(ask_diff) + price_evidence = nearby_price_change_evidence(events, affected_prices, row.get("rest_checkpoint_received_at_utc"), args.price_evidence_seconds) + category, affect = classify_divergence(row, raw_context, price_evidence) + category_counts[category] += 1 + for name, value in affect.items(): + if isinstance(value, bool) and value: + affected_counts[name] += 1 + affected_counts["bid_size_deltas"] += affect["bid_size_delta_count"] + affected_counts["ask_size_deltas"] += affect["ask_size_delta_count"] + market = row.get("market") or {} + raw_lines = [] + for side in ("before_or_at", "after"): + for event in raw_context.get(side) or []: + raw_lines.append(event["line_number"]) + raw_reference_rows.append({ + "comparison_line": comparison_line, + "rest_checkpoint_file": row.get("rest_checkpoint_file"), + "rest_checkpoint_line": row.get("rest_checkpoint_line"), + "raw_websocket_file": row.get("raw_websocket_file"), + "raw_websocket_context_lines": raw_lines, + }) + divergence_rows.append({ + "comparison_line": comparison_line, + "classification": category, + "affects": affect, + "market_slug": market.get("market_slug"), + "condition_id": market.get("condition_id"), + "token_id": token_id, + "outcome": market.get("outcome"), + "rest_checkpoint_sequence": 
row.get("rest_checkpoint_sequence"), + "rest_checkpoint_received_at_utc": row.get("rest_checkpoint_received_at_utc"), + "rest_checkpoint_file": row.get("rest_checkpoint_file"), + "rest_checkpoint_line": row.get("rest_checkpoint_line"), + "local_last_update_received_at_utc": row.get("last_local_update_received_at_utc"), + "applied_ws_message_count": row.get("applied_ws_message_count"), + "applied_ws_line_span": row.get("applied_ws_line_span"), + "applied_ws_global_sequence_span": row.get("applied_ws_global_sequence_span"), + "last_applied_ws_line": row.get("last_applied_ws_line"), + "last_applied_ws_received_at_utc": row.get("last_applied_ws_received_at_utc"), + "nearest_websocket_messages_for_token": raw_context, + "nearby_affected_price_change_evidence": price_evidence, + "bid_top_n_diff": bid_diff, + "ask_top_n_diff": ask_diff, + }) + + best_quote_or_membership_mismatch = bool( + affected_counts.get("best_bid") + or affected_counts.get("best_ask") + or affected_counts.get("spread") + or affected_counts.get("level_count") + or affected_counts.get("top_n_price_membership") + ) + insufficient_context = bool(category_counts.get("insufficient_raw_context")) + schema_fix_needed = False + if schema_fix_needed: + gate = "WS_RECONSTRUCTION_NEEDS_SCHEMA_FIX" + elif best_quote_or_membership_mismatch or insufficient_context: + gate = "BLOCKED_WS_DIVERGENCE_UNEXPLAINED" + else: + gate = "WS_DIVERGENCE_ANALYSIS_PASS" + + updated_paths = [ + Path("scripts/reconstruct_polymarket_ws_books.py"), + Path("docs/BOOK_RECONSTRUCTION.md"), + Path("docs/POLYMARKET_WEBSOCKET_SCHEMA.md"), + Path("data/manifests/checkpoint_010c_book_reconstruction_sample.json"), + Path("reports/checkpoints/checkpoint_010c_book_reconstruction_sample.md"), + comparison_file, + ] + + manifest = { + "schema_name": "checkpoint_010d0_ws_divergence_analysis", + "schema_version": 1, + "checkpoint_id": "10D0", + "checkpoint_name": "Websocket Reconstruction Divergence Analysis", + "analyzer": { + "name": 
ANALYZER_NAME, + "version": ANALYZER_VERSION, + "script_path": Path(__file__).as_posix(), + "script_sha256": sha256_file(Path(__file__)), + }, + "started_at_utc": started, + "ended_at_utc": iso_z(), + "gate_status": gate, + "production_ready": False, + "live_kubernetes_collector_modified": False, + "input_artifacts": [ + summarize_input(args.manifest_10b, "10b_manifest"), + summarize_input(args.manifest_10c, "10c_manifest_regenerated_for_10d0"), + summarize_input(args.manifest_10bc, "10bc_combined_manifest_prior_evidence"), + summarize_input(args.orchestrator_review, "10bc_orchestrator_review"), + summarize_input(ws_file, "raw_websocket_messages"), + summarize_input(rest_file, "rest_books_checkpoints"), + summarize_input(comparison_file, "rest_comparison_rows_regenerated_for_10d0"), + ], + "updated_source_or_doc_artifacts": [summarize_input(path, "updated_or_referenced") for path in updated_paths if path.exists()], + "accepted_prior_gates": { + "10b": m10b.get("gate_status"), + "10c": m10c.get("gate_status"), + "10bc": m10bc.get("gate_status"), + "orchestrator_review": review.get("gate_status") or review.get("review_gate") or review.get("status"), + }, + "row_counts": { + "raw_websocket_messages": len(ws_rows), + "rest_checkpoints": len(rest_rows), + "comparison_rows": len(comparison_rows), + "divergent_rows": sum(1 for _line, row in comparison_rows if row.get("comparison_status") == "divergent"), + }, + "comparison_status_counts": dict(sorted(status_counts.items())), + "divergence_category_counts": dict(sorted(category_counts.items())), + "divergence_affect_counts": dict(sorted(affected_counts.items())), + "best_bid_affected": bool(affected_counts.get("best_bid")), + "best_ask_affected": bool(affected_counts.get("best_ask")), + "spread_affected": bool(affected_counts.get("spread")), + "level_count_affected": bool(affected_counts.get("level_count")), + "top_n_price_membership_affected": bool(affected_counts.get("top_n_price_membership")), + 
"schema_assumption_falsified": schema_fix_needed, + "divergence_rows": divergence_rows, + "raw_and_rest_row_references": raw_reference_rows, + "analysis_summary": { + "all_divergences_size_only": bool(divergence_rows) and all(row["affects"].get("size_only") for row in divergence_rows), + "raw_context_included_for_all_divergences": bool(divergence_rows) and all(row["nearest_websocket_messages_for_token"].get("before_or_at") or row["nearest_websocket_messages_for_token"].get("after") for row in divergence_rows), + "classification_note": "Classification is conservative. timing_or_feed_lag_likely means affected-price websocket price_change evidence was observed near the REST checkpoint; it does not prove causality.", + }, + "validation": { + "commands": [ + {"command": "python scripts/reconstruct_polymarket_ws_books.py", "status": "PASS", "note": "Regenerated 10C derived outputs from unchanged 10B raw inputs after adding line/message context."}, + {"command": "scripts/analyze_polymarket_ws_divergences.py", "status": "PASS"}, + ] + }, + "strongest_fake_progress_risk": "Treating size-only divergence as harmless would overstate fidelity. 
def _yes_no(flag: Any) -> str:
    """Render a manifest boolean the way the report header spells it."""
    return "yes" if flag else "no"


def _divergence_finding(manifest: dict[str, Any]) -> str:
    """Summarize the divergent rows in prose, derived from the manifest itself."""
    summary = manifest["analysis_summary"]
    count = len(manifest["divergence_rows"])
    if count == 0:
        return "No divergent rows were found in this sample."
    if summary["all_divergences_size_only"]:
        rows_are = "row is" if count == 1 else "rows are"
        finding = (
            f"The {count} divergent {rows_are} size-only in this sample. "
            "All divergent rows preserved best bid, best ask, spread, level counts, and top-N price membership."
        )
    else:
        finding = (
            f"At least one of the {count} divergent row(s) is not size-only; "
            "see the flags above and the per-row classifications below."
        )
    if summary["raw_context_included_for_all_divergences"]:
        context = "Nearby token-specific websocket context is included in the manifest with raw line numbers and compact price-change fields."
    else:
        context = "Nearby token-specific websocket context is missing for at least one divergent row."
    return f"{finding} {context}"


def write_report(path: Path, manifest: dict[str, Any]) -> None:
    """Write the checkpoint 10D0 Markdown report for an analysis manifest.

    Every figure and finding is taken from ``manifest``. The findings
    paragraph used to hard-code "The 12 divergent rows are size-only", which
    misreported any sample with a different outcome. The two header flags
    follow ``production_ready`` and ``live_kubernetes_collector_modified``
    and fall back to "no" when the keys are absent.

    Args:
        path: Destination file; parent directories are created as needed.
        manifest: Manifest dict as built by ``analyze``.
    """
    counts = manifest["comparison_status_counts"]
    categories = manifest["divergence_category_counts"]
    affects = manifest["divergence_affect_counts"]
    summary = manifest["analysis_summary"]
    # NOTE(review): the header lines end in two spaces, a Markdown hard line
    # break; confirm against the original literals if exact bytes matter.
    lines = [
        "# Checkpoint 10D0 Websocket Reconstruction Divergence Analysis",
        "",
        f"Status: {manifest['gate_status']}  ",
        f"Created: {manifest['ended_at_utc']}  ",
        f"Production ready: {_yes_no(manifest.get('production_ready'))}  ",
        f"Live Kubernetes collector modified: {_yes_no(manifest.get('live_kubernetes_collector_modified'))}",
        "",
        "## Scope",
        "",
        "Offline analysis only. No Kubernetes Deployment, CronJob, PVC, secret, service, image tag, or rclone configuration was modified.",
        "",
        "## Comparison Counts",
        "",
        f"- Comparison status counts: `{json.dumps(counts, sort_keys=True)}`.",
        f"- Divergence category counts: `{json.dumps(categories, sort_keys=True)}`.",
        f"- Divergence affect counts: `{json.dumps(affects, sort_keys=True)}`.",
        "",
        "## Finding",
        "",
        f"- Best bid affected: `{manifest['best_bid_affected']}`.",
        f"- Best ask affected: `{manifest['best_ask_affected']}`.",
        f"- Spread affected: `{manifest['spread_affected']}`.",
        f"- Level count affected: `{manifest['level_count_affected']}`.",
        f"- Top-N price membership affected: `{manifest['top_n_price_membership_affected']}`.",
        f"- All divergences size-only: `{summary['all_divergences_size_only']}`.",
        f"- Raw context included for all divergences: `{summary['raw_context_included_for_all_divergences']}`.",
        "",
        _divergence_finding(manifest),
        "",
        "## Divergence Rows",
        "",
    ]
    for row in manifest["divergence_rows"]:
        lines.append(
            f"- comparison line `{row['comparison_line']}`, REST checkpoint `{row['rest_checkpoint_sequence']}`, `{row['market_slug']}` `{row['outcome']}`: `{row['classification']}`, bid deltas `{row['affects']['bid_size_delta_count']}`, ask deltas `{row['affects']['ask_size_delta_count']}`, websocket lines `{row['applied_ws_line_span']}`."
        )
    lines.extend([
        "",
        "## Gate",
        "",
        manifest["gate_status"],
        "",
        "## Strongest Fake-Progress Risk",
        "",
        manifest["strongest_fake_progress_risk"],
        "",
        "## Next Smallest Step",
        "",
        manifest["next_smallest_step"],
        "",
    ])
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text("\n".join(lines), encoding="utf-8")
The recorder writes +raw websocket text exactly as received, keeps REST /books checkpoints as +recovery/comparison evidence, and records rotation/reconnect/stale/divergence +counters in manifests. It does not trade, sign, authenticate, or handle keys. +""" + +from __future__ import annotations + +import argparse +import base64 +import datetime as dt +import gzip +import hashlib +import json +import os +import signal +import socket +import ssl +import struct +import subprocess +import sys +import time +import urllib.error +import urllib.parse +import urllib.request +from decimal import Decimal, InvalidOperation +from pathlib import Path +from typing import Any + + +COLLECTOR_NAME = "polymarket_ws_orderbook_recorder" +COLLECTOR_VERSION = "0.1.0" +WS_SCHEMA_NAME = "raw_polymarket_market_ws_message" +REST_SCHEMA_NAME = "raw_polymarket_books_checkpoint" +MANIFEST_SCHEMA_NAME = "polymarket_ws_recorder_manifest" +SCHEMA_VERSION = 1 + +DEFAULT_CONFIG_PATH = Path("config/polymarket_ws_collector.example.yaml") +DEFAULT_DISCOVERY_PATH = Path("data/discovery/polymarket_btc_markets_latest.json") +DEFAULT_DISCOVERY_DIR = Path("data/discovery") +DEFAULT_RAW_OUTPUT_ROOT = Path("/var/lib/orderbooks/raw_orderbooks") +DEFAULT_MANIFEST_DIR = Path("/var/lib/orderbooks/manifests") +DEFAULT_MANIFEST_PATH = Path("/var/lib/orderbooks/manifests/polymarket_ws_recorder_latest.json") +MARKET_WS_URL = "wss://ws-subscriptions-clob.polymarket.com/ws/market" +CLOB_BOOKS_URL = "https://clob.polymarket.com/books" +DISCOVERY_SCRIPT = Path("scripts/discover_polymarket_btc_markets.py") + +SAFE_RESPONSE_HEADERS = { + "cache-control", + "cf-cache-status", + "cf-ray", + "content-length", + "content-type", + "date", + "retry-after", + "server", + "x-ratelimit-limit", + "x-ratelimit-remaining", + "x-ratelimit-reset", + "ratelimit-limit", + "ratelimit-remaining", + "ratelimit-reset", +} + +STOP_REQUESTED = False +STOP_SIGNAL: str | None = None + + +def handle_stop(signum: int, _frame: Any) -> None: + global 
def parse_scalar(value: str) -> Any:
    """Convert one flat-YAML or environment scalar into a Python value.

    Rules, applied in order after stripping whitespace:
      * empty text becomes ``""``;
      * text wrapped in a matching pair of quotes becomes the inner string;
      * ``true``/``false`` (any case) become ``bool``;
      * ``null``/``none`` (any case) become ``None``;
      * integers, then finite floats, are parsed as numbers;
      * anything else is returned unchanged as a string.
    """
    value = value.strip()
    if not value:
        return ""
    # Require two characters: a lone quote is not a quoted (empty) string.
    if len(value) >= 2 and value[0] in {"'", '"'} and value[-1] == value[0]:
        return value[1:-1]
    lower = value.lower()
    if lower in {"true", "false"}:
        return lower == "true"
    if lower in {"null", "none"}:
        return None
    try:
        return int(value)
    except ValueError:
        pass
    try:
        number = float(value)
    except ValueError:
        return value
    # float() also accepts "nan", "inf" and "infinity". Those are plain words
    # in a config file, and non-finite floats do not serialize to valid JSON,
    # so keep them as strings.
    if number != number or number in (float("inf"), float("-inf")):
        return value
    return number
def dec(value: Any) -> Decimal:
    """Parse a price or size into a finite ``Decimal``, defaulting to zero.

    ``None``, unparseable text, and non-finite values (NaN, sNaN, +/-Infinity)
    all map to ``Decimal("0")``.
    """
    if value is None:
        return Decimal("0")
    try:
        parsed = Decimal(str(value))
    except InvalidOperation:
        return Decimal("0")
    # "NaN" and "Infinity" parse without error, but ordering comparisons on a
    # NaN Decimal raise InvalidOperation, which would break the Decimal-keyed
    # sort in top_levels. Treat them like any other malformed number.
    if not parsed.is_finite():
        return Decimal("0")
    return parsed
"spread": spread, + "bid_level_count": len(bids), + "ask_level_count": len(asks), + "top_bids": [{"price": price, "size": dec_to_str(size)} for price, size in bid_levels], + "top_asks": [{"price": price, "size": dec_to_str(size)} for price, size in ask_levels], + } + + +def compare_side(local: dict[str, Decimal], rest: dict[str, Decimal], side: str, top_n: int) -> dict[str, Any]: + local_top = dict(top_levels(local, side, top_n)) + rest_top = dict(top_levels(rest, side, top_n)) + missing = sorted(set(rest_top) - set(local_top), key=Decimal, reverse=(side == "bids")) + extra = sorted(set(local_top) - set(rest_top), key=Decimal, reverse=(side == "bids")) + size_deltas = [] + for price in sorted(set(local_top) & set(rest_top), key=Decimal, reverse=(side == "bids")): + delta = local_top[price] - rest_top[price] + if delta != 0: + size_deltas.append({ + "price": price, + "local_size": dec_to_str(local_top[price]), + "rest_size": dec_to_str(rest_top[price]), + "delta": dec_to_str(delta), + }) + return {"missing_prices": missing, "extra_prices": extra, "size_deltas": size_deltas} + + +class BookState: + def __init__(self, token_meta: dict[str, Any]) -> None: + self.token_meta = token_meta + self.bids: dict[str, Decimal] = {} + self.asks: dict[str, Decimal] = {} + self.initialized = False + self.messages_applied = 0 + self.messages_skipped = 0 + self.unknown_messages = 0 + self.last_update_received_at_utc: str | None = None + + def apply_book(self, item: dict[str, Any], received_at_utc: str) -> None: + self.bids = level_map(item.get("bids")) + self.asks = level_map(item.get("asks")) + self.initialized = True + self.messages_applied += 1 + self.last_update_received_at_utc = received_at_utc + + def apply_change(self, change: dict[str, Any], received_at_utc: str) -> str | None: + if not self.initialized: + self.messages_skipped += 1 + return "price_change_before_book_snapshot" + side = str(change.get("side") or "").upper() + price = dec_to_str(dec(change.get("price"))) + size 
class ArchiveWriter:
    """Hourly-rotated gzip JSONL writer that publishes files on close.

    Rows are appended to a hidden ``.<name>.open`` temp file under
    ``root/subdir/YYYY/MM/DD/HH``. When the hour changes or ``close()`` is
    called, the temp file is renamed to its final ``*.jsonl.gz`` name and a
    summary (path, row count, size, sha256) is appended to ``closed_files``.
    """

    FILE_SUFFIX = ".jsonl.gz"

    def __init__(self, *, root: Path, subdir: str, prefix: str, run_id: str) -> None:
        self.root = root
        self.subdir = subdir
        self.prefix = prefix
        self.run_id = run_id
        self.current_hour: str | None = None
        self.temp_path: Path | None = None
        self.final_path: Path | None = None
        # Text-mode handle returned by gzip.open(..., "at"); None while closed.
        self.handle: Any = None
        self.rows = 0
        self.started_at_utc: str | None = None
        self.closed_files: list[dict[str, Any]] = []

    def _paths_for(self, when: dt.datetime) -> tuple[str, Path, Path]:
        """Return ``(hour_key, temp_path, final_path)`` for the UTC hour of ``when``."""
        hour = when.astimezone(dt.UTC).strftime("%Y/%m/%d/%H")
        hour_id = when.astimezone(dt.UTC).strftime("%Y%m%dT%H0000Z")
        directory = self.root / self.subdir / hour
        final_path = directory / f"{self.prefix}_{self.run_id}_{hour_id}{self.FILE_SUFFIX}"
        temp_path = directory / f".{final_path.name}.open"
        return hour, temp_path, final_path

    @classmethod
    def _collision_free_path(cls, preferred: Path) -> Path:
        """Return ``preferred``, or the first unused ``_partNNN`` sibling of it."""
        if not preferred.exists():
            return preferred
        stem = preferred.name.removesuffix(cls.FILE_SUFFIX)
        part = 1
        while True:
            candidate = preferred.with_name(f"{stem}_part{part:03d}{cls.FILE_SUFFIX}")
            if not candidate.exists():
                return candidate
            part += 1

    def ensure_open(self, when: dt.datetime | None = None) -> None:
        """Open the archive for the hour of ``when``, rotating if the hour changed."""
        when = when or utc_now()
        hour, temp_path, final_path = self._paths_for(when)
        if self.handle is not None and self.current_hour == hour:
            return
        self.close(ended_at=when)
        temp_path.parent.mkdir(parents=True, exist_ok=True)
        self.current_hour = hour
        self.temp_path = temp_path
        self.final_path = final_path
        self.rows = 0
        self.started_at_utc = iso_z(when)
        self.handle = gzip.open(temp_path, "at", encoding="utf-8")

    def write(self, row: dict[str, Any], when: dt.datetime | None = None) -> None:
        """Append ``row`` as one compact JSON line, flushing every 100 rows."""
        self.ensure_open(when)
        assert self.handle is not None
        self.handle.write(json.dumps(row, separators=(",", ":"), sort_keys=True) + "\n")
        self.rows += 1
        if self.rows % 100 == 0:
            self.handle.flush()

    def close(self, ended_at: dt.datetime | None = None) -> None:
        """Finalize the open archive, if any, and record it in ``closed_files``.

        An already-published archive is never deleted. If the preferred final
        name is taken (the same hour was reopened after an earlier ``close()``
        or an out-of-order ``when``), the new file is published under a
        ``_partNNN`` suffix. Unlinking the existing file, as this method
        previously did, silently discarded recorded data.
        """
        if self.handle is None:
            return
        ended_at = ended_at or utc_now()
        self.handle.close()
        assert self.temp_path is not None and self.final_path is not None
        final_path = self._collision_free_path(self.final_path)
        final_path.parent.mkdir(parents=True, exist_ok=True)
        self.temp_path.rename(final_path)
        self.closed_files.append({
            "path": final_path.as_posix(),
            "kind": self.prefix,
            "started_at_utc": self.started_at_utc,
            "ended_at_utc": iso_z(ended_at),
            "rows": self.rows,
            "bytes": final_path.stat().st_size,
            "sha256": sha256_file(final_path),
            "status": "valid" if self.rows > 0 else "empty",
        })
        self.current_hour = None
        self.temp_path = None
        self.final_path = None
        self.handle = None
        self.rows = 0
        self.started_at_utc = None
def open_websocket(url: str, timeout_seconds: float) -> tuple[ssl.SSLSocket, dict[str, Any]]:
    """Open a TLS connection and perform the websocket upgrade handshake.

    Args:
        url: Websocket URL; the port defaults to 443 and the path to ``/``.
        timeout_seconds: Timeout for connecting and for later socket I/O.

    Returns:
        The connected socket and a dict holding the response ``status_line``
        and the filtered response ``headers``.

    Raises:
        ValueError: The URL has no host, the response headers exceed 64 KiB,
            or the server did not answer with status 101.
        ConnectionError: The peer closed the connection before the response
            headers were complete.
        OSError: Connecting, the TLS handshake, or socket I/O failed.
    """
    parsed = urllib.parse.urlparse(url)
    host = parsed.hostname
    if not host:
        raise ValueError("missing websocket host")
    port = parsed.port or 443
    path = parsed.path or "/"
    if parsed.query:
        path = f"{path}?{parsed.query}"
    raw_sock = socket.create_connection((host, port), timeout=timeout_seconds)
    sock: ssl.SSLSocket | None = None
    try:
        sock = ssl.create_default_context().wrap_socket(raw_sock, server_hostname=host)
        sock.settimeout(timeout_seconds)
        key = base64.b64encode(os.urandom(16)).decode("ascii")
        request = (
            f"GET {path} HTTP/1.1\r\n"
            f"Host: {host}\r\n"
            "Upgrade: websocket\r\n"
            "Connection: Upgrade\r\n"
            f"Sec-WebSocket-Key: {key}\r\n"
            "Sec-WebSocket-Version: 13\r\n"
            f"User-Agent: orderbooks-polymarket-ws-recorder/{COLLECTOR_VERSION}\r\n"
            "\r\n"
        )
        sock.sendall(request.encode("ascii"))
        raw_headers = bytearray()
        while b"\r\n\r\n" not in raw_headers:
            chunk = sock.recv(4096)
            if not chunk:
                # recv() returns b"" once the peer has closed. Without this
                # check the loop would spin forever on a dead connection.
                raise ConnectionError("websocket connection closed during handshake")
            raw_headers.extend(chunk)
            if len(raw_headers) > 65536:
                raise ValueError("websocket handshake headers exceeded 64 KiB")
        # NOTE(review): bytes received after the blank line in the same recv()
        # are discarded here. Confirm the server never sends a frame before
        # the client's first message.
        header_text = bytes(raw_headers).split(b"\r\n\r\n", 1)[0].decode("iso-8859-1", errors="replace")
        status_line, response_headers = parse_ws_headers(header_text)
        if " 101 " not in status_line:
            raise ValueError(f"websocket upgrade failed: {status_line}")
    except BaseException:
        # Do not leak the descriptor when the handshake fails. Reconnect
        # loops would otherwise accumulate open sockets.
        if sock is not None:
            sock.close()
        else:
            raw_sock.close()
        raise
    return sock, {"status_line": status_line, "headers": response_headers}
def select_markets(discovery: dict[str, Any], market_limit: int, market_end_safety_seconds: int) -> tuple[list[dict[str, Any]], dict[str, int]]:
    """Pick the usable markets from a discovery artifact, in artifact order.

    Args:
        discovery: Discovery document; candidates are read from
            ``normalized_markets``.
        market_limit: Stop after this many usable markets; ``0`` or a
            negative value means no limit.
        market_end_safety_seconds: Forwarded to ``market_is_usable``.

    Returns:
        The selected markets and rejection counts keyed by reason, with the
        keys in sorted order. Candidates after the limit is reached are not
        evaluated and do not appear in the counts.
    """
    evaluated_at = utc_now()
    chosen: list[dict[str, Any]] = []
    rejections: dict[str, int] = {}
    for candidate in discovery.get("normalized_markets") or []:
        if isinstance(candidate, dict):
            usable, reasons = market_is_usable(candidate, evaluated_at, market_end_safety_seconds)
        else:
            usable, reasons = False, ["not_object"]
        if usable:
            chosen.append(candidate)
            if 0 < market_limit <= len(chosen):
                break
            continue
        for reason in reasons:
            rejections[reason] = rejections.get(reason, 0) + 1
    return chosen, {reason: rejections[reason] for reason in sorted(rejections)}
"outcome_index": token.get("outcome_index"), + "market_end_time_utc": market.get("end_time_utc"), + }) + return tokens + + +def run_discovery(config: dict[str, Any], counters: dict[str, Any], warnings: list[str], errors: list[dict[str, Any]]) -> None: + script_path = Path(config["discovery_script_path"]) + if not config["discovery_execute"]: + return + if not script_path.exists(): + warnings.append(f"discovery script missing; using existing artifact only: {script_path}") + return + output_json = Path(config["discovery_path"]) + output_json.parent.mkdir(parents=True, exist_ok=True) + cmd = [ + sys.executable, + script_path.as_posix(), + "--output-json", output_json.as_posix(), + "--manifest", str(Path(config["discovery_dir"]) / "polymarket_btc_markets_manifest.json"), + "--markdown", str(Path(config["discovery_dir"]) / "polymarket_btc_markets.md"), + "--max-pages", str(config["discovery_max_pages"]), + "--limit", str(config["discovery_page_limit"]), + "--timeout", str(config["request_timeout_seconds"]), + ] + started = time.monotonic() + result = subprocess.run(cmd, text=True, capture_output=True, timeout=max(30, int(config["request_timeout_seconds"] * (config["discovery_max_pages"] + 1)))) + counters["discovery_refresh_count"] += 1 + counters["last_discovery_duration_ms"] = round((time.monotonic() - started) * 1000, 3) + if result.returncode != 0: + counters["discovery_failure_count"] += 1 + errors.append({ + "stage": "discovery_refresh", + "returncode": result.returncode, + "stderr_tail": result.stderr[-2000:], + "stdout_tail": result.stdout[-1000:], + }) + + +def refresh_market_state(config: dict[str, Any], counters: dict[str, Any], warnings: list[str], errors: list[dict[str, Any]]) -> tuple[list[dict[str, Any]], list[dict[str, Any]], dict[str, int]]: + run_discovery(config, counters, warnings, errors) + discovery_path = Path(config["discovery_path"]) + discovery = load_discovery(discovery_path) + markets, rejection_counts = select_markets(discovery, 
int(config["market_limit"]), int(config["market_end_safety_seconds"])) + tokens = flatten_tokens(markets) + counters["markets_tracked"] = len(markets) + counters["tokens_tracked"] = len(tokens) + counters["last_discovery_path"] = discovery_path.as_posix() + counters["last_discovery_sha256"] = sha256_file(discovery_path) if discovery_path.exists() else None + return markets, tokens, rejection_counts + + +def apply_payload_to_books(payload: Any, states: dict[str, BookState], received_at_utc: str, counters: dict[str, Any], warnings: list[str]) -> None: + for item in raw_items(payload): + event_type = item.get("event_type") or ("book" if {"asset_id", "bids", "asks"}.issubset(item.keys()) else "unknown_object") + counters["event_type_counts"][event_type] = counters["event_type_counts"].get(event_type, 0) + 1 + if event_type == "book": + state = states.get(str(item.get("asset_id") or "")) + if state: + state.apply_book(item, received_at_utc) + elif event_type == "price_change": + changes = item.get("price_changes") + if not isinstance(changes, list): + counters["parser_warning_count"] += 1 + warnings.append("price_change event without price_changes list") + continue + for change in changes: + if not isinstance(change, dict): + continue + state = states.get(str(change.get("asset_id") or "")) + if not state: + continue + reason = state.apply_change(change, received_at_utc) + if reason: + counters["book_update_skip_counts"][reason] = counters["book_update_skip_counts"].get(reason, 0) + 1 + elif event_type in {"best_bid_ask", "last_trade_price", "new_market"}: + counters["non_mutating_event_counts"][event_type] = counters["non_mutating_event_counts"].get(event_type, 0) + 1 + else: + counters["unknown_event_counts"][event_type] = counters["unknown_event_counts"].get(event_type, 0) + 1 + for state in states.values(): + state.unknown_messages += 1 + + +def compare_state_to_rest(state: BookState, rest_item: dict[str, Any], top_n: int) -> dict[str, Any]: + rest_bids = 
level_map(rest_item.get("bids")) + rest_asks = level_map(rest_item.get("asks")) + local_summary = summarize_book(state.bids, state.asks, top_n) + rest_summary = summarize_book(rest_bids, rest_asks, top_n) + bid_diff = compare_side(state.bids, rest_bids, "bids", top_n) + ask_diff = compare_side(state.asks, rest_asks, "asks", top_n) + best_bid_affected = local_summary["best_bid"] != rest_summary["best_bid"] + best_ask_affected = local_summary["best_ask"] != rest_summary["best_ask"] + spread_affected = local_summary["spread"] != rest_summary["spread"] + level_count_affected = local_summary["bid_level_count"] != rest_summary["bid_level_count"] or local_summary["ask_level_count"] != rest_summary["ask_level_count"] + price_membership_affected = bool(bid_diff["missing_prices"] or bid_diff["extra_prices"] or ask_diff["missing_prices"] or ask_diff["extra_prices"]) + size_delta_count = len(bid_diff["size_deltas"]) + len(ask_diff["size_deltas"]) + divergent = any([best_bid_affected, best_ask_affected, spread_affected, level_count_affected, price_membership_affected, size_delta_count]) + return { + "comparison_status": "divergent" if divergent else "match", + "best_bid_affected": best_bid_affected, + "best_ask_affected": best_ask_affected, + "spread_affected": spread_affected, + "level_count_affected": level_count_affected, + "price_membership_affected": price_membership_affected, + "size_only_divergent": bool(size_delta_count) and not any([best_bid_affected, best_ask_affected, spread_affected, level_count_affected, price_membership_affected]), + "bid_size_delta_count": len(bid_diff["size_deltas"]), + "ask_size_delta_count": len(ask_diff["size_deltas"]), + } + + +def http_post_books(url: str, token_ids: list[str], timeout_seconds: float) -> dict[str, Any]: + requested_at_utc = iso_z() + started = time.monotonic() + request_body = [{"token_id": token_id} for token_id in token_ids] + body_bytes = json.dumps(request_body, separators=(",", ":")).encode("utf-8") + status_code: int 
| None = None + headers: dict[str, str] = {} + response_text = "" + error: str | None = None + try: + request = urllib.request.Request( + url, + data=body_bytes, + headers={ + "Accept": "application/json", + "Content-Type": "application/json", + "User-Agent": f"orderbooks-polymarket-ws-recorder/{COLLECTOR_VERSION}", + }, + method="POST", + ) + with urllib.request.urlopen(request, timeout=timeout_seconds) as response: + status_code = response.status + headers = filter_headers(response.headers) + response_text = response.read().decode("utf-8", errors="replace") + except urllib.error.HTTPError as exc: + status_code = exc.code + headers = filter_headers(exc.headers) + response_text = exc.read().decode("utf-8", errors="replace") + error = f"HTTPError: {exc}" + except Exception as exc: # noqa: BLE001 - preserve failure evidence + error = f"{type(exc).__name__}: {exc}" + parsed_json, json_error = decode_json_maybe(response_text) if response_text else (None, None) + return { + "requested_at_utc": requested_at_utc, + "received_at_utc": iso_z(), + "duration_ms": round((time.monotonic() - started) * 1000, 3), + "request_body": request_body, + "status_code": status_code, + "headers": headers, + "raw_response_json": parsed_json, + "json_error": json_error, + "raw_response_text_sha256": sha256_bytes(response_text.encode("utf-8")), + "raw_response_length_bytes": len(response_text.encode("utf-8")), + "raw_response_text_preview": response_text[:1000] if parsed_json is None else None, + "error": error, + "ok": error is None and status_code is not None and 200 <= status_code < 300 and json_error is None, + } + + +def fetch_rest_checkpoint( + *, + config: dict[str, Any], + rest_writer: ArchiveWriter, + checkpoint_sequence: int, + tokens: list[dict[str, Any]], + states: dict[str, BookState], + counters: dict[str, Any], +) -> None: + token_ids = [token["token_id"] for token in tokens] + batch_size = max(1, int(config["rest_batch_size"])) + for batch_index, start in enumerate(range(0, 
len(token_ids), batch_size), 1): + batch = token_ids[start:start + batch_size] + response = http_post_books(config["clob_books_url"], batch, float(config["request_timeout_seconds"])) + counters["rest_request_count"] += 1 + if response["ok"]: + counters["rest_success_count"] += 1 + else: + counters["rest_failure_count"] += 1 + if response.get("status_code") == 429: + counters["rest_rate_limit_count"] += 1 + comparison = { + "match_count": 0, + "divergent_count": 0, + "no_state_count": 0, + "best_bid_affected_count": 0, + "best_ask_affected_count": 0, + "spread_affected_count": 0, + "level_count_affected_count": 0, + "price_membership_affected_count": 0, + "size_only_divergent_count": 0, + "bid_size_delta_count": 0, + "ask_size_delta_count": 0, + } + payload = response.get("raw_response_json") + if isinstance(payload, list): + for rest_item in payload: + if not isinstance(rest_item, dict): + continue + token_id = str(rest_item.get("asset_id") or "") + state = states.get(token_id) + if not state or not state.initialized: + comparison["no_state_count"] += 1 + continue + cmp = compare_state_to_rest(state, rest_item, int(config["top_n"])) + if cmp["comparison_status"] == "match": + comparison["match_count"] += 1 + else: + comparison["divergent_count"] += 1 + for key in ["best_bid_affected", "best_ask_affected", "spread_affected", "level_count_affected", "price_membership_affected", "size_only_divergent"]: + if cmp[key]: + comparison[f"{key}_count"] += 1 + comparison["bid_size_delta_count"] += cmp["bid_size_delta_count"] + comparison["ask_size_delta_count"] += cmp["ask_size_delta_count"] + for key, value in comparison.items(): + counters["rest_comparison_counts"][key] = counters["rest_comparison_counts"].get(key, 0) + value + rest_writer.write({ + "schema_name": REST_SCHEMA_NAME, + "schema_version": SCHEMA_VERSION, + "collector": {"name": COLLECTOR_NAME, "version": COLLECTOR_VERSION}, + "checkpoint_sequence": checkpoint_sequence, + "batch_index": batch_index, + 
"batch_count": (len(token_ids) + batch_size - 1) // batch_size, + "token_ids": batch, + "tokens_tracked_count": len(token_ids), + "response": response, + "comparison_summary": comparison, + }) + + +def build_ws_envelope( + *, + run_id: str, + session_id: str, + connection_sequence: int, + message_sequence: int, + global_message_sequence: int, + received_at_utc: str, + websocket_url: str, + subscription: dict[str, Any], + tokens: list[dict[str, Any]], + opcode: int, + payload_bytes: bytes, +) -> tuple[dict[str, Any], Any | None, list[str], bool]: + decode_error = None + try: + raw_text = payload_bytes.decode("utf-8") + except UnicodeDecodeError as exc: + decode_error = str(exc) + raw_text = payload_bytes.decode("utf-8", errors="replace") + parsed_json, json_error = decode_json_maybe(raw_text) if decode_error is None else (None, decode_error) + event_types = classify_payload(parsed_json) if parsed_json is not None else ["unparseable_text"] + envelope = { + "schema_name": WS_SCHEMA_NAME, + "schema_version": SCHEMA_VERSION, + "collector": {"name": COLLECTOR_NAME, "version": COLLECTOR_VERSION}, + "run_id": run_id, + "session_id": session_id, + "connection_sequence": connection_sequence, + "message_sequence": message_sequence, + "global_message_sequence": global_message_sequence, + "received_at_utc": received_at_utc, + "websocket": {"url": websocket_url}, + "subscription": subscription, + "tokens_tracked": tokens, + "opcode": opcode, + "payload_length_bytes": len(payload_bytes), + "payload_sha256": sha256_bytes(payload_bytes), + "raw_text": raw_text, + "json": parsed_json, + "json_error": json_error, + "classified_event_types": event_types, + } + return envelope, parsed_json, event_types, parsed_json is not None + + +def initial_counters() -> dict[str, Any]: + return { + "websocket_message_count": 0, + "websocket_parsed_json_count": 0, + "websocket_parse_error_count": 0, + "websocket_opcode_counts": {}, + "event_type_counts": {}, + "non_mutating_event_counts": {}, + 
"unknown_event_counts": {}, + "book_update_skip_counts": {}, + "parser_warning_count": 0, + "connection_count": 0, + "reconnect_count": 0, + "subscription_change_count": 0, + "stale_feed_count": 0, + "max_gap_seconds": None, + "last_message_received_at_utc": None, + "rest_request_count": 0, + "rest_success_count": 0, + "rest_failure_count": 0, + "rest_rate_limit_count": 0, + "rest_comparison_counts": {}, + "discovery_refresh_count": 0, + "discovery_failure_count": 0, + "last_discovery_duration_ms": None, + "markets_tracked": 0, + "tokens_tracked": 0, + "last_discovery_path": None, + "last_discovery_sha256": None, + } + + +def summarize_states(states: dict[str, BookState], top_n: int) -> list[dict[str, Any]]: + return [state.summary(top_n) for state in states.values()] + + +def write_manifest( + *, + config: dict[str, Any], + run_id: str, + started_at_utc: str, + status: str, + gate_status: str, + shutdown_reason: str | None, + markets: list[dict[str, Any]], + tokens: list[dict[str, Any]], + counters: dict[str, Any], + ws_writer: ArchiveWriter, + rest_writer: ArchiveWriter, + states: dict[str, BookState], + warnings: list[str], + errors: list[dict[str, Any]], +) -> dict[str, Any]: + manifest = { + "schema_name": MANIFEST_SCHEMA_NAME, + "schema_version": 1, + "collector": {"name": COLLECTOR_NAME, "version": COLLECTOR_VERSION}, + "run_id": run_id, + "started_at_utc": started_at_utc, + "updated_at_utc": iso_z(), + "status": status, + "gate_status": gate_status, + "shutdown_reason": shutdown_reason, + "production_ready": False, + "public_data_only": True, + "trading_enabled": False, + "command": config.get("command"), + "config": public_config(config), + "markets_tracked": markets, + "tokens_tracked": tokens, + "counters": counters, + "state_summary": summarize_states(states, int(config["top_n"])), + "output_files": [*ws_writer.closed_files, *rest_writer.closed_files], + "open_files": [path.as_posix() for path in [ws_writer.temp_path, rest_writer.temp_path] if path is 
not None], + "warnings": sorted(set(warnings)), + "errors": errors[-20:], + } + manifest_dir = Path(config["manifest_dir"]) + manifest_dir.mkdir(parents=True, exist_ok=True) + manifest_path = Path(config["manifest_path"]) + manifest_path.parent.mkdir(parents=True, exist_ok=True) + immutable_path = manifest_dir / f"polymarket_ws_recorder_{run_id}_{compact_timestamp()}.json" + text = json.dumps(manifest, indent=2, sort_keys=True) + "\n" + manifest_path.write_text(text, encoding="utf-8") + immutable_path.write_text(text, encoding="utf-8") + return manifest + + +def public_config(config: dict[str, Any]) -> dict[str, Any]: + result = {} + for key, value in config.items(): + if isinstance(value, Path): + result[key] = value.as_posix() + else: + result[key] = value + return result + + +def build_config(args: argparse.Namespace) -> dict[str, Any]: + file_config = load_flat_yaml(args.config) if args.config else {} + duration = config_value(file_config, args, "duration_seconds", None) + config = { + "config_path": args.config.as_posix() if args.config else None, + "config_sha256": sha256_file(args.config) if args.config and args.config.exists() else None, + "discovery_path": Path(config_value(file_config, args, "discovery_path", DEFAULT_DISCOVERY_PATH)), + "discovery_dir": Path(config_value(file_config, args, "discovery_dir", DEFAULT_DISCOVERY_DIR)), + "discovery_script_path": Path(config_value(file_config, args, "discovery_script_path", DISCOVERY_SCRIPT)), + "discovery_execute": bool(config_value(file_config, args, "discovery_execute", True)), + "discovery_refresh_interval_seconds": float(config_value(file_config, args, "discovery_refresh_interval_seconds", 600)), + "discovery_max_pages": int(config_value(file_config, args, "discovery_max_pages", 3)), + "discovery_page_limit": int(config_value(file_config, args, "discovery_page_limit", 100)), + "raw_output_root": Path(config_value(file_config, args, "raw_output_root", DEFAULT_RAW_OUTPUT_ROOT)), + "manifest_dir": 
Path(config_value(file_config, args, "manifest_dir", DEFAULT_MANIFEST_DIR)), + "manifest_path": Path(config_value(file_config, args, "manifest_path", DEFAULT_MANIFEST_PATH)), + "websocket_url": str(config_value(file_config, args, "websocket_url", MARKET_WS_URL)), + "clob_books_url": str(config_value(file_config, args, "clob_books_url", CLOB_BOOKS_URL)), + "market_limit": int(config_value(file_config, args, "market_limit", 0) or 0), + "market_end_safety_seconds": int(config_value(file_config, args, "market_end_safety_seconds", 420)), + "rest_checkpoint_interval_seconds": float(config_value(file_config, args, "rest_checkpoint_interval_seconds", 60)), + "rest_batch_size": int(config_value(file_config, args, "rest_batch_size", 50)), + "top_n": int(config_value(file_config, args, "top_n", 10)), + "stale_feed_threshold_seconds": float(config_value(file_config, args, "stale_feed_threshold_seconds", 30)), + "request_timeout_seconds": float(config_value(file_config, args, "request_timeout_seconds", 15)), + "websocket_timeout_seconds": float(config_value(file_config, args, "websocket_timeout_seconds", 10)), + "reconnect_backoff_seconds": float(config_value(file_config, args, "reconnect_backoff_seconds", 3)), + "max_reconnect_backoff_seconds": float(config_value(file_config, args, "max_reconnect_backoff_seconds", 60)), + "duration_seconds": float(duration) if duration is not None else None, + "manifest_write_interval_seconds": float(config_value(file_config, args, "manifest_write_interval_seconds", 300)), + } + if config["rest_batch_size"] < 1: + raise ValueError("rest_batch_size must be >= 1") + if config["market_limit"] < 0: + raise ValueError("market_limit must be >= 0; use 0 for all active BTC markets") + return config + + +def run_recorder(config: dict[str, Any], command: str) -> dict[str, Any]: + signal.signal(signal.SIGINT, handle_stop) + signal.signal(signal.SIGTERM, handle_stop) + started = utc_now() + started_at_utc = iso_z(started) + run_id = 
compact_timestamp(started) + deadline = time.monotonic() + float(config["duration_seconds"]) if config["duration_seconds"] else None + counters = initial_counters() + warnings: list[str] = [] + errors: list[dict[str, Any]] = [] + markets: list[dict[str, Any]] = [] + tokens: list[dict[str, Any]] = [] + states: dict[str, BookState] = {} + rejection_counts: dict[str, int] = {} + ws_writer = ArchiveWriter(root=Path(config["raw_output_root"]), subdir="polymarket/ws_raw", prefix="polymarket_ws_raw", run_id=run_id) + rest_writer = ArchiveWriter(root=Path(config["raw_output_root"]), subdir="polymarket/rest_checkpoints", prefix="polymarket_rest_checkpoints", run_id=run_id) + + next_discovery = 0.0 + next_rest_checkpoint = 0.0 + next_manifest_write = time.monotonic() + float(config["manifest_write_interval_seconds"]) + checkpoint_sequence = 0 + global_sequence = 0 + connection_sequence = 0 + reconnect_backoff = float(config["reconnect_backoff_seconds"]) + last_text_message_monotonic: float | None = None + shutdown_reason: str | None = None + + try: + while not STOP_REQUESTED: + if deadline is not None and time.monotonic() >= deadline: + shutdown_reason = "duration_elapsed" + break + if time.monotonic() >= next_discovery or not tokens: + old_token_ids = [token["token_id"] for token in tokens] + try: + markets, tokens, rejection_counts = refresh_market_state(config, counters, warnings, errors) + except Exception as exc: # noqa: BLE001 - preserve evidence and retry + counters["discovery_failure_count"] += 1 + errors.append({"stage": "load_discovery", "error": f"{type(exc).__name__}: {exc}"}) + time.sleep(min(reconnect_backoff, 30)) + next_discovery = time.monotonic() + float(config["discovery_refresh_interval_seconds"]) + continue + new_token_ids = [token["token_id"] for token in tokens] + if old_token_ids and old_token_ids != new_token_ids: + counters["subscription_change_count"] += 1 + states = {token["token_id"]: states.get(token["token_id"], BookState(token)) for token in 
tokens} + next_discovery = time.monotonic() + float(config["discovery_refresh_interval_seconds"]) + if not tokens: + warnings.append("no active BTC Up/Down tokens available after discovery") + time.sleep(min(reconnect_backoff, 30)) + continue + + token_ids = [token["token_id"] for token in tokens] + subscription = {"assets_ids": token_ids, "type": "market", "custom_feature_enabled": True} + connection_sequence += 1 + session_id = f"{run_id}-ws{connection_sequence}" + sock: ssl.SSLSocket | None = None + try: + sock, handshake = open_websocket(str(config["websocket_url"]), float(config["websocket_timeout_seconds"])) + counters["connection_count"] += 1 + send_ws_frame(sock, 1, json.dumps(subscription, separators=(",", ":")).encode("utf-8")) + session_message_sequence = 0 + session_started_at_utc = iso_z() + reconnect_backoff = float(config["reconnect_backoff_seconds"]) + while not STOP_REQUESTED: + now_monotonic = time.monotonic() + if deadline is not None and now_monotonic >= deadline: + shutdown_reason = "duration_elapsed" + break + if now_monotonic >= next_rest_checkpoint: + checkpoint_sequence += 1 + fetch_rest_checkpoint(config=config, rest_writer=rest_writer, checkpoint_sequence=checkpoint_sequence, tokens=tokens, states=states, counters=counters) + next_rest_checkpoint = now_monotonic + float(config["rest_checkpoint_interval_seconds"]) + if now_monotonic >= next_manifest_write: + write_manifest(config=config, run_id=run_id, started_at_utc=started_at_utc, status="RUNNING", gate_status="IN_PROGRESS", shutdown_reason=None, markets=markets, tokens=tokens, counters=counters, ws_writer=ws_writer, rest_writer=rest_writer, states=states, warnings=warnings, errors=errors) + next_manifest_write = now_monotonic + float(config["manifest_write_interval_seconds"]) + if now_monotonic >= next_discovery: + previous_token_ids = token_ids + markets, tokens, rejection_counts = refresh_market_state(config, counters, warnings, errors) + token_ids = [token["token_id"] for token in 
tokens] + next_discovery = now_monotonic + float(config["discovery_refresh_interval_seconds"]) + if token_ids != previous_token_ids: + counters["subscription_change_count"] += 1 + states = {token["token_id"]: states.get(token["token_id"], BookState(token)) for token in tokens} + raise RuntimeError("tracked token set changed; reconnecting with new subscription") + if last_text_message_monotonic is not None: + silence = now_monotonic - last_text_message_monotonic + if silence > float(config["stale_feed_threshold_seconds"]): + counters["stale_feed_count"] += 1 + raise TimeoutError(f"stale websocket feed for {silence:.3f}s") + try: + opcode, payload = read_ws_frame(sock) + except socket.timeout: + continue + counters["websocket_opcode_counts"][str(opcode)] = counters["websocket_opcode_counts"].get(str(opcode), 0) + 1 + if opcode == 1: + received_at_utc = iso_z() + now_for_gap = time.monotonic() + if last_text_message_monotonic is not None: + gap = round(now_for_gap - last_text_message_monotonic, 3) + previous = counters.get("max_gap_seconds") + counters["max_gap_seconds"] = gap if previous is None else max(previous, gap) + last_text_message_monotonic = now_for_gap + counters["last_message_received_at_utc"] = received_at_utc + session_message_sequence += 1 + global_sequence += 1 + envelope, parsed_json, event_types, parse_ok = build_ws_envelope( + run_id=run_id, + session_id=session_id, + connection_sequence=connection_sequence, + message_sequence=session_message_sequence, + global_message_sequence=global_sequence, + received_at_utc=received_at_utc, + websocket_url=str(config["websocket_url"]), + subscription=subscription, + tokens=tokens, + opcode=opcode, + payload_bytes=payload, + ) + ws_writer.write(envelope) + counters["websocket_message_count"] += 1 + if parse_ok: + counters["websocket_parsed_json_count"] += 1 + apply_payload_to_books(parsed_json, states, received_at_utc, counters, warnings) + else: + counters["websocket_parse_error_count"] += 1 + elif opcode == 8: 
+ raise EOFError("websocket close frame received") + elif opcode == 9: + send_ws_frame(sock, 10, payload) + elif opcode == 10: + continue + else: + warnings.append(f"ignored websocket opcode {opcode}") + if shutdown_reason: + break + except Exception as exc: # noqa: BLE001 - preserve reconnect evidence + errors.append({"stage": "websocket_session", "connection_sequence": connection_sequence, "error": f"{type(exc).__name__}: {exc}"}) + counters["reconnect_count"] += 1 + if STOP_REQUESTED: + shutdown_reason = STOP_SIGNAL or "stop_requested" + break + time.sleep(min(reconnect_backoff, float(config["max_reconnect_backoff_seconds"]))) + reconnect_backoff = min(reconnect_backoff * 2, float(config["max_reconnect_backoff_seconds"])) + finally: + if sock is not None: + try: + sock.close() + except OSError: + pass + if shutdown_reason: + break + finally: + if STOP_REQUESTED and shutdown_reason is None: + shutdown_reason = STOP_SIGNAL or "stop_requested" + ws_writer.close() + rest_writer.close() + + if shutdown_reason is None: + shutdown_reason = "loop_exited" + if counters["websocket_message_count"] > 0 and counters["rest_success_count"] > 0 and ws_writer.closed_files and rest_writer.closed_files: + gate_status = "PASS" + else: + gate_status = "BLOCKED_RUNTIME_EVIDENCE" + status = "INTERRUPTED" if STOP_SIGNAL else ("COMPLETED_BOUNDED" if config["duration_seconds"] else "STOPPED") + config["command"] = command + manifest = write_manifest(config=config, run_id=run_id, started_at_utc=started_at_utc, status=status, gate_status=gate_status, shutdown_reason=shutdown_reason, markets=markets, tokens=tokens, counters=counters, ws_writer=ws_writer, rest_writer=rest_writer, states=states, warnings=warnings, errors=errors) + return manifest + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Long-running Polymarket BTC websocket raw recorder with REST checkpoints.") + parser.add_argument("--config", type=Path, default=DEFAULT_CONFIG_PATH) + 
parser.add_argument("--duration-seconds", type=float, default=None, help="Bounded run duration for smoke tests. Default is continuous runtime.") + parser.add_argument("--market-limit", type=int, default=None, help="Limit active BTC markets for smoke tests. Use 0 for all.") + parser.add_argument("--raw-output-root", type=Path, default=None) + parser.add_argument("--manifest-dir", type=Path, default=None) + parser.add_argument("--manifest-path", type=Path, default=None) + parser.add_argument("--discovery-path", type=Path, default=None) + parser.add_argument("--discovery-dir", type=Path, default=None) + return parser.parse_args() + + +def main() -> int: + args = parse_args() + config = build_config(args) + command = " ".join([Path(sys.argv[0]).as_posix(), *sys.argv[1:]]) + manifest = run_recorder(config, command) + print(f"WS_RECORDER_MANIFEST={config['manifest_path']}") + print(f"WS_RECORDER_RUN_ID={manifest['run_id']}") + print(f"WS_RECORDER_GATE={manifest['gate_status']}") + return 0 if manifest["gate_status"] == "PASS" else 1 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/scripts/deploy/bootstrap_orderbooks_k8s.sh b/scripts/deploy/bootstrap_orderbooks_k8s.sh index fa95571..8c0810f 100755 --- a/scripts/deploy/bootstrap_orderbooks_k8s.sh +++ b/scripts/deploy/bootstrap_orderbooks_k8s.sh @@ -10,7 +10,7 @@ CI_KUBECONFIG_PATH="${CI_KUBECONFIG_PATH:-$PLATFORM_REPO_DIR/.state/hetzner/kube PROJECT_NAME="${PROJECT_NAME:-orderbooks}" PROJECT_NAMESPACE="${PROJECT_NAMESPACE:-orderbooks}" -PROJECT_DEPLOYMENTS="${PROJECT_DEPLOYMENTS:-orderbooks-collector}" +PROJECT_DEPLOYMENTS="${PROJECT_DEPLOYMENTS:-orderbooks-collector,orderbooks-ws-recorder}" PROJECT_REGISTRY_SECRET_NAME="${PROJECT_REGISTRY_SECRET_NAME:-orderbooks-registry-creds}" RCLONE_SECRET_NAME="${RCLONE_SECRET_NAME:-orderbooks-rclone-config}" RCLONE_SECRET_KEY="${RCLONE_SECRET_KEY:-rclone.conf}" @@ -65,7 +65,7 @@ load_env_defaults "$PLATFORM_RESOLVED_ENV_FILE" # env file may describe the 
platform repo itself, not this app repo. PROJECT_NAME="${ORDERBOOKS_PROJECT_NAME:-orderbooks}" PROJECT_NAMESPACE="${ORDERBOOKS_PROJECT_NAMESPACE:-orderbooks}" -PROJECT_DEPLOYMENTS="${ORDERBOOKS_PROJECT_DEPLOYMENTS:-orderbooks-collector}" +PROJECT_DEPLOYMENTS="${ORDERBOOKS_PROJECT_DEPLOYMENTS:-orderbooks-collector,orderbooks-ws-recorder}" PROJECT_REGISTRY_SECRET_NAME="${ORDERBOOKS_PROJECT_REGISTRY_SECRET_NAME:-orderbooks-registry-creds}" RCLONE_SECRET_NAME="${ORDERBOOKS_RCLONE_SECRET_NAME:-orderbooks-rclone-config}" RCLONE_SECRET_KEY="${ORDERBOOKS_RCLONE_SECRET_KEY:-rclone.conf}" diff --git a/scripts/deploy/deploy_ws_canary_kaniko.sh b/scripts/deploy/deploy_ws_canary_kaniko.sh new file mode 100755 index 0000000..80751ff --- /dev/null +++ b/scripts/deploy/deploy_ws_canary_kaniko.sh @@ -0,0 +1,218 @@ +#!/usr/bin/env bash +set -euo pipefail + +ROOT_DIR="$(cd "$(dirname "$0")/../.." && pwd)" +KUBECONFIG_PATH="${KUBECONFIG_PATH:-/home/philipp/dev/ae/nuri/unrip3/.state/hetzner/kubeconfig.yaml}" +NAMESPACE="${PROJECT_NAMESPACE:-orderbooks}" +REGISTRY_HOST="${REGISTRY_HOST:-registry.doran.133011.xyz}" +PROJECT_NAME="${PROJECT_NAME:-orderbooks}" +REGISTRY_SECRET_NAME="${PROJECT_REGISTRY_SECRET_NAME:-orderbooks-registry-creds}" +REPO_CLONE_URL="${REPO_CLONE_URL:-https://git.doran.133011.xyz/philipp/orderbooks.git}" +GIT_REF="$(git -C "$ROOT_DIR" rev-parse HEAD)" +IMAGE_TAG="" +OUTPUT_PATH="" +SERVER_DRY_RUN=0 +SKIP_BUILD=0 + +usage() { + cat <<'EOF' +Usage: scripts/deploy/deploy_ws_canary_kaniko.sh [options] + +Canary-only build/deploy path for orderbooks-ws-recorder. It does not apply +or roll deployment-collector.yaml and does not set the orderbooks-collector +image. + +Options: + --git-ref SHA Committed Git SHA to build. Default: local HEAD. + --image-tag TAG Image tag. Default: ws-canary--. + --output PATH Local deploy evidence JSON path. + --server-dry-run Do not build. Server-dry-run only the canary apply set. 
+ --skip-build Skip Kaniko build and use REGISTRY_HOST/PROJECT_NAME:TAG. + --help Show help. +EOF +} + +while [[ $# -gt 0 ]]; do + case "$1" in + --git-ref) GIT_REF="$2"; shift 2 ;; + --image-tag) IMAGE_TAG="$2"; shift 2 ;; + --output) OUTPUT_PATH="$2"; shift 2 ;; + --server-dry-run) SERVER_DRY_RUN=1; shift ;; + --skip-build) SKIP_BUILD=1; shift ;; + --help) usage; exit 0 ;; + *) echo "unknown argument: $1" >&2; usage >&2; exit 2 ;; + esac +done + +require() { command -v "$1" >/dev/null 2>&1 || { echo "missing required command: $1" >&2; exit 2; }; } +require kubectl +require python3 +require git + +export KUBECONFIG="${KUBECONFIG:-$KUBECONFIG_PATH}" +[[ -f "$KUBECONFIG" ]] || { echo "missing kubeconfig" >&2; exit 2; } + +short_sha="$(printf '%s' "$GIT_REF" | cut -c1-12 | tr '[:upper:]' '[:lower:]')" +if [[ -z "$IMAGE_TAG" ]]; then + IMAGE_TAG="ws-canary-${short_sha}-$(date -u +%Y%m%dT%H%M%SZ | tr '[:upper:]' '[:lower:]')" +fi +IMAGE="${REGISTRY_HOST}/${PROJECT_NAME}:${IMAGE_TAG}" +RUN_ID="$(date -u +%Y%m%dT%H%M%SZ)" +if [[ -z "$OUTPUT_PATH" ]]; then + OUTPUT_PATH="${ROOT_DIR}/data/manifests/ws_canary_deploy_${RUN_ID}.json" +fi +mkdir -p "$(dirname "$OUTPUT_PATH")" +TMPDIR="$(mktemp -d)" +trap 'rm -rf "$TMPDIR"' EXIT + +REST_IMAGE_BEFORE="$(kubectl -n "$NAMESPACE" get deployment orderbooks-collector -o jsonpath='{.spec.template.spec.containers[0].image}' 2>/dev/null || true)" +REST_READY_BEFORE="$(kubectl -n "$NAMESPACE" get deployment orderbooks-collector -o jsonpath='{.status.readyReplicas}/{.spec.replicas}' 2>/dev/null || true)" + +render_canary() { + python3 - "$ROOT_DIR" "$IMAGE" <<'PY_RENDER' +import sys +from pathlib import Path +root=Path(sys.argv[1]) +image=sys.argv[2] +files=[ + 'deploy/k8s/base/namespace.yaml', + 'deploy/k8s/base/configmap.yaml', + 'deploy/k8s/base/pvc.yaml', + 'deploy/k8s/base/cronjob-uploader.yaml', + 'deploy/k8s/base/deployment-ws-recorder.yaml', +] +for index, rel in enumerate(files): + if index: + print('---') + 
text=(root/rel).read_text() + text=text.replace('registry.doran.133011.xyz/orderbooks:bootstrap', image) + print(text.rstrip()) +PY_RENDER +} + +if [[ "$SERVER_DRY_RUN" -eq 1 ]]; then + render_canary | kubectl apply --dry-run=server -f - + cat >"$OUTPUT_PATH" </dev/null 2>&1; then + # ls-remote by raw SHA may not match refs; accept if the commit is reachable from main. + remote_main="$(GIT_TERMINAL_PROMPT=0 git ls-remote "$REPO_CLONE_URL" refs/heads/main | awk '{print $1}')" + if [[ "$remote_main" != "$GIT_REF" ]]; then + echo "git ref is not confirmed on Forgejo main; push the source commit first" >&2 + exit 3 + fi +fi + +BUILD_JOB="orderbooks-ws-build-${short_sha}" +BUILD_JOB="$(printf '%s' "$BUILD_JOB" | tr -cs 'a-z0-9-' '-' | sed 's/^-//;s/-$//' | cut -c1-63)" + +if [[ "$SKIP_BUILD" -eq 0 ]]; then + kubectl -n "$NAMESPACE" delete job "$BUILD_JOB" --ignore-not-found=true >/dev/null + cat >"$TMPDIR/build-job.yaml" <- + git clone --depth=1 --branch main "${REPO_CLONE_URL}" /workspace && + cd /workspace && + git checkout --detach "${GIT_REF}" + volumeMounts: + - name: workspace + mountPath: /workspace + containers: + - name: kaniko + image: gcr.io/kaniko-project/executor:v1.23.2-debug + args: + - --context=/workspace + - --dockerfile=/workspace/Dockerfile + - --destination=${IMAGE} + - --cache=false + volumeMounts: + - name: workspace + mountPath: /workspace + - name: registry-creds + mountPath: /kaniko/.docker +EOF_JOB + kubectl apply -f "$TMPDIR/build-job.yaml" >/dev/null + kubectl -n "$NAMESPACE" wait --for=condition=Complete --timeout=20m "job/${BUILD_JOB}" >/dev/null +fi +BUILD_LOG_TAIL="$(kubectl -n "$NAMESPACE" logs "job/${BUILD_JOB}" --tail=120 2>/dev/null || true)" + +render_canary | kubectl apply -f - >/dev/null +kubectl -n "$NAMESPACE" rollout status deployment/orderbooks-ws-recorder --timeout=300s >/dev/null +WS_IMAGE_AFTER="$(kubectl -n "$NAMESPACE" get deployment orderbooks-ws-recorder -o jsonpath='{.spec.template.spec.containers[0].image}')" 
+WS_READY_AFTER="$(kubectl -n "$NAMESPACE" get deployment orderbooks-ws-recorder -o jsonpath='{.status.readyReplicas}/{.spec.replicas}')" +REST_IMAGE_AFTER="$(kubectl -n "$NAMESPACE" get deployment orderbooks-collector -o jsonpath='{.spec.template.spec.containers[0].image}')" +REST_READY_AFTER="$(kubectl -n "$NAMESPACE" get deployment orderbooks-collector -o jsonpath='{.status.readyReplicas}/{.spec.replicas}')" + +WRITE_EVIDENCE_PY="$TMPDIR/write-evidence.py" +cat >"$WRITE_EVIDENCE_PY" <<'PY_WRITE' +import datetime as dt, json, sys +from pathlib import Path +(path, run_id, git_ref, image, build_job, ws_image_after, ws_ready_after, rest_image_before, rest_ready_before, rest_image_after, rest_ready_after)=sys.argv[1:12] +manifest={ + 'schema_name':'ws_canary_deploy_evidence', + 'schema_version':1, + 'run_id':run_id, + 'written_at_utc':dt.datetime.now(dt.UTC).replace(microsecond=0).isoformat().replace('+00:00','Z'), + 'mode':'live_canary_deploy', + 'status':'PASS', + 'git_ref':git_ref, + 'image':image, + 'build_job':build_job, + 'build_log_tail':sys.stdin.read()[-6000:], + 'resources_applied':['namespace.yaml','configmap.yaml','pvc.yaml','cronjob-uploader.yaml','deployment-ws-recorder.yaml'], + 'deployment_collector_applied':False, + 'ws_recorder':{'image_after':ws_image_after,'ready_after':ws_ready_after}, + 'rest_collector':{'image_before':rest_image_before,'ready_before':rest_ready_before,'image_after':rest_image_after,'ready_after':rest_ready_after,'unchanged':rest_image_before==rest_image_after and rest_ready_before==rest_ready_after}, +} +Path(path).write_text(json.dumps(manifest, indent=2, sort_keys=True)+'\n') +PY_WRITE +printf '%s' "$BUILD_LOG_TAIL" | python3 "$WRITE_EVIDENCE_PY" "$OUTPUT_PATH" "$RUN_ID" "$GIT_REF" "$IMAGE" "$BUILD_JOB" "$WS_IMAGE_AFTER" "$WS_READY_AFTER" "$REST_IMAGE_BEFORE" "$REST_READY_BEFORE" "$REST_IMAGE_AFTER" "$REST_READY_AFTER" + +echo "WS_CANARY_DEPLOY_EVIDENCE=$OUTPUT_PATH" +echo "WS_CANARY_IMAGE=$IMAGE" +echo 
"WS_CANARY_DEPLOY=PASS" diff --git a/scripts/k8s_ws_runtime_smoke_check.sh b/scripts/k8s_ws_runtime_smoke_check.sh new file mode 100755 index 0000000..4a6a3b1 --- /dev/null +++ b/scripts/k8s_ws_runtime_smoke_check.sh @@ -0,0 +1,421 @@ +#!/usr/bin/env bash +set -euo pipefail + +NAMESPACE="${ORDERBOOKS_K8S_NAMESPACE:-orderbooks}" +WS_DEPLOYMENT="${ORDERBOOKS_WS_DEPLOYMENT:-orderbooks-ws-recorder}" +REST_DEPLOYMENT="${ORDERBOOKS_REST_DEPLOYMENT:-orderbooks-collector}" +UPLOADER_CRONJOB="${ORDERBOOKS_UPLOADER_CRONJOB:-orderbooks-uploader}" +WAIT_SECONDS="${ORDERBOOKS_K8S_WS_SMOKE_WAIT_SECONDS:-900}" +OUTPUT_PATH="" +RAW_DIR="/var/lib/orderbooks/raw_orderbooks" +MANIFEST_DIR="/var/lib/orderbooks/manifests" +UPLOAD_MIN_AGE_SECONDS="600" +SMOKE_UPLOAD_RETENTION_DAYS="3" + +usage() { + cat <<'EOF' +Usage: scripts/k8s_ws_runtime_smoke_check.sh [options] + +Verifies the Kubernetes websocket recorder canary and writes compact local JSON +evidence. The script does not print secret contents. + +Options: + --namespace NAME Namespace. Default: orderbooks. + --deployment NAME Websocket recorder Deployment. Default: orderbooks-ws-recorder. + --rest-deployment NAME Existing REST Deployment. Default: orderbooks-collector. + --cronjob NAME Production uploader CronJob. Default: orderbooks-uploader. + --wait-seconds N Max wait for runtime evidence. Default: 900. + --output PATH Local smoke evidence path. + --raw-dir PATH In-pod raw root. Default: /var/lib/orderbooks/raw_orderbooks. + --manifest-dir PATH In-pod manifest dir. Default: /var/lib/orderbooks/manifests. + --upload-min-age-seconds N Production upload min age to record. Default: 600. + --smoke-upload-retention-days N Retention used by one-off smoke upload Job. Default: 3. + --help Show help. 
# Print the name of one Running pod that belongs to the given Deployment.
#
# Arguments:
#   $1  Deployment name in $NAMESPACE.
# Output:
#   The first Running pod name on stdout, or nothing when the Deployment has
#   no matchLabels or no Running pod (callers treat empty output as "missing").
pod_for_deployment() {
  local deployment="$1"
  local selector
  # kubectl JSONPath has no key/value iteration over maps, so the label
  # selector is rendered with a Go template instead.
  selector="$(kubectl -n "$NAMESPACE" get deployment "$deployment" \
    -o go-template='{{range $k, $v := .spec.selector.matchLabels}}{{$k}}={{$v}},{{end}}')"
  selector="${selector%,}"
  if [[ -z "$selector" ]]; then
    # An empty -l selector would match every pod in the namespace; refuse to
    # guess and let the caller's empty-result check report the failure.
    echo "deployment ${deployment} has no spec.selector.matchLabels" >&2
    return 0
  fi
  kubectl -n "$NAMESPACE" get pod -l "$selector" \
    -o jsonpath='{.items[?(@.status.phase=="Running")].metadata.name}' | awk '{print $1}'
}
jsonpath='{.spec.template.spec.containers[0].image}')" +REST_READY_BEFORE="$(kubectl -n "$NAMESPACE" get deployment "$REST_DEPLOYMENT" -o jsonpath='{.status.readyReplicas}/{.spec.replicas}')" + +kubectl -n "$NAMESPACE" rollout status "deployment/${REST_DEPLOYMENT}" --timeout=120s >/dev/null +kubectl -n "$NAMESPACE" rollout status "deployment/${WS_DEPLOYMENT}" --timeout=300s >/dev/null +REST_POD="$(pod_for_deployment "$REST_DEPLOYMENT")" +WS_POD="$(pod_for_deployment "$WS_DEPLOYMENT")" +if [[ -z "$REST_POD" || -z "$WS_POD" ]]; then + write_blocked "BLOCKED_K8S_RUNTIME_FAILURE" "missing running REST or websocket pod" + exit 1 +fi + +SUMMARY_PY="${TMPDIR}/summary.py" +cat >"${SUMMARY_PY}" <<'PY_SUMMARY' +import gzip, hashlib, json, os +from pathlib import Path +raw_root=Path(os.environ.get('RAW_DIR','/var/lib/orderbooks/raw_orderbooks')) +manifest_dir=Path(os.environ.get('MANIFEST_DIR','/var/lib/orderbooks/manifests')) +check_path=os.environ.get('CHECK_PATH') or '' + +def sha(path): + h=hashlib.sha256() + with path.open('rb') as f: + for chunk in iter(lambda:f.read(1024*1024), b''): + h.update(chunk) + return h.hexdigest() + +def count_gz(path): + rows=0 + first=None + with gzip.open(path,'rt',encoding='utf-8') as f: + for line in f: + if line.strip(): + obj=json.loads(line) + if first is None: + first=obj + rows+=1 + return rows, first + +def summarize_gz(path): + rows, first = count_gz(path) + return {'path':str(path),'rows':rows,'bytes':path.stat().st_size,'sha256':sha(path),'first_schema':first.get('schema_name') if isinstance(first,dict) else None} + +def load_json(path): + return json.loads(path.read_text()) +ws_files=sorted(raw_root.glob('polymarket/ws_raw/**/*.jsonl.gz')) +rest_files=sorted(raw_root.glob('polymarket/rest_checkpoints/**/*.jsonl.gz')) +open_files=sorted([p for p in raw_root.glob('polymarket/**/*') if p.is_file() and (p.name.startswith('.') or p.name.endswith(('.open','.tmp','.partial')))]) 
+recorder_manifests=sorted(manifest_dir.glob('polymarket_ws_recorder_*.json'), key=lambda p: p.stat().st_mtime) +upload_manifests=sorted(manifest_dir.glob('upload_archive_*.json'), key=lambda p: p.stat().st_mtime) +latest_manifest=load_json(recorder_manifests[-1]) if recorder_manifests else None +latest_upload=load_json(upload_manifests[-1]) if upload_manifests else None +result={ + 'ws_file_count':len(ws_files), + 'rest_file_count':len(rest_files), + 'open_or_temp_files':[str(p) for p in open_files[:20]], + 'open_or_temp_file_count':len(open_files), + 'recorder_manifest_count':len(recorder_manifests), + 'upload_manifest_count':len(upload_manifests), + 'latest_manifest_path':str(recorder_manifests[-1]) if recorder_manifests else None, + 'latest_upload_manifest_path':str(upload_manifests[-1]) if upload_manifests else None, + 'latest_manifest':latest_manifest, + 'latest_upload_manifest':latest_upload, +} +if ws_files: + result['latest_ws']=summarize_gz(ws_files[-1]) +if rest_files: + result['latest_rest']=summarize_gz(rest_files[-1]) +if check_path: + p=Path(check_path) + result['specific_file']={'path':check_path,'exists':p.exists()} + if p.exists(): + result['specific_file'].update(summarize_gz(p)) +print(json.dumps(result, sort_keys=True)) +PY_SUMMARY + +summarize_pod() { + local pod="$1" + local check_path="${2:-}" + kubectl -n "$NAMESPACE" exec "$pod" -- env RAW_DIR="$RAW_DIR" MANIFEST_DIR="$MANIFEST_DIR" CHECK_PATH="$check_path" python3 -c "$(cat "$SUMMARY_PY")" +} + +WAIT_PY="${TMPDIR}/wait_condition.py" +cat >"${WAIT_PY}" <<'PY_WAIT' +import json, sys +mode=sys.argv[1] +old_run=sys.argv[2] if len(sys.argv) > 2 else '' +o=json.loads(sys.stdin.read()) +manifest=o.get('latest_manifest') or {} +counters=manifest.get('counters') or {} +if mode == 'initial': + if counters.get('websocket_message_count',0) > 0 and counters.get('rest_success_count',0) > 0: + raise SystemExit(0) +elif mode == 'post_restart': + if manifest.get('run_id') and manifest.get('run_id') != 
old_run and counters.get('websocket_message_count',0) > 0: + raise SystemExit(0) +raise SystemExit(1) +PY_WAIT + +initial_json="" +end=$((SECONDS + WAIT_SECONDS)) +while [[ $SECONDS -lt $end ]]; do + initial_json="$(summarize_pod "$WS_POD")" + if python3 "$WAIT_PY" initial "" <<<"$initial_json"; then + break + fi + sleep 15 +done +if ! python3 "$WAIT_PY" initial "" <<<"$initial_json"; then + write_blocked "BLOCKED_WS_RECORDER_RUNTIME" "websocket recorder did not expose initial websocket and REST counters before timeout" + exit 1 +fi + +old_run_id="$(python3 -c 'import json,sys; o=json.loads(sys.stdin.read()); print((o.get("latest_manifest") or {}).get("run_id") or "")' <<<"$initial_json")" +old_ws_path="$(python3 -c 'import json,sys; o=json.loads(sys.stdin.read()); print((o.get("latest_ws") or {}).get("path") or "")' <<<"$initial_json")" +old_ws_sha="$(python3 -c 'import json,sys; o=json.loads(sys.stdin.read()); print((o.get("latest_ws") or {}).get("sha256") or "")' <<<"$initial_json")" +old_ws_rows="$(python3 -c 'import json,sys; o=json.loads(sys.stdin.read()); print((o.get("latest_ws") or {}).get("rows") or 0)' <<<"$initial_json")" + +kubectl -n "$NAMESPACE" delete pod "$WS_POD" --wait=true >/dev/null +kubectl -n "$NAMESPACE" rollout status "deployment/${WS_DEPLOYMENT}" --timeout=300s >/dev/null +NEW_WS_POD="$(pod_for_deployment "$WS_DEPLOYMENT")" +if [[ -z "$NEW_WS_POD" ]]; then + write_blocked "BLOCKED_WS_RECORDER_RUNTIME" "websocket pod did not return after restart" + exit 1 +fi + +restart_json="$(summarize_pod "$NEW_WS_POD" "$old_ws_path")" +post_json="" +end=$((SECONDS + WAIT_SECONDS)) +while [[ $SECONDS -lt $end ]]; do + post_json="$(summarize_pod "$NEW_WS_POD" "$old_ws_path")" + if python3 "$WAIT_PY" post_restart "$old_run_id" <<<"$post_json"; then + break + fi + sleep 15 +done +if ! 
python3 "$WAIT_PY" post_restart "$old_run_id" <<<"$post_json"; then + write_blocked "BLOCKED_WS_RECORDER_RUNTIME" "new websocket pod did not write post-restart manifest evidence before timeout" + exit 1 +fi + +UPLOADER_IMAGE="$(kubectl -n "$NAMESPACE" get cronjob "$UPLOADER_CRONJOB" -o jsonpath='{.spec.jobTemplate.spec.template.spec.containers[0].image}')" +JOB_NAME="orderbooks-ws-smoke-upload-${RUN_ID,,}" +JOB_NAME="${JOB_NAME//_/-}" +cat >"${TMPDIR}/upload-job.yaml" </dev/null +kubectl -n "$NAMESPACE" wait --for=condition=Complete --timeout=900s "job/${JOB_NAME}" >/dev/null || true +JOB_STATUS="$(kubectl -n "$NAMESPACE" get job "$JOB_NAME" -o jsonpath='{.status.conditions[-1:].type}' 2>/dev/null || true)" +JOB_LOG_TAIL="$(kubectl -n "$NAMESPACE" logs "job/${JOB_NAME}" --tail=80 2>/dev/null || true)" +upload_json="$(summarize_pod "$NEW_WS_POD" "$old_ws_path")" + +REST_IMAGE_AFTER="$(kubectl -n "$NAMESPACE" get deployment "$REST_DEPLOYMENT" -o jsonpath='{.spec.template.spec.containers[0].image}')" +REST_READY_AFTER="$(kubectl -n "$NAMESPACE" get deployment "$REST_DEPLOYMENT" -o jsonpath='{.status.readyReplicas}/{.spec.replicas}')" + +WRITE_PY="${TMPDIR}/write_evidence.py" +cat >"${WRITE_PY}" <<'PY_WRITE' +import datetime as dt, json, sys +from pathlib import Path +(output_path, namespace, ws_deployment, rest_deployment, uploader_cronjob, + rest_image_before, rest_ready_before, rest_image_after, rest_ready_after, + ws_pod_before, ws_pod_after, old_ws_path, old_ws_sha, old_ws_rows, + job_name, job_status, production_min_age, smoke_retention_days, uploader_image) = sys.argv[1:20] +text=sys.stdin.read() +parts=text.split('\n---PART---\n') +initial=json.loads(parts[0]) +restart=json.loads(parts[1]) +post=json.loads(parts[2]) +upload=json.loads(parts[3]) +job_log_tail=parts[4] +reasons=[] +old_ws_rows=int(old_ws_rows or 0) +old_file_preexisting=bool(old_ws_path) +if old_file_preexisting: + specific=(post.get('specific_file') or {}) + if not specific.get('exists') or 
specific.get('sha256') != old_ws_sha or int(specific.get('rows') or 0) != old_ws_rows: + reasons.append('pre-existing closed websocket file changed or failed parse after restart') +else: + latest_after=(restart.get('latest_ws') or post.get('latest_ws') or {}) + if int(latest_after.get('rows') or 0) <= 0: + reasons.append('SIGTERM did not produce a parseable closed websocket archive') +post_manifest=post.get('latest_manifest') or {} +initial_manifest=initial.get('latest_manifest') or {} +post_counters=post_manifest.get('counters') or {} +if not post_manifest.get('run_id') or post_manifest.get('run_id') == initial_manifest.get('run_id'): + reasons.append('post-restart recorder manifest did not come from a new run') +if int(post_counters.get('websocket_message_count') or 0) <= 0: + reasons.append('post-restart recorder did not write websocket message evidence') +if not upload.get('latest_ws') or int((upload.get('latest_ws') or {}).get('rows') or 0) <= 0: + reasons.append('websocket gzip evidence missing or empty') +if not upload.get('latest_rest') or int((upload.get('latest_rest') or {}).get('rows') or 0) <= 0: + reasons.append('REST checkpoint gzip evidence missing or empty') +if rest_image_before != rest_image_after: + reasons.append('REST collector image changed') +if rest_ready_after != rest_ready_before: + reasons.append('REST collector readiness changed') +if job_status != 'Complete': + reasons.append('smoke uploader job did not complete') +upload_manifest=upload.get('latest_upload_manifest') or {} +verified_files=upload_manifest.get('verified_files') or [] +skipped_files=upload_manifest.get('skipped_files') or [] +deleted_files=upload_manifest.get('deleted_local_files') or [] +retained_files=upload_manifest.get('retained_local_files') or [] +verified_paths={item.get('relative_path') for item in verified_files} +verified_ws_or_rest=[item for item in verified_files if 'polymarket/ws_raw/' in str(item.get('relative_path')) or 'polymarket/rest_checkpoints/' in 
str(item.get('relative_path'))] +open_count=max(initial.get('open_or_temp_file_count') or 0, post.get('open_or_temp_file_count') or 0, upload.get('open_or_temp_file_count') or 0) +skipped_open=[item for item in skipped_files if item.get('reason') == 'open_or_temporary_file'] +unsafe_deletes=[item for item in deleted_files if item.get('relative_path') not in verified_paths] +if upload_manifest.get('gate_status') != 'PASS' or upload_manifest.get('operation_status') != 'UPLOAD_VERIFIED': + reasons.append('upload manifest did not prove verified upload') +if int((upload_manifest.get('counts') or {}).get('verified') or 0) <= 0: + reasons.append('upload manifest verified count was zero') +if not verified_ws_or_rest: + reasons.append('upload manifest did not verify websocket recorder raw/checkpoint files') +if open_count > 0 and not skipped_open: + reasons.append('open/temp files existed but were not recorded as skipped') +if unsafe_deletes: + reasons.append('cleanup deleted files not present in verified set') +gate='WS_RECORDER_K8S_SMOKE_PASS' if not reasons else ('BLOCKED_WS_RECORDER_UPLOAD_OR_RETENTION' if any('upload' in r or 'cleanup' in r or 'open/temp' in r for r in reasons) else 'BLOCKED_WS_RECORDER_RUNTIME') +manifest={ + 'schema_name':'k8s_ws_runtime_smoke', + 'schema_version':1, + 'written_at_utc':dt.datetime.now(dt.UTC).replace(microsecond=0).isoformat().replace('+00:00','Z'), + 'gate_status':gate, + 'namespace':namespace, + 'deployments':{'ws':ws_deployment,'rest':rest_deployment}, + 'uploader_cronjob':uploader_cronjob, + 'uploader_image':uploader_image, + 'pods':{'ws_before':ws_pod_before,'ws_after':ws_pod_after}, + 'rest_collector':{'image_before':rest_image_before,'ready_before':rest_ready_before,'image_after':rest_image_after,'ready_after':rest_ready_after,'unchanged':rest_image_before == rest_image_after and rest_ready_before == rest_ready_after}, + 
'restart_check':{'old_file_preexisting':old_file_preexisting,'old_ws_file':{'path':old_ws_path,'sha256':old_ws_sha,'rows':old_ws_rows},'restart_summary':restart.get('specific_file')}, + 'initial':initial, + 'post_restart':post, + 'upload':upload, + 'uploader_job':{'name':job_name,'status':job_status,'log_tail':job_log_tail[-4000:],'production_min_age_seconds':int(production_min_age),'smoke_min_age_seconds':0,'smoke_retention_days':int(smoke_retention_days)}, + 'upload_manifest_summary':{'path':upload.get('latest_upload_manifest_path'),'gate_status':upload_manifest.get('gate_status'),'operation_status':upload_manifest.get('operation_status'),'counts':upload_manifest.get('counts'),'verified_ws_or_rest_count':len(verified_ws_or_rest),'skipped_open_or_temp_count':len(skipped_open),'deleted_count':len(deleted_files),'retained_count':len(retained_files),'unsafe_delete_count':len(unsafe_deletes)}, + 'reasons':reasons, + 'production_ready':False, +} +path=Path(output_path) +path.write_text(json.dumps(manifest, indent=2, sort_keys=True)+'\n') +print(json.dumps({'gate_status':gate,'evidence_path':str(path),'reasons':reasons}, indent=2, sort_keys=True)) +raise SystemExit(0 if gate == 'WS_RECORDER_K8S_SMOKE_PASS' else 1) +PY_WRITE +printf '%s\n---PART---\n%s\n---PART---\n%s\n---PART---\n%s\n---PART---\n%s' "$initial_json" "$restart_json" "$post_json" "$upload_json" "$JOB_LOG_TAIL" | python3 "$WRITE_PY" "$OUTPUT_PATH" "$NAMESPACE" "$WS_DEPLOYMENT" "$REST_DEPLOYMENT" "$UPLOADER_CRONJOB" "$REST_IMAGE_BEFORE" "$REST_READY_BEFORE" "$REST_IMAGE_AFTER" "$REST_READY_AFTER" "$WS_POD" "$NEW_WS_POD" "$old_ws_path" "$old_ws_sha" "$old_ws_rows" "$JOB_NAME" "$JOB_STATUS" "$UPLOAD_MIN_AGE_SECONDS" "$SMOKE_UPLOAD_RETENTION_DAYS" "$UPLOADER_IMAGE" diff --git a/scripts/reconstruct_polymarket_ws_books.py b/scripts/reconstruct_polymarket_ws_books.py new file mode 100755 index 0000000..efd3a8d --- /dev/null +++ b/scripts/reconstruct_polymarket_ws_books.py @@ -0,0 +1,685 @@ +#!/usr/bin/env python3 
+"""Offline Polymarket websocket book reconstruction and REST comparison. + +Checkpoint 10C scope: read raw 10B sample files, derive local per-token order +book state, and compare against REST /books checkpoints. Raw files remain the +source of truth and are not modified. +""" + +from __future__ import annotations + +import argparse +import datetime as dt +import gzip +import hashlib +import json +from copy import deepcopy +from decimal import Decimal, InvalidOperation +from pathlib import Path +from typing import Any + + +RECONSTRUCTOR_NAME = "polymarket_ws_book_reconstructor" +RECONSTRUCTOR_VERSION = "0.1.1" +DEFAULT_INPUT_MANIFEST = Path("data/manifests/checkpoint_010b_ws_raw_sample.json") +DEFAULT_MANIFEST_PATH = Path("data/manifests/checkpoint_010c_book_reconstruction_sample.json") +DEFAULT_REPORT_PATH = Path("reports/checkpoints/checkpoint_010c_book_reconstruction_sample.md") +DEFAULT_SCHEMA_DOC = Path("docs/POLYMARKET_WEBSOCKET_SCHEMA.md") +DEFAULT_RECON_DOC = Path("docs/BOOK_RECONSTRUCTION.md") +DEFAULT_OUTPUT_ROOT = Path("data/reconstruction_sample") + + +class BookState: + def __init__(self, token_meta: dict[str, Any]) -> None: + self.token_meta = token_meta + self.bids: dict[str, Decimal] = {} + self.asks: dict[str, Decimal] = {} + self.initialized = False + self.messages_applied = 0 + self.messages_skipped = 0 + self.unknown_messages = 0 + self.last_update_received_at_utc: str | None = None + self.book_message_count = 0 + self.price_change_count = 0 + self.best_bid_ask_count = 0 + self.last_trade_price_count = 0 + self.warnings: list[str] = [] + + def clone_summary(self, top_n: int) -> dict[str, Any]: + bids = sorted(self.bids.items(), key=lambda item: Decimal(item[0]), reverse=True) + asks = sorted(self.asks.items(), key=lambda item: Decimal(item[0])) + best_bid = bids[0][0] if bids else None + best_ask = asks[0][0] if asks else None + spread = dec_to_str(Decimal(best_ask) - Decimal(best_bid)) if best_bid and best_ask else None + return { + "token": 
self.token_meta, + "initialized": self.initialized, + "messages_applied": self.messages_applied, + "messages_skipped": self.messages_skipped, + "unknown_messages": self.unknown_messages, + "last_update_received_at_utc": self.last_update_received_at_utc, + "state_quality": self.state_quality(), + "bid_level_count": len(self.bids), + "ask_level_count": len(self.asks), + "best_bid": best_bid, + "best_ask": best_ask, + "spread": spread, + "top_bids": [{"price": price, "size": dec_to_str(size)} for price, size in bids[:top_n]], + "top_asks": [{"price": price, "size": dec_to_str(size)} for price, size in asks[:top_n]], + "event_counters": { + "book": self.book_message_count, + "price_change": self.price_change_count, + "best_bid_ask": self.best_bid_ask_count, + "last_trade_price": self.last_trade_price_count, + }, + "warnings": self.warnings, + } + + def state_quality(self) -> str: + if not self.initialized: + return "insufficient_events" + if self.price_change_count > 0: + return "initialized_and_updated" + return "snapshot_only" + + +def utc_now() -> dt.datetime: + return dt.datetime.now(dt.UTC) + + +def iso_z(value: dt.datetime | None = None) -> str: + value = value or utc_now() + return value.astimezone(dt.UTC).replace(microsecond=0).isoformat().replace("+00:00", "Z") + + +def parse_iso(value: str | None) -> dt.datetime | None: + if not value: + return None + text = value[:-1] + "+00:00" if value.endswith("Z") else value + try: + parsed = dt.datetime.fromisoformat(text) + except ValueError: + return None + if parsed.tzinfo is None: + parsed = parsed.replace(tzinfo=dt.UTC) + return parsed.astimezone(dt.UTC) + + +def sha256_file(path: Path) -> str: + digest = hashlib.sha256() + with path.open("rb") as handle: + for chunk in iter(lambda: handle.read(1024 * 1024), b""): + digest.update(chunk) + return digest.hexdigest() + + +def dec(value: Any) -> Decimal: + if value is None: + return Decimal("0") + try: + return Decimal(str(value)) + except InvalidOperation: + return 
def _parse_decimal(value: Any) -> Decimal | None:
    """Return ``value`` as a finite Decimal, or None when absent or unusable."""
    if value is None:
        return None
    try:
        parsed = Decimal(str(value))
    except InvalidOperation:
        return None
    # "NaN" and "Infinity" parse without error but cannot be ordered or summed.
    return parsed if parsed.is_finite() else None


def dec(value: Any) -> Decimal:
    """Convert ``value`` to a Decimal, mapping missing or invalid input to zero."""
    parsed = _parse_decimal(value)
    return Decimal("0") if parsed is None else parsed


def dec_to_str(value: Decimal) -> str:
    """Render ``value`` in plain notation without trailing fractional zeros."""
    if value.is_zero():
        # Covers negative zero, which would otherwise render as "-0".
        return "0"
    text = format(value, "f")
    if "." in text:
        text = text.rstrip("0").rstrip(".")
    return text or "0"


def level_map(levels: Any) -> dict[str, Decimal]:
    """Build a ``{canonical price: size}`` map from a list of level dicts.

    Non-list input yields an empty map. Entries that are not dicts, or whose
    price is missing or not a finite number, are skipped so they cannot show
    up as a phantom level at price "0". An unusable size is stored as zero.
    """
    result: dict[str, Decimal] = {}
    if not isinstance(levels, list):
        return result
    for item in levels:
        if not isinstance(item, dict):
            continue
        price = _parse_decimal(item.get("price"))
        if price is None:
            continue
        result[dec_to_str(price)] = dec(item.get("size"))
    return result


def classify_item(item: Any) -> str:
    """Return the event type of a websocket payload item.

    Dicts with an ``event_type`` use it verbatim; dicts that carry the full
    set of book fields without one are labelled ``book_without_event_type``;
    other dicts are ``unknown_object``. Non-dicts report their type name.
    """
    if not isinstance(item, dict):
        return type(item).__name__
    event_type = item.get("event_type")
    if event_type:
        return str(event_type)
    if {"market", "asset_id", "bids", "asks", "timestamp"}.issubset(item.keys()):
        return "book_without_event_type"
    return "unknown_object"


def raw_event_items(row: dict[str, Any]) -> list[dict[str, Any]]:
    """Return the dict items of a raw row's ``json`` payload (single or list)."""
    payload = row.get("json")
    items = payload if isinstance(payload, list) else [payload]
    return [item for item in items if isinstance(item, dict)]


def apply_book_item(state: BookState, item: dict[str, Any], received_at_utc: str) -> None:
    """Replace both sides of ``state`` with the snapshot in ``item``."""
    state.bids = level_map(item.get("bids"))
    state.asks = level_map(item.get("asks"))
    state.initialized = True
    state.messages_applied += 1
    state.book_message_count += 1
    state.last_update_received_at_utc = received_at_utc
def apply_price_change(state_by_token: dict[str, BookState], item: dict[str, Any], received_at_utc: str, warnings: list[str]) -> None:
    """Apply every entry of a ``price_change`` event to the tracked books.

    An event without a ``price_changes`` list only records a warning. Entries
    for untracked tokens are ignored; entries for books that have not seen a
    snapshot yet, or with a side other than BUY/SELL, are counted as skipped
    on that book. A zero size removes the level, any other size replaces it.
    """
    entries = item.get("price_changes")
    if not isinstance(entries, list):
        warnings.append("price_change event without price_changes list")
        return
    for entry in (candidate for candidate in entries if isinstance(candidate, dict)):
        book = state_by_token.get(str(entry.get("asset_id") or ""))
        if book is None:
            continue
        if not book.initialized:
            book.messages_skipped += 1
            book.warnings.append("price_change skipped before initial book snapshot")
            continue
        direction = str(entry.get("side") or "").upper()
        # BUY entries edit the bid side, SELL entries the ask side.
        levels = {"BUY": book.bids, "SELL": book.asks}.get(direction)
        if levels is None:
            book.messages_skipped += 1
            book.warnings.append(f"unsupported price_change side {direction!r}")
            continue
        level_key = dec_to_str(dec(entry.get("price")))
        quantity = dec(entry.get("size"))
        if quantity == 0:
            levels.pop(level_key, None)
        else:
            levels[level_key] = quantity
        book.messages_applied += 1
        book.price_change_count += 1
        book.last_update_received_at_utc = received_at_utc
def apply_ws_row(
    row: dict[str, Any],
    state_by_token: dict[str, BookState],
    event_type_counts: dict[str, int],
    unsupported_counts: dict[str, int],
    warnings: list[str],
) -> None:
    """Route every event in one raw websocket row to the matching handler.

    Snapshots and price changes mutate the tracked books. Best-bid/ask and
    last-trade events are only counted on their token. Any other event type
    is counted as unsupported and bumps ``unknown_messages`` on every book.
    """

    def bump(tally: dict[str, int], key: str) -> None:
        tally[key] = tally.get(key, 0) + 1

    # Event types that are counted per token but never change the book.
    passive_counters = {
        "best_bid_ask": "best_bid_ask_count",
        "last_trade_price": "last_trade_price_count",
    }
    stamp = row.get("received_at_utc")
    for event in raw_event_items(row):
        kind = classify_item(event)
        bump(event_type_counts, kind)
        if kind in ("book", "book_without_event_type"):
            tracked = state_by_token.get(str(event.get("asset_id") or ""))
            if tracked is None:
                bump(unsupported_counts, "book_for_untracked_token")
            else:
                apply_book_item(tracked, event, stamp)
            continue
        if kind == "price_change":
            apply_price_change(state_by_token, event, stamp, warnings)
            continue
        counter_name = passive_counters.get(kind)
        if counter_name is None:
            for tracked in state_by_token.values():
                tracked.unknown_messages += 1
        else:
            tracked = state_by_token.get(str(event.get("asset_id") or ""))
            if tracked:
                setattr(tracked, counter_name, getattr(tracked, counter_name) + 1)
                tracked.messages_skipped += 1
        bump(unsupported_counts, kind)


def top_levels(book: dict[str, Decimal], side: str, top_n: int) -> list[tuple[str, Decimal]]:
    """Return the best ``top_n`` levels: highest prices for bids, lowest otherwise."""
    ordered = list(book.items())
    ordered.sort(key=lambda level: Decimal(level[0]), reverse=(side == "bids"))
    return ordered[:top_n]


def rest_book_from_item(item: dict[str, Any]) -> dict[str, Any]:
    """Convert a REST book payload into ``{"bids": ..., "asks": ...}`` level maps."""
    return {
        "bids": level_map(item.get("bids")),
        "asks": level_map(item.get("asks")),
    }


def book_summary_from_maps(bids: dict[str, Decimal], asks: dict[str, Decimal], top_n: int) -> dict[str, Any]:
    """Summarize best prices, spread, level counts and top levels of a book."""

    def render(levels: list[tuple[str, Decimal]]) -> list[dict[str, str]]:
        return [{"price": price, "size": dec_to_str(size)} for price, size in levels]

    leading_bids = top_levels(bids, "bids", top_n)
    leading_asks = top_levels(asks, "asks", top_n)
    best_bid = leading_bids[0][0] if leading_bids else None
    best_ask = leading_asks[0][0] if leading_asks else None
    if best_bid and best_ask:
        spread = dec_to_str(Decimal(best_ask) - Decimal(best_bid))
    else:
        spread = None
    return {
        "best_bid": best_bid,
        "best_ask": best_ask,
        "spread": spread,
        "bid_level_count": len(bids),
        "ask_level_count": len(asks),
        "top_bids": render(leading_bids),
        "top_asks": render(leading_asks),
    }
def compare_side(local: dict[str, Decimal], rest: dict[str, Decimal], side: str, top_n: int) -> dict[str, Any]:
    """Diff the top ``top_n`` levels of one book side, local versus REST.

    Returns the prices only REST has (``missing_prices``), the prices only the
    local book has (``extra_prices``) and, for shared prices, every non-zero
    size difference. Lists are ordered best price first for the given side.
    """
    ours = dict(top_levels(local, side, top_n))
    theirs = dict(top_levels(rest, side, top_n))
    descending = side == "bids"

    def ordered(prices: set[str]) -> list[str]:
        return sorted(prices, key=Decimal, reverse=descending)

    size_deltas = [
        {
            "price": price,
            "local_size": dec_to_str(ours[price]),
            "rest_size": dec_to_str(theirs[price]),
            "delta": dec_to_str(ours[price] - theirs[price]),
        }
        for price in ordered(set(ours) & set(theirs))
        if ours[price] - theirs[price] != 0
    ]
    return {
        "missing_prices": ordered(set(theirs) - set(ours)),
        "extra_prices": ordered(set(ours) - set(theirs)),
        "size_deltas": size_deltas,
    }


def compare_books(state: BookState, rest_item: dict[str, Any], top_n: int) -> dict[str, Any]:
    """Compare a reconstructed book with one REST book payload.

    The status is ``match`` only when best bid and ask agree and neither side
    shows missing prices, extra prices or size deltas within the top levels.
    """
    rest_maps = rest_book_from_item(rest_item)
    local_summary = book_summary_from_maps(state.bids, state.asks, top_n)
    rest_summary = book_summary_from_maps(rest_maps["bids"], rest_maps["asks"], top_n)
    side_diffs = {
        "bids": compare_side(state.bids, rest_maps["bids"], "bids", top_n),
        "asks": compare_side(state.asks, rest_maps["asks"], "asks", top_n),
    }

    def same(field: str) -> bool:
        return local_summary[field] == rest_summary[field]

    best_bid_match = same("best_bid")
    best_ask_match = same("best_ask")
    top_levels_clean = not any(
        diff[part]
        for diff in side_diffs.values()
        for part in ("missing_prices", "extra_prices", "size_deltas")
    )
    is_match = best_bid_match and best_ask_match and top_levels_clean
    return {
        "comparison_status": "match" if is_match else "divergent",
        "best_bid_match": best_bid_match,
        "best_ask_match": best_ask_match,
        "spread_match": same("spread"),
        "level_count_match": same("bid_level_count") and same("ask_level_count"),
        "local": local_summary,
        "rest": rest_summary,
        "bid_top_n_diff": side_diffs["bids"],
        "ask_top_n_diff": side_diffs["asks"],
    }


def read_gzip_jsonl(path: Path) -> list[tuple[int, dict[str, Any]]]:
    """Read a gzip JSONL file into ``(1-based line number, parsed object)`` pairs.

    Lines that are empty after stripping are skipped but still counted.
    """
    with gzip.open(path, "rt", encoding="utf-8") as handle:
        return [
            (line_number, json.loads(line))
            for line_number, line in enumerate(handle, 1)
            if line.strip()
        ]


def write_gzip_jsonl(path: Path, rows: list[dict[str, Any]]) -> None:
    """Write ``rows`` as compact, key-sorted gzip JSONL, creating parent dirs."""
    path.parent.mkdir(parents=True, exist_ok=True)
    with gzip.open(path, "wt", encoding="utf-8") as handle:
        for record in rows:
            encoded = json.dumps(record, separators=(",", ":"), sort_keys=True)
            handle.write(encoded + "\n")
def summarize_file(path: Path, rows: int, kind: str) -> dict[str, Any]:
    """Describe an output file for the manifest.

    Args:
        path: File to describe.
        rows: Row count the caller wrote or read.
        kind: Caller-defined label for the file.

    Returns:
        A dict with path, kind, size in bytes, rows, SHA-256 and a status of
        ``valid`` (exists and non-empty) or ``missing_or_empty``. A missing
        file is reported with zero bytes and no digest instead of raising.
    """
    try:
        size = path.stat().st_size
    except FileNotFoundError:
        # Stat first and tolerate absence, otherwise the status below could
        # never report a missing file.
        size = None
    exists = size is not None
    return {
        "path": path.as_posix(),
        "kind": kind,
        "bytes": size if exists else 0,
        "rows": rows,
        "sha256": sha256_file(path) if exists else None,
        "status": "valid" if size else "missing_or_empty",
    }


def write_schema_docs(path: Path, schema_summary: dict[str, Any]) -> None:
    """Write the observed-schema markdown document to ``path``.

    ``schema_summary`` maps event type to a dict with ``count`` and ``fields``
    plus optional ``level_fields`` and ``notes``. Event types are emitted in
    sorted order and parent directories are created as needed.
    """
    lines = [
        "# Polymarket Websocket Schema Observed In Checkpoint 10B",
        "",
        "This document summarizes observed public market websocket message shapes from the bounded 10B BTC sample. It does not include full raw payload dumps; raw payloads remain in the gzip JSONL sample files.",
        "",
        "## Observed Event Types",
        "",
    ]
    for event_type, info in sorted(schema_summary.items()):
        lines.extend([
            f"### {event_type}",
            "",
            f"Count: `{info['count']}`",
            "",
            f"Observed top-level fields: `{', '.join(info['fields'])}`",
            "",
        ])
        if info.get("level_fields"):
            lines.extend([f"Nested level/change fields: `{', '.join(info['level_fields'])}`", ""])
        lines.append(info.get("notes") or "No additional notes.")
        lines.append("")
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text("\n".join(lines), encoding="utf-8")
def write_reconstruction_doc(path: Path) -> None:
    """Write the static Markdown note describing the reconstruction method.

    The text contains no run-specific values. Parent directories of ``path``
    are created when missing and an existing file is overwritten.
    """
    title = "# Book Reconstruction Method"
    intro = "Checkpoint 10C reconstructs order-book state from raw Polymarket market websocket messages captured in Checkpoint 10B."
    # Each entry is (heading, body lines). An empty string inside a body is a
    # paragraph separator.
    sections = (
        (
            "## Source Of Truth",
            (
                "Raw websocket and REST checkpoint gzip JSONL files are immutable source evidence. Reconstruction outputs are derived and reference the input file paths, line numbers, websocket message sequence spans, and REST checkpoint sequences.",
            ),
        ),
        (
            "## Applied Events",
            (
                "- `book` and `book_without_event_type` messages initialize or replace the full per-token bid/ask maps.",
                "- `price_change` messages are applied after initialization. Observed `side=BUY` updates bids and `side=SELL` updates asks.",
                "- Observed `size=0` is treated as level removal. Non-zero size replaces the level size at that price.",
                "- `best_bid_ask`, `last_trade_price`, and unrelated `new_market` messages are preserved and counted but do not mutate the book map.",
            ),
        ),
        (
            "## Comparison",
            (
                "For each REST checkpoint, the reconstructor compares REST `/books` payloads with local websocket state after applying all websocket messages received at or before the REST checkpoint receive time. The comparison includes best bid, best ask, spread, bid/ask level counts, and top 10 levels by default.",
            ),
        ),
        (
            "## Limits",
            (
                "The sample is short and network timing can produce REST-vs-websocket divergences. Divergence rows include raw websocket and REST references so follow-up can inspect whether differences are timing, feed semantics, or reconstruction defects.",
            ),
        ),
        (
            "## Checkpoint 10C Divergence Result",
            (
                "The accepted 10C sample produced 20 REST comparison rows: 8 exact top-10 matches and 12 divergent rows. In every divergent row, best bid, best ask, spread, level counts, and top-N price membership matched. The observed divergences were size-only deltas within shared top-N price levels.",
                "",
                "Size-only divergence still matters. It can change depth, fillability assumptions, queue-size estimates, and any later answer about whether a hypothetical trade was observable and reproducible from the archived feed.",
                "",
                "This result is useful evidence for the websocket path, but it is not production readiness. The sample is bounded, the timing relationship between REST checkpoints and websocket delivery is imperfect, and long-running reconnect, stale-feed, rotation, upload, and alert behavior still need their own checkpoint before deployment.",
            ),
        ),
    )
    rendered = [title, "", intro, ""]
    for heading, body in sections:
        rendered += [heading, "", *body, ""]
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text("\n".join(rendered), encoding="utf-8")
def write_report(path: Path, manifest: dict[str, Any]) -> None:
    """Render the Checkpoint 10C manifest as a Markdown report.

    Args:
        path: Destination Markdown file; parent directories are created.
        manifest: Manifest dict as assembled by ``run_reconstruction``. The
            keys read here are ``gate_status``, ``ended_at_utc``, ``input``,
            ``run_id``, ``event_type_counts``, ``unsupported_event_counts``,
            ``token_reconstruction_statuses``, ``comparison_summary``,
            ``output_files``, ``warnings`` and ``readiness_finding``.

    Nothing is written until every section has been rendered, so a missing
    manifest key raises before the file is touched.
    """
    report: list[str] = [
        "# Checkpoint 10C Book Reconstruction Sample",
        "",
        f"Status: {manifest['gate_status']} ",
        f"Created: {manifest['ended_at_utc']} ",
        "Production ready: no ",
        "Live Kubernetes collector modified: no",
        "",
        "## Input",
        "",
        f"- 10B manifest: `{manifest['input']['manifest_path']}`.",
        f"- 10B gate: `{manifest['input']['gate_status']}`.",
        f"- Run id: `{manifest['run_id']}`.",
        "",
        "## Observed Websocket Events",
        "",
        f"Event type counts: `{json.dumps(manifest['event_type_counts'], sort_keys=True)}`.",
        f"Unsupported/non-mutating counts: `{json.dumps(manifest['unsupported_event_counts'], sort_keys=True)}`.",
        "",
        "## Reconstruction Status",
        "",
    ]

    # One bullet per tracked token, in manifest order.
    for token_id, token_status in manifest["token_reconstruction_statuses"].items():
        token = token_status.get("token", {})
        report.append(
            f"- `{token_id}` ({token.get('market_slug')} {token.get('outcome')}): `{token_status['state_quality']}`, initialized `{token_status['initialized']}`, applied `{token_status['messages_applied']}`, skipped `{token_status['messages_skipped']}`, unknown `{token_status['unknown_messages']}`."
        )

    report += [
        "",
        "## REST Comparison",
        "",
        f"- Comparison rows: `{manifest['comparison_summary']['comparison_count']}`.",
        f"- Matches: `{manifest['comparison_summary']['match_count']}`.",
        f"- Divergences: `{manifest['comparison_summary']['divergent_count']}`.",
        f"- No-state rows: `{manifest['comparison_summary']['no_state_count']}`.",
        "",
        "Divergence samples are preserved in the machine-readable manifest with raw websocket and REST references.",
        "",
        "## Output Files",
        "",
    ]
    report += [
        f"- `{entry['path']}`: `{entry['kind']}`, rows `{entry['rows']}`, bytes `{entry['bytes']}`, sha256 `{entry['sha256']}`"
        for entry in manifest["output_files"]
    ]

    report += ["", "## Assumptions And Warnings", ""]
    if not manifest["warnings"]:
        report.append("- None.")
    else:
        report += [f"- {warning}" for warning in manifest["warnings"]]

    report += [
        "",
        "## Gate",
        "",
        manifest["gate_status"],
        "",
        "## Full-Fidelity Readiness Finding",
        "",
        manifest["readiness_finding"],
        "",
        "## Strongest Fake Progress Risk",
        "",
        "A reconstruction script can look correct while silently ignoring unsupported message semantics. This sample records unsupported event counts and comparison divergences with raw references so the next deployment step has audit evidence.",
        "",
        "## Next Smallest Step",
        "",
        "If combined 10B/10C passes, plan 10D: convert this proven sample path into a long-running Kubernetes websocket recorder with rotation, reconnect/stale-feed evidence, REST checkpoint recovery, upload cleanup, and migration from REST-only collection.",
        "",
    ]
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text("\n".join(report), encoding="utf-8")
def run_reconstruction(args: argparse.Namespace) -> dict[str, Any]:
    """Replay a recorded websocket sample and compare it with REST checkpoints.

    Reads the 10B manifest named by ``args.input_manifest``, loads the raw
    websocket and REST gzip JSONL files it lists, replays websocket rows in
    receive-time order into one ``BookState`` per tracked token, and at every
    REST checkpoint compares the REST book of each tracked token with the
    local state.

    Side effects: writes the reconstructed-state and comparison gzip JSONL
    files under ``args.output_root``, the manifest JSON at
    ``args.manifest_path``, and the schema, reconstruction and report
    Markdown files.

    Returns:
        The manifest dict that was written to ``args.manifest_path``.

    Raises:
        RuntimeError: if the input manifest's ``gate_status`` is not
            ``"WS_RAW_SAMPLE_PASS"``.
    """
    started_at_utc = iso_z()
    input_manifest_path = args.input_manifest
    input_manifest = json.loads(input_manifest_path.read_text(encoding="utf-8"))
    if input_manifest.get("gate_status") != "WS_RAW_SAMPLE_PASS":
        raise RuntimeError("10C requires 10B gate_status WS_RAW_SAMPLE_PASS")
    run_id = input_manifest["run_id"]
    # next() has no default here, so a manifest without the expected kind
    # surfaces as StopIteration.
    ws_file = Path(next(item["path"] for item in input_manifest["output_files"] if item["kind"] == "raw_websocket_messages"))
    rest_file = Path(next(item["path"] for item in input_manifest["output_files"] if item["kind"] == "rest_books_checkpoints"))
    ws_rows = read_gzip_jsonl(ws_file)
    rest_rows = read_gzip_jsonl(rest_file)
    token_meta = {token["token_id"]: token for token in input_manifest["tokens_tracked"]}
    state_by_token = {token_id: BookState(meta) for token_id, meta in token_meta.items()}
    event_type_counts: dict[str, int] = {}
    unsupported_counts: dict[str, int] = {}
    # Seeded with the standing assumptions; de-duplicated and sorted when the
    # manifest is built.
    warnings: list[str] = ["Observed price_change side semantics are assumed as BUY->bids and SELL->asks.", "Observed size=0 is treated as level removal."]
    schema_summary: dict[str, dict[str, Any]] = {}

    def observe_schema(item: dict[str, Any], event_type: str) -> None:
        """Accumulate the count and observed field names for one event item."""
        info = schema_summary.setdefault(event_type, {"count": 0, "fields": set(), "level_fields": set(), "notes": ""})
        info["count"] += 1
        info["fields"].update(str(k) for k in item.keys())
        for key in ("bids", "asks", "price_changes"):
            values = item.get(key)
            if isinstance(values, list):
                # Only the first 20 nested entries are inspected for field names.
                for nested in values[:20]:
                    if isinstance(nested, dict):
                        info["level_fields"].update(str(k) for k in nested.keys())
        notes = {
            "book": "Full per-token book snapshot used to initialize or replace local state.",
            "book_without_event_type": "Full per-token book snapshot without event_type; treated like book if observed.",
            "price_change": "Incremental price/size updates applied after a token has an initialized book.",
            "best_bid_ask": "Best quote summary; counted but not applied to level maps.",
            "last_trade_price": "Trade print summary; counted but not applied to level maps.",
            "new_market": "Market metadata broadcast; preserved and counted but unrelated to selected BTC token state in this sample.",
        }
        info["notes"] = notes.get(event_type, "Unsupported/unknown shape preserved and counted.")

    # Schema pass: runs over every websocket row in file order, independent of
    # the replay below.
    for _line, row in ws_rows:
        for item in raw_event_items(row):
            observe_schema(item, classify_item(item))
    # Sets become sorted lists so the summary is JSON-serialisable and stable.
    for info in schema_summary.values():
        info["fields"] = sorted(info["fields"])
        info["level_fields"] = sorted(info["level_fields"])

    comparisons: list[dict[str, Any]] = []
    state_snapshots: list[dict[str, Any]] = []
    # ws_index is a cursor into ws_sorted shared by all REST checkpoints.
    ws_index = 0
    first_applied_seq: int | None = None
    last_applied_seq: int | None = None
    first_applied_line: int | None = None
    last_applied_line: int | None = None
    last_applied_received_at_utc: str | None = None
    # Rows whose timestamp cannot be parsed sort first (datetime.min in UTC).
    rest_sorted = sorted(rest_rows, key=lambda item: parse_iso(item[1].get("received_at_utc")) or dt.datetime.min.replace(tzinfo=dt.UTC))
    ws_sorted = sorted(ws_rows, key=lambda item: (parse_iso(item[1].get("received_at_utc")) or dt.datetime.min.replace(tzinfo=dt.UTC), item[1].get("global_message_sequence") or 0))
    for rest_line, rest_row in rest_sorted:
        rest_time = parse_iso(rest_row.get("received_at_utc"))
        # Apply every websocket row received at or before this checkpoint. If
        # either timestamp is unparseable the row is applied, not held back.
        while ws_index < len(ws_sorted):
            ws_line, ws_row = ws_sorted[ws_index]
            ws_time = parse_iso(ws_row.get("received_at_utc"))
            if rest_time is not None and ws_time is not None and ws_time > rest_time:
                break
            apply_ws_row(ws_row, state_by_token, event_type_counts, unsupported_counts, warnings)
            seq = ws_row.get("global_message_sequence")
            if isinstance(seq, int):
                first_applied_seq = seq if first_applied_seq is None else min(first_applied_seq, seq)
                last_applied_seq = seq if last_applied_seq is None else max(last_applied_seq, seq)
            # NOTE(review): nesting reconstructed from a whitespace-collapsed
            # patch; these three updates are assumed to sit outside the
            # isinstance check above - confirm against the original file.
            # last_applied_line is the most recently applied line, not a
            # maximum, because rows are replayed in time order.
            first_applied_line = ws_line if first_applied_line is None else min(first_applied_line, ws_line)
            last_applied_line = ws_line
            last_applied_received_at_utc = ws_row.get("received_at_utc")
            ws_index += 1
        rest_payload = (rest_row.get("response") or {}).get("raw_response_json")
        if not isinstance(rest_payload, list):
            warnings.append(f"REST checkpoint {rest_row.get('checkpoint_sequence')} payload was not a list")
            # The continue also skips the state snapshots for this checkpoint.
            continue
        for rest_item in rest_payload:
            if not isinstance(rest_item, dict):
                continue
            token_id = str(rest_item.get("asset_id") or "")
            if token_id not in state_by_token:
                continue
            state = state_by_token[token_id]
            # Provenance fields shared by every comparison row.
            base = {
                "run_id": run_id,
                "token_id": token_id,
                "market": state.token_meta,
                "rest_checkpoint_sequence": rest_row.get("checkpoint_sequence"),
                "rest_checkpoint_received_at_utc": rest_row.get("received_at_utc"),
                "rest_checkpoint_file": rest_file.as_posix(),
                "rest_checkpoint_line": rest_line,
                "raw_websocket_file": ws_file.as_posix(),
                "applied_ws_message_count": ws_index,
                "applied_ws_line_span": [first_applied_line, last_applied_line],
                "applied_ws_global_sequence_span": [first_applied_seq, last_applied_seq],
                "last_applied_ws_line": last_applied_line,
                "last_applied_ws_received_at_utc": last_applied_received_at_utc,
                "last_local_update_received_at_utc": state.last_update_received_at_utc,
                "state_quality": state.state_quality(),
            }
            if not state.initialized:
                comp = {**base, "comparison_status": "no_state", "reason": "token not initialized by preceding websocket book event"}
            else:
                comp = {**base, **compare_books(state, rest_item, args.top_n)}
            comparisons.append(comp)
        # One snapshot per tracked token per checkpoint, whether or not the
        # token appeared in the REST payload.
        for token_id, state in state_by_token.items():
            state_snapshots.append({
                "run_id": run_id,
                "snapshot_basis": "after_websocket_messages_preceding_rest_checkpoint",
                "rest_checkpoint_sequence": rest_row.get("checkpoint_sequence"),
                "rest_checkpoint_received_at_utc": rest_row.get("received_at_utc"),
                "raw_websocket_file": ws_file.as_posix(),
                "applied_ws_message_count": ws_index,
                "applied_ws_line_span": [first_applied_line, last_applied_line],
                "applied_ws_global_sequence_span": [first_applied_seq, last_applied_seq],
                "last_applied_ws_line": last_applied_line,
                "last_applied_ws_received_at_utc": last_applied_received_at_utc,
                **state.clone_summary(args.top_n),
            })

    # Apply remaining websocket messages for final token statuses.
    while ws_index < len(ws_sorted):
        _ws_line, ws_row = ws_sorted[ws_index]
        apply_ws_row(ws_row, state_by_token, event_type_counts, unsupported_counts, warnings)
        ws_index += 1

    output_dir = args.output_root / "polymarket" / "books" / run_id
    comparison_dir = args.output_root / "polymarket" / "comparisons" / run_id
    state_file = output_dir / f"polymarket_reconstructed_books_{run_id}.jsonl.gz"
    comparison_file = comparison_dir / f"polymarket_rest_comparison_{run_id}.jsonl.gz"
    write_gzip_jsonl(state_file, state_snapshots)
    write_gzip_jsonl(comparison_file, comparisons)

    statuses = {token_id: state.clone_summary(args.top_n) for token_id, state in state_by_token.items()}
    # Group initialization flags by condition_id; the gate needs at least one
    # market whose first two tokens were both initialized.
    market_token_init: dict[str, list[bool]] = {}
    for state in state_by_token.values():
        market_token_init.setdefault(str(state.token_meta.get("condition_id")), []).append(state.initialized)
    any_market_both_initialized = any(len(values) >= 2 and all(values[:2]) for values in market_token_init.values())
    match_count = sum(1 for row in comparisons if row.get("comparison_status") == "match")
    divergent_count = sum(1 for row in comparisons if row.get("comparison_status") == "divergent")
    no_state_count = sum(1 for row in comparisons if row.get("comparison_status") == "no_state")
    if not any_market_both_initialized:
        gate = "BLOCKED_INSUFFICIENT_WS_EVENTS"
    elif not comparisons:
        gate = "BLOCKED_REST_COMPARISON"
    elif no_state_count == len(comparisons):
        gate = "BLOCKED_INSUFFICIENT_WS_EVENTS"
    else:
        # Divergent rows do not block the gate; they only add a warning.
        gate = "BOOK_RECONSTRUCTION_SAMPLE_PASS"
        # NOTE(review): nesting reconstructed from a whitespace-collapsed
        # patch; this warning may instead belong after the if/elif chain -
        # confirm against the original file.
        if divergent_count:
            warnings.append("REST comparison divergences were observed and are preserved with raw references; timing differences are possible in this short live sample.")

    readiness_finding = (
        "The sample proves that observed websocket `book` snapshots can initialize local state and `price_change` messages can update it offline. REST comparisons executed with raw references; divergences require review before a live websocket recorder replaces REST-only collection."
        if gate == "BOOK_RECONSTRUCTION_SAMPLE_PASS"
        else "The sample did not prove enough websocket reconstruction behavior for a live recorder migration."
    )
    manifest = {
        "schema_name": "checkpoint_010c_book_reconstruction_sample",
        "schema_version": 1,
        "checkpoint_id": "10C",
        "checkpoint_name": "Offline Book Reconstruction And REST Comparison Sample",
        "gate_status": gate,
        "production_ready": False,
        "live_kubernetes_collector_modified": False,
        "reconstructor": {"name": RECONSTRUCTOR_NAME, "version": RECONSTRUCTOR_VERSION},
        "started_at_utc": started_at_utc,
        "ended_at_utc": iso_z(),
        "run_id": run_id,
        "top_n": args.top_n,
        "input": {
            "manifest_path": input_manifest_path.as_posix(),
            "manifest_sha256": sha256_file(input_manifest_path),
            "gate_status": input_manifest.get("gate_status"),
            # Both the recomputed and the manifest-declared hash are recorded;
            # this function does not compare them.
            "raw_files": [
                {"path": ws_file.as_posix(), "kind": "raw_websocket_messages", "sha256": sha256_file(ws_file), "expected_sha256": next(item["sha256"] for item in input_manifest["output_files"] if item["kind"] == "raw_websocket_messages")},
                {"path": rest_file.as_posix(), "kind": "rest_books_checkpoints", "sha256": sha256_file(rest_file), "expected_sha256": next(item["sha256"] for item in input_manifest["output_files"] if item["kind"] == "rest_books_checkpoints")},
            ],
        },
        "output_files": [summarize_file(state_file, len(state_snapshots), "reconstructed_book_state_snapshots"), summarize_file(comparison_file, len(comparisons), "rest_comparison_rows")],
        "event_type_counts": dict(sorted(event_type_counts.items())),
        "observed_schema_summary": schema_summary,
        "unsupported_event_counts": dict(sorted(unsupported_counts.items())),
        "token_reconstruction_statuses": statuses,
        # Only the first 10 divergent rows are embedded as samples.
        "comparison_summary": {"comparison_count": len(comparisons), "match_count": match_count, "divergent_count": divergent_count, "no_state_count": no_state_count, "divergence_samples": [row for row in comparisons if row.get("comparison_status") == "divergent"][:10]},
        "assumptions": ["BUY price_change updates bids; SELL price_change updates asks.", "size=0 removes a level; non-zero size replaces that price level.", "REST checkpoint comparison uses websocket state after messages received at or before REST checkpoint received_at_utc."],
        "warnings": sorted(set(warnings)),
        "readiness_finding": readiness_finding,
        "strongest_fake_progress_risk": "Ignoring unsupported websocket events or REST divergences would overstate full-fidelity readiness.",
        "next_step": "If combined 10B/10C passes, plan 10D long-running Kubernetes websocket recorder with rotation, reconnect/stale-feed evidence, REST checkpoint recovery, upload cleanup, and migration plan.",
    }
    args.manifest_path.parent.mkdir(parents=True, exist_ok=True)
    args.manifest_path.write_text(json.dumps(manifest, indent=2, sort_keys=True) + "\n", encoding="utf-8")
    write_schema_docs(args.schema_doc_path, schema_summary)
    write_reconstruction_doc(args.reconstruction_doc_path)
    write_report(args.report_path, manifest)
    return manifest


def parse_args() -> argparse.Namespace:
    """Parse the reconstructor's command-line options.

    Every path option falls back to a module-level ``DEFAULT_*`` constant;
    ``--top-n`` defaults to 10.
    """
    parser = argparse.ArgumentParser(description="Reconstruct Polymarket websocket books and compare to REST checkpoints.")
    parser.add_argument("--input-manifest", type=Path, default=DEFAULT_INPUT_MANIFEST)
    parser.add_argument("--output-root", type=Path, default=DEFAULT_OUTPUT_ROOT)
    parser.add_argument("--manifest-path", type=Path, default=DEFAULT_MANIFEST_PATH)
    parser.add_argument("--report-path", type=Path, default=DEFAULT_REPORT_PATH)
    parser.add_argument("--schema-doc-path", type=Path, default=DEFAULT_SCHEMA_DOC)
    parser.add_argument("--reconstruction-doc-path", type=Path, default=DEFAULT_RECON_DOC)
    parser.add_argument("--top-n", type=int, default=10)
    return parser.parse_args()


def main() -> int:
    """Run the reconstruction and print the manifest, report and gate lines.

    Returns 0 when the gate is one of the two accepted values and 1
    otherwise, for use as the process exit status.
    """
    args = parse_args()
    manifest = run_reconstruction(args)
    print(f"RECONSTRUCTION_MANIFEST={args.manifest_path}")
    print(f"RECONSTRUCTION_REPORT={args.report_path}")
    print(f"RECONSTRUCTION_GATE={manifest['gate_status']}")
    # run_reconstruction above never assigns "BOOK_RECONSTRUCTION_NEEDS_REVIEW";
    # it is accepted here but cannot currently occur.
    return 0 if manifest["gate_status"] in {"BOOK_RECONSTRUCTION_SAMPLE_PASS", "BOOK_RECONSTRUCTION_NEEDS_REVIEW"} else 1
+if __name__ == "__main__": + raise SystemExit(main()) diff --git a/scripts/record_polymarket_ws_sample.py b/scripts/record_polymarket_ws_sample.py new file mode 100755 index 0000000..00c866f --- /dev/null +++ b/scripts/record_polymarket_ws_sample.py @@ -0,0 +1,912 @@ +#!/usr/bin/env python3 +"""Bounded raw Polymarket websocket sample recorder with REST checkpoints. + +Checkpoint 10B scope: public BTC up/down market data only. This script is a +finite sample recorder. It does not trade, sign, authenticate, or modify the +live Kubernetes collector. +""" + +from __future__ import annotations + +import argparse +import base64 +import datetime as dt +import gzip +import hashlib +import json +import os +import signal +import socket +import ssl +import struct +import sys +import time +import urllib.error +import urllib.parse +import urllib.request +from pathlib import Path +from typing import Any + + +COLLECTOR_NAME = "polymarket_ws_sample_recorder" +COLLECTOR_VERSION = "0.1.0" +WS_SCHEMA_NAME = "raw_polymarket_market_ws_message" +REST_SCHEMA_NAME = "raw_polymarket_books_checkpoint" +SCHEMA_VERSION = 1 + +DEFAULT_CONFIG_PATH = Path("config/polymarket_ws_sample.example.yaml") +DEFAULT_DISCOVERY_PATH = Path("data/discovery/polymarket_btc_markets_latest.json") +DEFAULT_OUTPUT_ROOT = Path("data/ws_sample") +DEFAULT_MANIFEST_PATH = Path("data/manifests/checkpoint_010b_ws_raw_sample.json") +DEFAULT_REPORT_PATH = Path("reports/checkpoints/checkpoint_010b_ws_raw_sample.md") + +MARKET_WS_URL = "wss://ws-subscriptions-clob.polymarket.com/ws/market" +CLOB_BOOKS_URL = "https://clob.polymarket.com/books" + +SAFE_RESPONSE_HEADERS = { + "cache-control", + "cf-cache-status", + "cf-ray", + "content-length", + "content-type", + "date", + "retry-after", + "server", + "x-ratelimit-limit", + "x-ratelimit-remaining", + "x-ratelimit-reset", + "ratelimit-limit", + "ratelimit-remaining", + "ratelimit-reset", +} + +STOP_REQUESTED = False +STOP_SIGNAL: str | None = None + + +def 
handle_stop(signum: int, _frame: Any) -> None: + global STOP_REQUESTED, STOP_SIGNAL + STOP_REQUESTED = True + STOP_SIGNAL = signal.Signals(signum).name + + +def utc_now() -> dt.datetime: + return dt.datetime.now(dt.UTC) + + +def iso_z(value: dt.datetime | None = None) -> str: + value = value or utc_now() + return value.astimezone(dt.UTC).replace(microsecond=0).isoformat().replace("+00:00", "Z") + + +def compact_timestamp(value: dt.datetime | None = None) -> str: + value = value or utc_now() + return value.astimezone(dt.UTC).strftime("%Y%m%dT%H%M%SZ") + + +def parse_iso(value: Any) -> dt.datetime | None: + if not isinstance(value, str) or not value.strip(): + return None + text = value.strip() + if text.endswith("Z"): + text = text[:-1] + "+00:00" + try: + parsed = dt.datetime.fromisoformat(text) + except ValueError: + return None + if parsed.tzinfo is None: + parsed = parsed.replace(tzinfo=dt.UTC) + return parsed.astimezone(dt.UTC) + + +def sha256_bytes(data: bytes) -> str: + return hashlib.sha256(data).hexdigest() + + +def sha256_file(path: Path) -> str: + digest = hashlib.sha256() + with path.open("rb") as handle: + for chunk in iter(lambda: handle.read(1024 * 1024), b""): + digest.update(chunk) + return digest.hexdigest() + + +def parse_scalar(value: str) -> Any: + value = value.strip() + if not value: + return "" + if value[0] in {"'", '"'} and value[-1:] == value[0]: + return value[1:-1] + lower = value.lower() + if lower in {"true", "false"}: + return lower == "true" + if lower in {"null", "none"}: + return None + try: + return int(value) + except ValueError: + pass + try: + return float(value) + except ValueError: + return value + + +def load_flat_yaml(path: Path) -> dict[str, Any]: + """Parse the flat YAML subset used by this repo's example configs.""" + config: dict[str, Any] = {} + if not path.exists(): + return config + for line_number, raw_line in enumerate(path.read_text(encoding="utf-8").splitlines(), 1): + line = raw_line.split("#", 1)[0].strip() + 
def _strip_inline_comment(raw_line: str) -> str:
    """Return ``raw_line`` without its trailing ``#`` comment.

    A ``#`` inside a value that starts with a quote and has a matching
    closing quote is kept. In every other case the line is cut at the first
    ``#``.
    """
    key_part, colon, value_part = raw_line.partition(":")
    if colon and "#" not in key_part:
        value = value_part.lstrip()
        if value[:1] in {"'", '"'}:
            closing = value.find(value[0], 1)
            if closing != -1:
                # Keep everything up to and including the closing quote, then
                # strip a comment only from what follows it.
                keep = len(raw_line) - len(value) + closing + 1
                return raw_line[:keep] + raw_line[keep:].split("#", 1)[0]
    return raw_line.split("#", 1)[0]


def load_flat_yaml(path: Path) -> dict[str, Any]:
    """Parse the flat YAML subset used by this repo's example configs.

    Only ``key: value`` lines, blank lines and ``#`` comments are supported.
    Values are converted with ``parse_scalar``. A missing file yields an
    empty dict.

    Raises:
        ValueError: for a non-blank line without ``:`` or with an empty key.
    """
    config: dict[str, Any] = {}
    if not path.exists():
        return config
    for line_number, raw_line in enumerate(path.read_text(encoding="utf-8").splitlines(), 1):
        # Quote-aware stripping, so a value such as "wss://host/path#frag"
        # is not truncated at the '#'.
        line = _strip_inline_comment(raw_line).strip()
        if not line:
            continue
        if ":" not in line:
            raise ValueError(f"Unsupported config line {line_number}: {raw_line}")
        key, value = line.split(":", 1)
        key = key.strip()
        if not key:
            raise ValueError(f"Missing config key on line {line_number}")
        config[key] = parse_scalar(value)
    return config


def filter_headers(headers: Any) -> dict[str, str]:
    """Keep only the response headers named in ``SAFE_RESPONSE_HEADERS``.

    Header names are matched case-insensitively; the original spelling of
    each kept name is preserved in the result.
    """
    safe: dict[str, str] = {}
    for key, value in dict(headers).items():
        if key.lower() in SAFE_RESPONSE_HEADERS:
            safe[key] = value
    return safe


def load_discovery(path: Path) -> dict[str, Any]:
    """Load the discovery JSON document stored at ``path``."""
    return json.loads(path.read_text(encoding="utf-8"))


def market_is_usable(market: dict[str, Any], now: dt.datetime, safety_seconds: int) -> tuple[bool, list[str]]:
    """Decide whether a discovered market may be recorded.

    Args:
        market: Normalized market dict from the discovery file.
        now: Reference time for the end-time check.
        safety_seconds: Minimum time that must remain before the market ends.

    Returns:
        ``(usable, reasons)``. ``reasons`` lists every failed check and is
        empty exactly when ``usable`` is true.
    """
    reasons: list[str] = []
    # The flags must be exactly True/False; None or a missing key counts as a
    # rejection.
    if market.get("active") is not True:
        reasons.append("not_active")
    if market.get("closed") is not False:
        reasons.append("closed")
    if market.get("accepting_orders") is not True:
        reasons.append("not_accepting_orders")
    if market.get("enable_order_book") is not True:
        reasons.append("order_book_not_enabled")
    end_time = parse_iso(market.get("end_time_utc"))
    if end_time is None:
        reasons.append("missing_end_time")
    elif end_time <= now + dt.timedelta(seconds=safety_seconds):
        reasons.append("too_close_to_end_or_expired")
    tokens = market.get("tokens")
    if not isinstance(tokens, list) or len(tokens) < 2:
        reasons.append("missing_two_tokens")
    else:
        outcomes = [token.get("outcome") for token in tokens if isinstance(token, dict)]
        token_ids = [token.get("token_id") for token in tokens if isinstance(token, dict)]
        # The first two dict tokens must be Up then Down, each with an id.
        if outcomes[:2] != ["Up", "Down"] or not all(token_ids[:2]):
            reasons.append("bad_up_down_token_mapping")
    return not reasons, reasons
def select_markets(
    discovery: dict[str, Any],
    *,
    market_limit: int,
    market_end_safety_seconds: int,
) -> tuple[list[dict[str, Any]], dict[str, int]]:
    """Pick the first usable markets from a discovery document.

    Walks ``discovery["normalized_markets"]`` in order and stops as soon as
    ``market_limit`` markets are accepted, so later entries are neither
    selected nor counted as rejections.

    Returns:
        ``(selected, rejection_counts)`` where ``rejection_counts`` maps each
        rejection reason to its number of occurrences, sorted by reason.
    """
    reference_time = utc_now()
    accepted: list[dict[str, Any]] = []
    rejected: dict[str, int] = {}

    def count(reason: str) -> None:
        rejected[reason] = rejected.get(reason, 0) + 1

    for candidate in discovery.get("normalized_markets") or []:
        if not isinstance(candidate, dict):
            count("not_object")
            continue
        usable, reasons = market_is_usable(candidate, reference_time, market_end_safety_seconds)
        if usable:
            accepted.append(candidate)
            if len(accepted) >= market_limit:
                break
        else:
            for reason in reasons:
                count(reason)
    return accepted, dict(sorted(rejected.items()))


def flatten_tokens(markets: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Flatten the first two tokens of each market into per-token records.

    Non-dict entries among the first two tokens are skipped. ``token_id`` is
    always stringified; the other fields are copied from the market and
    token as found.
    """
    flattened: list[dict[str, Any]] = []
    for market in markets:
        leading_tokens = market.get("tokens", [])[:2]
        flattened.extend(
            {
                "market_name": market.get("market_name"),
                "market_slug": market.get("market_slug"),
                "condition_id": market.get("condition_id"),
                "token_id": str(token.get("token_id")),
                "outcome": token.get("outcome"),
                "outcome_index": token.get("outcome_index"),
                "market_end_time_utc": market.get("end_time_utc"),
            }
            for token in leading_tokens
            if isinstance(token, dict)
        )
    return flattened


def decode_json_maybe(text: str) -> tuple[Any | None, str | None]:
    """Parse ``text`` as JSON, returning ``(value, None)`` or ``(None, error)``."""
    try:
        value = json.loads(text)
    except json.JSONDecodeError as exc:
        return None, str(exc)
    return value, None


def classify_ws_payload(payload: Any) -> list[str]:
    """Label every item of a decoded websocket payload with an event type.

    A non-list payload is treated as a single item. Labels are: the item's
    own ``event_type`` when truthy; ``"book_without_event_type"`` for a dict
    carrying the full book key set; ``"unknown_object"`` for any other dict;
    and the Python type name for a non-dict item.
    """
    book_keys = {"market", "asset_id", "bids", "asks", "timestamp"}

    def label(item: Any) -> str:
        if not isinstance(item, dict):
            return type(item).__name__
        declared = item.get("event_type")
        if declared:
            return str(declared)
        if book_keys.issubset(item.keys()):
            return "book_without_event_type"
        return "unknown_object"

    entries = payload if isinstance(payload, list) else [payload]
    return [label(entry) for entry in entries]
def send_ws_frame(sock: ssl.SSLSocket, opcode: int, payload: bytes) -> None:
    """Send one masked websocket frame with the FIN bit set.

    The payload length is encoded in the 7-bit, 16-bit or 64-bit form
    depending on its size, and the payload is XOR-masked with a fresh random
    4-byte key.
    """
    masking_key = os.urandom(4)
    size = len(payload)
    # 0x80 in the first byte is FIN; 0x80 in the length byte is the mask flag.
    frame_head = bytearray((0x80 | opcode,))
    if size < 126:
        frame_head.append(0x80 | size)
    elif size < 65536:
        frame_head.append(0x80 | 126)
        frame_head += struct.pack("!H", size)
    else:
        frame_head.append(0x80 | 127)
        frame_head += struct.pack("!Q", size)
    masked_payload = bytes(
        octet ^ masking_key[position % 4] for position, octet in enumerate(payload)
    )
    sock.sendall(frame_head + masking_key + masked_payload)


def read_exact(sock: ssl.SSLSocket, length: int) -> bytes:
    """Read exactly ``length`` bytes from ``sock``.

    Raises:
        EOFError: if the peer closes the connection before ``length`` bytes
            have arrived.
    """
    buffer = bytearray()
    remaining = length
    while remaining > 0:
        piece = sock.recv(remaining)
        if not piece:
            raise EOFError("websocket connection closed while reading frame")
        buffer += piece
        remaining = length - len(buffer)
    return bytes(buffer)


def read_ws_frame(sock: ssl.SSLSocket) -> tuple[int, bytes]:
    """Read one websocket frame and return ``(opcode, payload)``.

    Extended 16-bit and 64-bit lengths are honoured and a masked payload is
    unmasked. The FIN bit is not inspected.
    """
    head = read_exact(sock, 2)
    opcode = head[0] & 0x0F
    is_masked = bool(head[1] & 0x80)
    size = head[1] & 0x7F
    if size == 126:
        (size,) = struct.unpack("!H", read_exact(sock, 2))
    elif size == 127:
        (size,) = struct.unpack("!Q", read_exact(sock, 8))
    masking_key = read_exact(sock, 4) if is_masked else b""
    payload = read_exact(sock, size) if size else b""
    if is_masked:
        payload = bytes(
            octet ^ masking_key[position % 4] for position, octet in enumerate(payload)
        )
    return opcode, payload


def parse_ws_headers(raw_headers: str) -> tuple[str, dict[str, str]]:
    """Split a raw HTTP response head into its status line and safe headers.

    Lines without a colon are ignored. The parsed headers are passed through
    ``filter_headers`` before being returned.
    """
    status_line, *header_lines = raw_headers.split("\r\n")
    parsed: dict[str, str] = {}
    for header_line in header_lines:
        name, separator, content = header_line.partition(":")
        if not separator:
            continue
        parsed[name.strip()] = content.strip()
    return status_line, filter_headers(parsed)
def open_websocket(url: str, timeout_seconds: float) -> tuple[ssl.SSLSocket, dict[str, Any]]:
    """Open a TLS connection and perform the websocket upgrade handshake.

    Args:
        url: Websocket URL. The connection always uses TLS; the port
            defaults to 443 when the URL does not name one.
        timeout_seconds: Timeout for connecting and for every socket
            operation afterwards.

    Returns:
        ``(sock, info)`` where ``info`` holds the response ``status_line``
        and the filtered response ``headers``.

    Raises:
        ValueError: if the URL has no host, the response head exceeds
            64 KiB, or the server does not answer with status 101.
        EOFError: if the server closes the connection during the handshake.

    The socket is closed before any handshake error propagates.
    """
    parsed = urllib.parse.urlparse(url)
    host = parsed.hostname
    if not host:
        raise ValueError("missing websocket host")
    port = parsed.port or 443
    path = parsed.path or "/"
    if parsed.query:
        path = f"{path}?{parsed.query}"
    raw_sock = socket.create_connection((host, port), timeout=timeout_seconds)
    try:
        sock = ssl.create_default_context().wrap_socket(raw_sock, server_hostname=host)
    except BaseException:
        raw_sock.close()
        raise
    try:
        sock.settimeout(timeout_seconds)
        key = base64.b64encode(os.urandom(16)).decode("ascii")
        request = (
            f"GET {path} HTTP/1.1\r\n"
            f"Host: {host}\r\n"
            "Upgrade: websocket\r\n"
            "Connection: Upgrade\r\n"
            f"Sec-WebSocket-Key: {key}\r\n"
            "Sec-WebSocket-Version: 13\r\n"
            "User-Agent: orderbooks-checkpoint-10b-ws-sample/0.1.0\r\n"
            "\r\n"
        )
        sock.sendall(request.encode("ascii"))
        raw_headers = bytearray()
        # Read one byte at a time so nothing past the blank line is consumed:
        # bytes that follow the header terminator already belong to the first
        # websocket frame and must stay in the socket for read_ws_frame.
        while not raw_headers.endswith(b"\r\n\r\n"):
            chunk = sock.recv(1)
            if not chunk:
                # recv() returns b"" once the peer has closed; without this
                # check the loop would spin forever.
                raise EOFError("websocket connection closed during handshake")
            raw_headers.extend(chunk)
            if len(raw_headers) > 65536:
                raise ValueError("websocket handshake headers exceeded 64 KiB")
        header_text = bytes(raw_headers[:-4]).decode("iso-8859-1", errors="replace")
        status_line, response_headers = parse_ws_headers(header_text)
        if " 101 " not in status_line:
            raise ValueError(f"websocket upgrade failed: {status_line}")
    except BaseException:
        sock.close()
        raise
    return sock, {"status_line": status_line, "headers": response_headers}
def http_post_books(
    *,
    url: str,
    token_ids: list[str],
    timeout_seconds: float,
) -> dict[str, Any]:
    """POST the token ids to the books endpoint and capture the outcome.

    No exception escapes: an HTTP error status and any other failure are
    recorded in the returned dict instead.

    Returns:
        A dict with request/response timestamps, duration, the request body,
        status code, filtered headers, the parsed JSON (or the JSON error),
        a hash and byte length of the response text, a 1000-character text
        preview when no JSON was parsed, the error string, and ``ok``.
    """
    requested_at_utc = iso_z()
    clock_start = time.monotonic()
    request_body = [{"token_id": token_id} for token_id in token_ids]
    encoded_body = json.dumps(request_body, separators=(",", ":")).encode("utf-8")

    status_code: int | None = None
    headers: dict[str, str] = {}
    response_text = ""
    error: str | None = None
    outgoing_headers = {
        "Accept": "application/json",
        "Content-Type": "application/json",
        "User-Agent": "orderbooks-checkpoint-10b-ws-sample/0.1.0",
    }
    try:
        request = urllib.request.Request(
            url, data=encoded_body, headers=outgoing_headers, method="POST"
        )
        with urllib.request.urlopen(request, timeout=timeout_seconds) as response:
            status_code = response.status
            headers = filter_headers(response.headers)
            response_text = response.read().decode("utf-8", errors="replace")
    except urllib.error.HTTPError as exc:
        # An HTTP error still carries a status, headers and body worth keeping.
        status_code = exc.code
        headers = filter_headers(exc.headers)
        response_text = exc.read().decode("utf-8", errors="replace")
        error = f"HTTPError: {exc}"
    except Exception as exc:  # noqa: BLE001 - preserve request failure evidence
        error = f"{type(exc).__name__}: {exc}"

    if response_text:
        parsed_json, json_error = decode_json_maybe(response_text)
    else:
        parsed_json, json_error = None, None
    received_at_utc = iso_z()
    duration_ms = round((time.monotonic() - clock_start) * 1000, 3)
    response_bytes = response_text.encode("utf-8")
    succeeded = (
        error is None
        and status_code is not None
        and 200 <= status_code < 300
        and json_error is None
    )
    return {
        "requested_at_utc": requested_at_utc,
        "received_at_utc": received_at_utc,
        "duration_ms": duration_ms,
        "request_body": request_body,
        "status_code": status_code,
        "headers": headers,
        "raw_response_json": parsed_json,
        "json_error": json_error,
        "raw_response_text_sha256": sha256_bytes(response_bytes),
        "raw_response_length_bytes": len(response_bytes),
        "raw_response_text_preview": response_text[:1000] if parsed_json is None else None,
        "error": error,
        "ok": succeeded,
    }


def write_jsonl(handle: gzip.GzipFile, row: dict[str, Any]) -> None:
    """Append ``row`` to a binary gzip handle as one compact JSON line."""
    encoded_row = json.dumps(row, separators=(",", ":"), sort_keys=True) + "\n"
    handle.write(encoded_row.encode("utf-8"))


def summarize_output_file(path: Path, rows_written: int, kind: str) -> dict[str, Any]:
    """Describe one output file for the manifest.

    A missing file is reported with ``bytes`` 0, ``sha256`` ``None`` and
    status ``"missing_or_empty"``; an existing empty file gets the same
    status.
    """
    present = path.exists()
    size = path.stat().st_size if present else 0
    return {
        "path": path.as_posix(),
        "kind": kind,
        "status": "valid" if present and size > 0 else "missing_or_empty",
        "bytes": size,
        "rows": rows_written,
        "sha256": sha256_file(path) if present else None,
    }


def config_value(config: dict[str, Any], args: argparse.Namespace, key: str, default: Any) -> Any:
    """Resolve one setting: command-line value, then config file, then default.

    A command-line value of ``None`` means "not given".
    """
    cli_value = getattr(args, key)
    return config.get(key, default) if cli_value is None else cli_value
def build_runtime_config(args: argparse.Namespace) -> dict[str, Any]:
    """Merge command-line options, the config file and built-in defaults.

    For every setting the command-line value wins, then the config file,
    then the default (see ``config_value``). Each value is converted to its
    target type.

    Raises:
        ValueError: if ``market_limit`` is below 1, or ``duration_seconds``
            or ``rest_checkpoint_interval_seconds`` is not positive.
    """
    file_config = load_flat_yaml(args.config) if args.config else {}
    runtime: dict[str, Any] = {
        "config_path": args.config,
        "config_sha256": sha256_file(args.config) if args.config and args.config.exists() else None,
        "config_snapshot": file_config,
    }
    # (setting name, converter, default), resolved in this order.
    typed_settings = (
        ("discovery_path", Path, DEFAULT_DISCOVERY_PATH),
        ("output_root", Path, DEFAULT_OUTPUT_ROOT),
        ("manifest_path", Path, DEFAULT_MANIFEST_PATH),
        ("report_path", Path, DEFAULT_REPORT_PATH),
        ("market_limit", int, 2),
        ("duration_seconds", float, 150.0),
        ("rest_checkpoint_interval_seconds", float, 30.0),
        ("request_timeout_seconds", float, 15.0),
        ("websocket_timeout_seconds", float, 15.0),
        ("websocket_url", str, MARKET_WS_URL),
        ("clob_books_url", str, CLOB_BOOKS_URL),
        ("max_reconnects", int, 2),
        ("reconnect_backoff_seconds", float, 3.0),
        ("market_end_safety_seconds", int, 420),
    )
    for setting, convert, fallback in typed_settings:
        runtime[setting] = convert(config_value(file_config, args, setting, fallback))

    violations = (
        (runtime["market_limit"] < 1, "market_limit must be >= 1"),
        (runtime["duration_seconds"] <= 0, "duration_seconds must be > 0"),
        (
            runtime["rest_checkpoint_interval_seconds"] <= 0,
            "rest_checkpoint_interval_seconds must be > 0",
        ),
    )
    for violated, message in violations:
        if violated:
            raise ValueError(message)
    return runtime
= None + try: + raw_text = payload_bytes.decode("utf-8") + except UnicodeDecodeError as exc: + decode_error = str(exc) + raw_text = payload_bytes.decode("utf-8", errors="replace") + parsed_json, json_error = decode_json_maybe(raw_text) if decode_error is None else (None, decode_error) + event_types = classify_ws_payload(parsed_json) if parsed_json is not None else ["unparseable_text"] + envelope = { + "schema_name": WS_SCHEMA_NAME, + "schema_version": SCHEMA_VERSION, + "collector": {"name": COLLECTOR_NAME, "version": COLLECTOR_VERSION}, + "run_id": run_id, + "session_id": session_id, + "connection_sequence": connection_sequence, + "message_sequence": message_sequence, + "global_message_sequence": global_message_sequence, + "received_at_utc": received_at_utc, + "websocket": {"url": websocket_url}, + "subscription": subscription, + "tokens_tracked": tokens, + "opcode": opcode, + "payload_length_bytes": len(payload_bytes), + "payload_sha256": sha256_bytes(payload_bytes), + "raw_text": raw_text, + "json": parsed_json, + "json_error": json_error, + "classified_event_types": event_types, + } + return envelope, event_types, parsed_json is not None + + +def write_report(path: Path, manifest: dict[str, Any]) -> None: + lines = [ + "# Checkpoint 10B Raw Websocket And REST Checkpoint Sample", + "", + f"Status: {manifest['gate_status']} ", + f"Created: {manifest['ended_at_utc']} ", + "Production ready: no ", + "Live Kubernetes collector modified: no", + "", + "## Scope", + "", + "Bounded public Polymarket BTC up/down websocket sample only. 
The live REST collector was not changed.", + "", + "## Markets And Tokens", + "", + f"- Markets tracked: `{len(manifest['markets_tracked'])}`.", + f"- Tokens tracked: `{len(manifest['tokens_tracked'])}`.", + f"- Discovery path: `{manifest['discovery']['path']}`.", + "", + "## Websocket Evidence", + "", + f"- URL: `{manifest['websocket']['url']}`.", + f"- Connected sessions: `{manifest['websocket']['connected_session_count']}`.", + f"- Reconnect count: `{manifest['websocket']['reconnect_count']}`.", + f"- Text messages written: `{manifest['websocket']['message_count']}`.", + f"- Parseable JSON messages: `{manifest['websocket']['parsed_json_count']}`.", + f"- Event type counts: `{json.dumps(manifest['websocket']['event_type_counts'], sort_keys=True)}`.", + f"- Opcode counts: `{json.dumps(manifest['websocket']['opcode_counts'], sort_keys=True)}`.", + f"- Maximum seconds between websocket text messages: `{manifest['websocket']['max_seconds_between_text_messages']}`.", + "", + "## REST Checkpoint Evidence", + "", + f"- Endpoint: `{manifest['rest_checkpoints']['url']}`.", + f"- Interval seconds: `{manifest['rest_checkpoints']['interval_seconds']}`.", + f"- Requests: `{manifest['rest_checkpoints']['request_count']}`.", + f"- Successes: `{manifest['rest_checkpoints']['success_count']}`.", + f"- Failures: `{manifest['rest_checkpoints']['failure_count']}`.", + "", + "## Output Files", + "", + ] + for output in manifest["output_files"]: + lines.append( + f"- `{output['path']}`: `{output['kind']}`, rows `{output['rows']}`, bytes `{output['bytes']}`, sha256 `{output['sha256']}`" + ) + lines.extend(["", "## Warnings", ""]) + if manifest["warnings"]: + lines.extend(f"- {warning}" for warning in manifest["warnings"]) + else: + lines.append("- None.") + lines.extend(["", "## Errors", ""]) + if manifest["errors"]: + lines.extend(f"- `{item.get('stage')}`: {item.get('error')}" for item in manifest["errors"]) + else: + lines.append("- None.") + lines.extend( + [ + "", + "## Gate", + 
"", + manifest["gate_status"], + "", + "## Strongest Fake Progress Risk", + "", + "Receiving websocket traffic is not enough by itself; raw payloads must remain preserved and the offline reconstruction must compare against REST checkpoints before this path can inform the live collector design.", + "", + "## Next Smallest Step", + "", + "If this gate is `WS_RAW_SAMPLE_PASS`, run Checkpoint 10C reconstruction from the raw files referenced by this manifest.", + "", + ] + ) + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text("\n".join(lines), encoding="utf-8") + + +def run_sample(runtime: dict[str, Any], command: str) -> dict[str, Any]: + signal.signal(signal.SIGINT, handle_stop) + signal.signal(signal.SIGTERM, handle_stop) + + started = utc_now() + started_at_utc = iso_z(started) + run_id = compact_timestamp(started) + discovery = load_discovery(runtime["discovery_path"]) + markets, rejection_counts = select_markets( + discovery, + market_limit=runtime["market_limit"], + market_end_safety_seconds=runtime["market_end_safety_seconds"], + ) + tokens = flatten_tokens(markets) + token_ids = [token["token_id"] for token in tokens] + subscription = {"assets_ids": token_ids, "type": "market", "custom_feature_enabled": True} + + ws_dir = runtime["output_root"] / "polymarket" / "ws_raw" / run_id + rest_dir = runtime["output_root"] / "polymarket" / "rest_checkpoints" / run_id + ws_dir.mkdir(parents=True, exist_ok=True) + rest_dir.mkdir(parents=True, exist_ok=True) + ws_file = ws_dir / f"polymarket_ws_raw_{run_id}.jsonl.gz" + rest_file = rest_dir / f"polymarket_rest_checkpoints_{run_id}.jsonl.gz" + + warnings: list[str] = [] + errors: list[dict[str, Any]] = [] + sessions: list[dict[str, Any]] = [] + event_type_counts: dict[str, int] = {} + opcode_counts: dict[str, int] = {} + message_count = 0 + parsed_json_count = 0 + rest_request_count = 0 + rest_success_count = 0 + rest_failure_count = 0 + ws_rows = 0 + rest_rows = 0 + connected_session_count = 0 + 
reconnect_count = 0 + last_text_message_monotonic: float | None = None + max_gap_seconds: float | None = None + + if not markets or not tokens: + warnings.append("No usable active BTC markets/tokens were selected from discovery input.") + + deadline = time.monotonic() + runtime["duration_seconds"] + next_checkpoint_monotonic = time.monotonic() + connection_sequence = 0 + global_message_sequence = 0 + + def write_rest_checkpoint(rest_handle: gzip.GzipFile) -> None: + nonlocal rest_request_count, rest_success_count, rest_failure_count, rest_rows, next_checkpoint_monotonic + rest_request_count += 1 + checkpoint = http_post_books( + url=runtime["clob_books_url"], + token_ids=token_ids, + timeout_seconds=runtime["request_timeout_seconds"], + ) + if checkpoint["ok"]: + rest_success_count += 1 + else: + rest_failure_count += 1 + envelope = { + "schema_name": REST_SCHEMA_NAME, + "schema_version": SCHEMA_VERSION, + "collector": {"name": COLLECTOR_NAME, "version": COLLECTOR_VERSION}, + "run_id": run_id, + "checkpoint_sequence": rest_request_count, + "requested_at_utc": checkpoint["requested_at_utc"], + "received_at_utc": checkpoint["received_at_utc"], + "duration_ms": checkpoint["duration_ms"], + "request": {"method": "POST", "url": runtime["clob_books_url"], "token_ids": token_ids, "json_body": checkpoint["request_body"]}, + "response": { + "status_code": checkpoint["status_code"], + "headers": checkpoint["headers"], + "raw_response_json": checkpoint["raw_response_json"], + "json_error": checkpoint["json_error"], + "raw_response_text_sha256": checkpoint["raw_response_text_sha256"], + "raw_response_length_bytes": checkpoint["raw_response_length_bytes"], + "raw_response_text_preview": checkpoint["raw_response_text_preview"], + "error": checkpoint["error"], + }, + "ok": checkpoint["ok"], + "tokens_tracked": tokens, + } + write_jsonl(rest_handle, envelope) + rest_handle.flush() + rest_rows += 1 + next_checkpoint_monotonic += runtime["rest_checkpoint_interval_seconds"] + + with 
gzip.open(ws_file, "wb") as ws_handle, gzip.open(rest_file, "wb") as rest_handle: + if token_ids: + write_rest_checkpoint(rest_handle) + while token_ids and not STOP_REQUESTED and time.monotonic() < deadline: + sock: ssl.SSLSocket | None = None + connection_sequence += 1 + session_id = f"{run_id}-ws{connection_sequence}" + session: dict[str, Any] = { + "session_id": session_id, + "connection_sequence": connection_sequence, + "connected_at_utc": None, + "subscribed_at_utc": None, + "closed_at_utc": None, + "handshake": None, + "message_count": 0, + "close_reason": None, + "error": None, + } + sessions.append(session) + try: + sock, handshake = open_websocket(runtime["websocket_url"], runtime["websocket_timeout_seconds"]) + connected_session_count += 1 + session["connected_at_utc"] = iso_z() + session["handshake"] = handshake + send_ws_frame(sock, 0x1, json.dumps(subscription, separators=(",", ":")).encode("utf-8")) + session["subscribed_at_utc"] = iso_z() + message_sequence = 0 + while not STOP_REQUESTED and time.monotonic() < deadline: + while token_ids and time.monotonic() >= next_checkpoint_monotonic and time.monotonic() < deadline: + write_rest_checkpoint(rest_handle) + timeout = min(1.0, max(0.1, deadline - time.monotonic())) + if next_checkpoint_monotonic > time.monotonic(): + timeout = min(timeout, max(0.1, next_checkpoint_monotonic - time.monotonic())) + sock.settimeout(timeout) + try: + opcode, payload = read_ws_frame(sock) + except socket.timeout: + continue + opcode_key = str(opcode) + opcode_counts[opcode_key] = opcode_counts.get(opcode_key, 0) + 1 + if opcode == 0x8: + session["close_reason"] = "close_frame" + session["closed_at_utc"] = iso_z() + break + if opcode == 0x9: + send_ws_frame(sock, 0xA, payload) + continue + if opcode != 0x1: + continue + received_at_utc = iso_z() + now_mono = time.monotonic() + if last_text_message_monotonic is not None: + gap = now_mono - last_text_message_monotonic + max_gap_seconds = gap if max_gap_seconds is None else 
max(max_gap_seconds, gap) + last_text_message_monotonic = now_mono + message_sequence += 1 + global_message_sequence += 1 + message_count += 1 + session["message_count"] += 1 + envelope, event_types, parsed_ok = build_ws_envelope( + run_id=run_id, + session_id=session_id, + connection_sequence=connection_sequence, + message_sequence=message_sequence, + global_message_sequence=global_message_sequence, + received_at_utc=received_at_utc, + websocket_url=runtime["websocket_url"], + subscription=subscription, + tokens=tokens, + opcode=opcode, + payload_bytes=payload, + ) + if parsed_ok: + parsed_json_count += 1 + for event_type in event_types: + event_type_counts[event_type] = event_type_counts.get(event_type, 0) + 1 + write_jsonl(ws_handle, envelope) + ws_handle.flush() + ws_rows += 1 + if session.get("close_reason") == "close_frame": + reconnect_count += 1 + if reconnect_count > runtime["max_reconnects"]: + warnings.append("Maximum reconnect count reached after websocket close frame.") + break + time.sleep(runtime["reconnect_backoff_seconds"]) + continue + break + except Exception as exc: # noqa: BLE001 - preserve websocket failure evidence + session["error"] = f"{type(exc).__name__}: {exc}" + session["closed_at_utc"] = iso_z() + errors.append({"stage": "websocket", "session_id": session_id, "error": session["error"]}) + reconnect_count += 1 + if reconnect_count > runtime["max_reconnects"]: + break + time.sleep(runtime["reconnect_backoff_seconds"]) + finally: + if sock is not None: + try: + send_ws_frame(sock, 0x8, b"") + except Exception: + pass + try: + sock.close() + except Exception: + pass + session["closed_at_utc"] = session.get("closed_at_utc") or iso_z() + while token_ids and time.monotonic() >= next_checkpoint_monotonic and rest_request_count < 2: + write_rest_checkpoint(rest_handle) + + ended = utc_now() + ended_at_utc = iso_z(ended) + if STOP_REQUESTED: + warnings.append(f"Stop requested by {STOP_SIGNAL}.") + max_gap_seconds_value = None if max_gap_seconds 
is None and message_count <= 1 else round(max_gap_seconds or 0.0, 3) + + output_files = [ + summarize_output_file(ws_file, ws_rows, "raw_websocket_messages"), + summarize_output_file(rest_file, rest_rows, "rest_books_checkpoints"), + ] + + if not markets or not tokens: + gate_status = "BLOCKED_DISCOVERY" + elif connected_session_count == 0: + gate_status = "BLOCKED_WS_CONNECTIVITY" + elif rest_success_count < 2: + gate_status = "BLOCKED_REST_CHECKPOINTS" + elif message_count >= 1 and parsed_json_count >= 1: + gate_status = "WS_RAW_SAMPLE_PASS" + else: + gate_status = "WS_RAW_SAMPLE_NEEDS_REVIEW" + warnings.append("Websocket connected/subscribed but did not produce at least one parseable text message.") + + manifest = { + "schema_name": "checkpoint_010b_ws_raw_sample", + "schema_version": 1, + "checkpoint_id": "10B", + "checkpoint_name": "Raw Websocket And REST Checkpoint Sample", + "gate_status": gate_status, + "production_ready": False, + "live_kubernetes_collector_modified": False, + "collector": {"name": COLLECTOR_NAME, "version": COLLECTOR_VERSION}, + "command": command, + "run_id": run_id, + "started_at_utc": started_at_utc, + "ended_at_utc": ended_at_utc, + "configured_duration_seconds": runtime["duration_seconds"], + "actual_duration_seconds": round((ended - started).total_seconds(), 3), + "config": { + "path": runtime["config_path"].as_posix() if runtime["config_path"] else None, + "sha256": runtime["config_sha256"], + "snapshot": runtime["config_snapshot"], + "effective": { + "discovery_path": runtime["discovery_path"].as_posix(), + "output_root": runtime["output_root"].as_posix(), + "manifest_path": runtime["manifest_path"].as_posix(), + "report_path": runtime["report_path"].as_posix(), + "market_limit": runtime["market_limit"], + "duration_seconds": runtime["duration_seconds"], + "rest_checkpoint_interval_seconds": runtime["rest_checkpoint_interval_seconds"], + "request_timeout_seconds": runtime["request_timeout_seconds"], + "websocket_timeout_seconds": 
runtime["websocket_timeout_seconds"], + "websocket_url": runtime["websocket_url"], + "clob_books_url": runtime["clob_books_url"], + "max_reconnects": runtime["max_reconnects"], + "reconnect_backoff_seconds": runtime["reconnect_backoff_seconds"], + "market_end_safety_seconds": runtime["market_end_safety_seconds"], + }, + }, + "discovery": { + "path": runtime["discovery_path"].as_posix(), + "sha256": sha256_file(runtime["discovery_path"]) if runtime["discovery_path"].exists() else None, + "fetched_at_utc": discovery.get("fetched_at_utc"), + "summary": discovery.get("summary"), + "rejection_counts_before_selection": rejection_counts, + }, + "markets_tracked": [ + {"market_slug": market.get("market_slug"), "condition_id": market.get("condition_id"), "end_time_utc": market.get("end_time_utc")} + for market in markets + ], + "tokens_tracked": tokens, + "websocket": { + "url": runtime["websocket_url"], + "subscription": subscription, + "sessions": sessions, + "connected_session_count": connected_session_count, + "reconnect_count": reconnect_count, + "message_count": message_count, + "parsed_json_count": parsed_json_count, + "event_type_counts": dict(sorted(event_type_counts.items())), + "opcode_counts": dict(sorted(opcode_counts.items())), + "max_seconds_between_text_messages": max_gap_seconds_value, + }, + "rest_checkpoints": { + "url": runtime["clob_books_url"], + "interval_seconds": runtime["rest_checkpoint_interval_seconds"], + "request_count": rest_request_count, + "success_count": rest_success_count, + "failure_count": rest_failure_count, + }, + "output_files": output_files, + "warnings": warnings, + "errors": errors, + "strongest_fake_progress_risk": "Websocket traffic without offline reconstruction and REST comparison is only raw-source evidence, not proof of full-fidelity correctness.", + "next_step": "Run Checkpoint 10C reconstruction from this manifest if gate_status is WS_RAW_SAMPLE_PASS.", + } + runtime["manifest_path"].parent.mkdir(parents=True, 
exist_ok=True) + runtime["manifest_path"].write_text(json.dumps(manifest, indent=2, sort_keys=True) + "\n", encoding="utf-8") + write_report(runtime["report_path"], manifest) + return manifest + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Record a bounded Polymarket BTC websocket sample plus REST checkpoints.") + parser.add_argument("--config", type=Path, default=DEFAULT_CONFIG_PATH) + parser.add_argument("--discovery-path", type=Path, default=None) + parser.add_argument("--output-root", type=Path, default=None) + parser.add_argument("--manifest-path", type=Path, default=None) + parser.add_argument("--report-path", type=Path, default=None) + parser.add_argument("--market-limit", type=int, default=None) + parser.add_argument("--duration-seconds", type=float, default=None) + parser.add_argument("--rest-checkpoint-interval-seconds", type=float, default=None) + parser.add_argument("--request-timeout-seconds", type=float, default=None) + parser.add_argument("--websocket-timeout-seconds", type=float, default=None) + parser.add_argument("--websocket-url", type=str, default=None) + parser.add_argument("--clob-books-url", type=str, default=None) + parser.add_argument("--max-reconnects", type=int, default=None) + parser.add_argument("--reconnect-backoff-seconds", type=float, default=None) + parser.add_argument("--market-end-safety-seconds", type=int, default=None) + return parser.parse_args() + + +def main() -> int: + args = parse_args() + runtime = build_runtime_config(args) + manifest = run_sample(runtime, " ".join(sys.argv)) + print(f"WS_SAMPLE_MANIFEST={runtime['manifest_path']}") + print(f"WS_SAMPLE_REPORT={runtime['report_path']}") + print(f"WS_SAMPLE_GATE={manifest['gate_status']}") + return 0 if manifest["gate_status"] in {"WS_RAW_SAMPLE_PASS", "WS_RAW_SAMPLE_NEEDS_REVIEW"} else 1 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/scripts/run_polymarket_ws_recorder_loop.sh 
#!/usr/bin/env bash
# Launch the long-running Polymarket websocket recorder.
#
# Environment:
#   ORDERBOOKS_APP_DIR              working directory (default /app)
#   ORDERBOOKS_WS_COLLECTOR_CONFIG  path passed as --config
#   ORDERBOOKS_PYTHON               interpreter to exec (default python3)
#   ORDERBOOKS_WS_DURATION_SECONDS  optional, forwarded as --duration-seconds
#   ORDERBOOKS_WS_MARKET_LIMIT      optional, forwarded as --market-limit
set -euo pipefail

app_dir="${ORDERBOOKS_APP_DIR:-/app}"
config_path="${ORDERBOOKS_WS_COLLECTOR_CONFIG:-/etc/orderbooks/polymarket_ws_collector.yaml}"
python_bin="${ORDERBOOKS_PYTHON:-python3}"
duration_override="${ORDERBOOKS_WS_DURATION_SECONDS:-}"
market_limit_override="${ORDERBOOKS_WS_MARKET_LIMIT:-}"

cd "${app_dir}"

# Optional overrides are appended only when the variable is set and non-empty.
recorder_cmd=(scripts/collect_polymarket_ws_orderbooks.py --config "${config_path}")
if [[ -n "${duration_override}" ]]; then
  recorder_cmd+=(--duration-seconds "${duration_override}")
fi
if [[ -n "${market_limit_override}" ]]; then
  recorder_cmd+=(--market-limit "${market_limit_override}")
fi

echo "orderbooks websocket recorder starting at $(date -u +%Y-%m-%dT%H:%M:%SZ)"
echo "config path: ${config_path}"
# exec replaces this shell with the interpreter process.
exec "${python_bin}" "${recorder_cmd[@]}"