commit 284e465588d53875e30a955b6c62db91c5fbee19 Author: philipp Date: Sat Apr 18 11:23:28 2026 +0200 Prepare Kubernetes orderbooks deployment diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..d8e90c2 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,26 @@ +.git/ +.venv/ +__pycache__/ +*.pyc +*.pyo +.pytest_cache/ +.mypy_cache/ +.ruff_cache/ +artifacts/ +data/ +reports/ +orchestration/ +.env +*.env +rclone.conf +**/rclone.conf +*.pem +*.key +*.p12 +*.pfx +id_rsa* +id_ed25519* +*mnemonic* +*wallet* +*credential* +*secret* diff --git a/.forgejo/workflows/deploy.yml b/.forgejo/workflows/deploy.yml new file mode 100644 index 0000000..67006ca --- /dev/null +++ b/.forgejo/workflows/deploy.yml @@ -0,0 +1,162 @@ +name: deploy + +on: + push: + branches: + - main + workflow_dispatch: + +jobs: + deploy: + runs-on: linux-amd64 + env: + IMAGE_TAG: ${{ github.sha }} + REGISTRY_HOST: ${{ vars.REGISTRY_HOST }} + PROJECT_NAME: ${{ vars.PROJECT_NAME || 'orderbooks' }} + PROJECT_NAMESPACE: ${{ vars.PROJECT_NAMESPACE || 'orderbooks' }} + PROJECT_DEPLOYMENTS: ${{ vars.PROJECT_DEPLOYMENTS || 'orderbooks-collector' }} + PROJECT_REGISTRY_SECRET_NAME: ${{ vars.PROJECT_REGISTRY_SECRET_NAME || 'orderbooks-registry-creds' }} + REPO_CLONE_URL: ${{ github.server_url }}/${{ github.repository }}.git + steps: + - name: Install tooling + run: | + if command -v git >/dev/null 2>&1 && command -v kubectl >/dev/null 2>&1 && command -v python3 >/dev/null 2>&1; then + exit 0 + fi + + if command -v apk >/dev/null 2>&1; then + apk add --no-cache git kubectl python3 + exit 0 + fi + + if command -v apt-get >/dev/null 2>&1; then + apt-get update + apt-get install -y git curl ca-certificates python3 + curl -fsSLo /usr/local/bin/kubectl "https://dl.k8s.io/release/$(curl -fsSL https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl" + chmod +x /usr/local/bin/kubectl + exit 0 + fi + + echo "missing git/kubectl/python3 and no supported package manager found" >&2 + exit 1 + + - name: 
Prepare workspace + run: | + workspace_root="${RUNNER_TEMP:-/tmp}" + workspace_dir="$(mktemp -d "${workspace_root%/}/orderbooks-deploy-XXXXXX")" + echo "WORKSPACE_DIR=$workspace_dir" >> "$GITHUB_ENV" + echo "runner workspace: $workspace_dir" + + - name: Load kubeconfig + run: | + mkdir -p "$HOME/.kube" + printf '%s' '${{ secrets.KUBECONFIG_B64 }}' | base64 -d > "$HOME/.kube/config" + kubectl get ns + + - name: Checkout repo + env: + REPO_TOKEN: ${{ github.token }} + run: | + git -c credential.username=oauth2 -c http.extraHeader="Authorization: Bearer ${REPO_TOKEN}" clone --depth=1 "${REPO_CLONE_URL}" "$WORKSPACE_DIR" + cd "$WORKSPACE_DIR" + current_sha="$(git rev-parse HEAD)" + if [ "$current_sha" != "$GITHUB_SHA" ]; then + git -c credential.username=oauth2 -c http.extraHeader="Authorization: Bearer ${REPO_TOKEN}" fetch --depth=1 origin "${GITHUB_SHA}" + git checkout --detach "${GITHUB_SHA}" + else + git checkout --detach "$current_sha" + fi + git rev-parse HEAD + + - name: Resolve deployment settings + run: | + if [ -z "${REGISTRY_HOST:-}" ]; then + echo "REGISTRY_HOST repo variable is required" >&2 + exit 1 + fi + IMAGE="$REGISTRY_HOST/$PROJECT_NAME:$IMAGE_TAG" + BUILD_JOB="image-build-$(printf '%s' "$GITHUB_SHA" | cut -c1-12)" + { + echo "IMAGE=$IMAGE" + echo "BUILD_JOB=$BUILD_JOB" + } >> "$GITHUB_ENV" + + - name: Ensure namespace exists + run: | + kubectl apply -f "$WORKSPACE_DIR/deploy/k8s/base/namespace.yaml" + + - name: Build and push image in-cluster + env: + REPO_TOKEN: ${{ github.token }} + run: | + kubectl -n "$PROJECT_NAMESPACE" delete job "$BUILD_JOB" --ignore-not-found=true + cat <- + git -c credential.username=oauth2 -c http.extraHeader="Authorization: Bearer ${REPO_TOKEN}" clone --depth=1 "${REPO_CLONE_URL}" /workspace && + cd /workspace && + git -c credential.username=oauth2 -c http.extraHeader="Authorization: Bearer ${REPO_TOKEN}" fetch --depth=1 origin "${GITHUB_SHA}" && + git checkout --detach "${GITHUB_SHA}" + volumeMounts: + - name: workspace 
+ mountPath: /workspace + containers: + - name: kaniko + image: gcr.io/kaniko-project/executor:v1.23.2-debug + args: + - --context=/workspace + - --dockerfile=/workspace/Dockerfile + - --destination=${IMAGE} + - --cache=false + volumeMounts: + - name: workspace + mountPath: /workspace + - name: registry-creds + mountPath: /kaniko/.docker + EOF + kubectl -n "$PROJECT_NAMESPACE" wait --for=condition=Complete --timeout=20m "job/$BUILD_JOB" + kubectl -n "$PROJECT_NAMESPACE" logs "job/$BUILD_JOB" + + - name: Apply release manifests and wait for rollout + run: | + kubectl kustomize "$WORKSPACE_DIR/deploy/k8s/base" \ + | IMAGE="$IMAGE" python3 -c 'import os, sys; sys.stdout.write(sys.stdin.read().replace("registry.doran.133011.xyz/orderbooks:bootstrap", os.environ["IMAGE"]))' \ + | kubectl apply -f - + + printf '%s' "$PROJECT_DEPLOYMENTS" | tr ',' '\n' \ + | while IFS= read -r deployment; do + [ -n "$deployment" ] || continue + kubectl -n "$PROJECT_NAMESPACE" set image "deployment/$deployment" "*=$IMAGE" + kubectl -n "$PROJECT_NAMESPACE" rollout status "deployment/$deployment" --timeout=300s + done diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..8e8cbb7 --- /dev/null +++ b/.gitignore @@ -0,0 +1,43 @@ +# Local runtime data and evidence stay local +data/ +artifacts/ +reports/ +orchestration/ + +# Python/cache/build noise +__pycache__/ +*.py[cod] +.pytest_cache/ +.mypy_cache/ +.ruff_cache/ +*.egg-info/ +build/ +dist/ + +# Environments and local config +.venv/ +.env +*.env +!.dockerignore +!.gitignore + +# Kubernetes/rclone/secret material +kubeconfig* +*.kubeconfig +rclone.conf +**/rclone.conf +*.pem +*.key +*.p12 +*.pfx +id_rsa* +id_ed25519* +*mnemonic* +*wallet* +*credential* +*secret* + +# Editor/OS noise +.DS_Store +.idea/ +.vscode/ diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 0000000..558c87b --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,91 @@ +# Agent Instructions + +Project: Cross-Market Live Orderbook Archive + +This repository 
exists to preserve live market microstructure data that is usually lost: order books, spreads, liquidity, depth, timestamps, request metadata, and enough raw context to later decide whether a trading idea was observable, fillable, and reproducible at the time. + +The first market is Polymarket. Future markets may include NEAR-related venues and other prediction or crypto markets, but do not build generic multi-market infrastructure before the second market exists. + +## Active Collaboration Model + +This project uses a two-role workflow: + +- `orchestrator`: coordinates checkpoints with the user, keeps scope narrow, records decisions, reviews evidence, states gates, and decides the next smallest step. +- `builder`: works in a separate session to implement the active checkpoint artifacts, run commands, collect evidence, and write manifests/reports. + +The current primary chat session is the `orchestrator`. The orchestrator should not silently become the builder unless the user explicitly asks. The builder should treat `AGENTS.md`, `ROADMAP.md`, `docs/METHODOLOGY.md`, and the active checkpoint report as the durable source of instructions. + +Hand-offs between orchestrator and builder must be written to disk under `orchestration/` or `reports/checkpoints/` when they contain decisions, scope changes, endpoint findings, or validation results. Chat-only instructions are not enough for project-critical state. + +## Non-Negotiable Rules + +1. Preserve raw data first. Raw API and websocket payloads are the source of truth. Derived datasets are secondary and must reference raw files. +2. No trading. Do not add order placement, signing, private-key handling, wallet logic, strategy execution, or bot behavior. +3. No secrets in the repo. Never commit API keys, rclone credentials, wallet material, cookies, or private endpoints. +4. Every checkpoint needs durable evidence on disk: code or docs, config or run instructions, manifest/report, and validation evidence. +5. 
Do not claim success without commands, outputs, files, checksums, or real collected data to support the claim. +6. Do not delete mistakes. If an artifact is wrong, misleading, partial, or deprecated, preserve it and label it with a reason and replacement. +7. Keep the scope narrow. No dashboard, database, ML, strategy, backtest, or generic framework until the roadmap gate allows it. +8. Public data only unless a later checkpoint explicitly documents why authenticated public-data access is required. +9. "Production-ready" is forbidden until the collector has completed a documented 24h soak test with acceptable quality. + +## Expected Workflow + +For each checkpoint: + +1. Define the smallest useful checkpoint. +2. Build only what is needed for that checkpoint. +3. Validate with real commands and, when applicable, real public data. +4. Write a machine-readable manifest and a short markdown note. +5. State PASS, FAIL, or BLOCKED. +6. Identify the strongest fake-progress risk. +7. Recommend the next smallest step. +8. Stop only when a real user or orchestrator decision is needed. + +## Repository Conventions + +- `scripts/`: executable probes, discovery scripts, collectors, normalizers, and upload helpers. +- `config/`: example configuration only. Real secrets and machine-local config stay outside git. +- `docs/`: durable methodology, data contracts, operational runbooks, and endpoint notes. +- `orchestration/prompts/`: prompts and templates used by future agents. +- `data/probes/`: bounded endpoint probe outputs and probe notes. +- `data/discovery/`: market discovery outputs and manifests. +- `data/live_sample/`: short sample collector runs. +- `data/normalized_sample/`: derived sample outputs generated from raw samples. +- `data/manifests/`: machine-readable manifests for probes, collectors, normalization, uploads, and checkpoints. +- `reports/`: human-readable checkpoint, soak test, and incident reports. +- `systemd/`: VPS runtime units when added. 
+ +The initial Polymarket implementation should remain simple scripts until the collector works. Introduce `collectors/<market>/` only when adding a second market or when duplication proves painful. + +## Artifact Status Labels + +Every durable artifact should be treated as one of: + +- `valid`: current and usable. +- `partial`: useful but incomplete. +- `deprecated`: superseded by a newer artifact. +- `invalid`: known to be wrong or misleading. + +When marking an artifact `deprecated` or `invalid`, write a sibling markdown note or manifest entry with: + +- original artifact path +- status +- reason +- replacement path, if any +- labeled_at_utc +- labeled_by + +Do not remove the original artifact unless the user explicitly asks and there is a written reason. + +## Adding New Market Connectors Later + +Before adding a second market, Polymarket must have working discovery, raw order-book collection, Google Drive offload, and a 24h soak test. + +When the gate is met: + +1. Create `collectors/<market>/` for market-specific code. +2. Keep shared code minimal and concrete. +3. Reuse the same raw-first file layout and manifest format. +4. Document endpoint quirks, timestamp semantics, rate limits, and schema differences in `docs/`. +5. Avoid abstract base classes until at least two real collectors expose repeated code that is painful to maintain. 
diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..086f077 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,28 @@ +FROM python:3.12-slim + +ENV PYTHONDONTWRITEBYTECODE=1 \ + PYTHONUNBUFFERED=1 \ + ORDERBOOKS_APP_DIR=/app \ + ORDERBOOKS_DATA_DIR=/var/lib/orderbooks \ + ORDERBOOKS_PYTHON=python3 + +RUN apt-get update \ + && apt-get install -y --no-install-recommends bash ca-certificates rclone \ + && rm -rf /var/lib/apt/lists/* \ + && groupadd --system --gid 10001 orderbooks \ + && useradd --system --uid 10001 --gid 10001 --home-dir /var/lib/orderbooks --shell /usr/sbin/nologin orderbooks + +WORKDIR /app + +COPY AGENTS.md ROADMAP.md ./ +COPY config/ config/ +COPY docs/ docs/ +COPY scripts/ scripts/ + +RUN chmod +x scripts/*.sh \ + && mkdir -p /var/lib/orderbooks/discovery /var/lib/orderbooks/raw_orderbooks /var/lib/orderbooks/manifests \ + && chown -R orderbooks:orderbooks /var/lib/orderbooks /app + +USER 10001:10001 + +CMD ["/bin/bash", "/app/scripts/run_polymarket_collector_loop.sh"] diff --git a/ROADMAP.md b/ROADMAP.md new file mode 100644 index 0000000..04e439e --- /dev/null +++ b/ROADMAP.md @@ -0,0 +1,212 @@ +# Roadmap + +Project: Cross-Market Live Orderbook Archive + +Goal: build a reliable, minimal, always-on archive of live market microstructure data so future research agents can test whether strategies were actually observable, fillable, and reproducible in real time. + +The roadmap is checkpoint-driven. Each checkpoint must leave durable artifacts, validation evidence, and an explicit gate result. + +## Current Status + +- Latest completed checkpoint: Checkpoint 7, Google Drive Offload +- Latest gate: PASS +- Next checkpoint: Checkpoint 8, 24h Soak Test Plan +- Initial market: Polymarket +- Future market work: gated until Polymarket is stable + +## Checkpoint 1: Project Scaffold And Methodology + +Goal: create the minimum repository structure and rules that keep future agents on track. 
+ +Artifacts: + +- `AGENTS.md` +- `ROADMAP.md` +- `docs/METHODOLOGY.md` +- `docs/DATA_CONTRACT.md` +- `docs/OPERATIONS.md` +- `orchestration/prompts/` + +Requirements: + +- Define project goal. +- Define anti-fake-progress rules. +- Define raw-first storage policy. +- Define checkpoint reporting format. +- Define no-trading/no-private-key policy. +- Define how to label deprecated or misleading artifacts instead of deleting them. +- Define how new market connectors should be added later. + +Pass condition: the repo contains durable project rules and the next checkpoint is specific enough to execute. + +## Checkpoint 2: Polymarket Public Data Source Probe + +Goal: determine exactly which public Polymarket endpoints can support live collection. + +Questions: + +- How to discover active Polymarket markets? +- How to filter BTC up/down markets? +- How to resolve conditionId and token IDs? +- How to fetch current order book for one token? +- Is there a batch order-book endpoint? +- Is there a market websocket for order-book updates? +- Is there a trade websocket or recent trades endpoint? +- What rate limits are documented or observed? +- What fields are returned? +- What timestamps exist? + +Artifacts: + +- `scripts/probe_polymarket_public_sources.py` +- `data/probes/polymarket_public_sources_probe_v1.json` +- `data/probes/polymarket_public_sources_probe_v1.md` + +Pass condition: we know the exact endpoint set and can fetch at least one active market metadata record and one current order book. + +## Checkpoint 3: Minimal BTC Market Discovery + +Goal: build a small script that finds active BTC up/down Polymarket markets and resolves both outcome token IDs. + +Artifacts: + +- `scripts/discover_polymarket_btc_markets.py` +- `data/discovery/polymarket_btc_markets_latest.json` +- `data/discovery/polymarket_btc_markets_manifest.json` +- `data/discovery/polymarket_btc_markets.md` + +Requirements: + +- Public endpoints only. +- No trading. 
+- No API keys unless strictly needed for public data. +- Never store secrets in the repo. +- Preserve raw metadata responses. +- Write normalized market records with slug, question, conditionId, token IDs, outcomes, times, status, source, and `fetched_at_utc`. + +Pass condition: the script reliably outputs currently active BTC markets with token IDs. + +## Checkpoint 4: Minimal Orderbook Snapshot Collector + +Goal: collect raw order-book snapshots for active BTC markets at a fixed interval. + +Artifacts: + +- `scripts/collect_polymarket_orderbooks.py` +- `config/polymarket_collector.example.yaml` +- `data/live_sample/...` +- `data/manifests/orderbook_collector_sample_manifest.json` +- `docs/POLYMARKET_COLLECTOR.md` + +Requirements: + +- Collect active BTC markets only. +- Fetch order books for both outcome tokens. +- Store raw API responses as gzip JSONL. +- Add local `collected_at_utc`, collector version, endpoint URL, and request params. +- Rotate files by hour or run. +- Include a manifest with timing, markets, request counts, status codes, rows, output files, and checksums. +- Handle graceful shutdown and rate limits. +- Do not add a database. + +Pass condition: a 5-10 minute sample run creates valid compressed raw snapshots and a manifest. + +## Checkpoint 5: Normalized Snapshot Extract + +Goal: create a derived normalized dataset from raw snapshots while preserving raw files as source of truth. + +Artifacts: + +- `scripts/normalize_polymarket_orderbooks.py` +- `data/normalized_sample/...` +- `data/manifests/orderbook_normalization_sample_manifest.json` +- `docs/ORDERBOOK_SCHEMA.md` + +Pass condition: a sample raw file can be normalized and basic sanity checks pass. + +## Checkpoint 6: VPS Runtime Package + +Goal: make the collector deployable on a small VPS. 
+ +Artifacts: + +- `systemd/polymarket-orderbook-collector.service` +- `config/polymarket_collector.vps.example.yaml` +- `scripts/run_polymarket_collector_cycle.sh` +- `docs/VPS_DEPLOYMENT.md` + +Uploader service and timer units are deferred to Checkpoint 7 with Google Drive +offload. Creating empty uploader units in Checkpoint 6 would be fake progress. + +Pass condition: a user can follow docs on a VPS and run the collector. + +## Checkpoint 7: Google Drive Offload + +Goal: add periodic upload to Google Drive using `rclone`. + +Artifacts: + +- `scripts/upload_archive_rclone.sh` +- `config/rclone.example.md` +- `docs/GOOGLE_DRIVE_OFFLOAD.md` +- sample upload manifest format + +Pass condition: a dry-run and a real small test upload succeed and are documented. + +## Checkpoint 8: 24h Soak Test Plan + +Goal: run the collector for a real 24h period and validate reliability. + +Artifacts: + +- `reports/soak_test_YYYY-MM-DD.md` +- `data/manifests/...` + +Metrics: + +- uptime +- markets tracked +- total snapshots +- missed interval estimate +- API errors +- rate limits +- file sizes +- compression ratio +- Google Drive upload status +- restart behavior +- disk usage +- data quality checks + +Pass condition: a 24h run completes with acceptable data quality and documented issues. + +## Checkpoint 9: Add Second Market Only After Polymarket Is Stable + +Goal: prepare for NEAR or another market only after Polymarket collector reliability is proven. + +Do not start this checkpoint until: + +- Polymarket discovery works. +- Polymarket order-book collection works. +- Google Drive offload works. +- The 24h soak test is complete. + +Architecture principles: + +- Use `collectors/<market>/` only when adding the second market. +- Keep shared code minimal. +- Avoid abstract base classes until duplication is painful. +- Keep raw-first, normalized-second, manifest-always file format consistent across markets. + +## Anti-Fake-Progress Gates + +- No dashboard before 24h data reliability. 
+- No database before the file archive becomes painful. +- No strategy or backtest code in this project. +- No live trading. +- No generic multi-market abstraction before the second market exists. +- No claiming "production-ready" before a 24h soak test. +- No deleting bad artifacts; label them deprecated or invalid and write why. + +## Next Smallest Step + +Checkpoint 8 is next. It should run the collector for a real 24h period and record uptime, missed intervals, API errors, rate limits, Google Drive upload status, restart behavior, disk usage, and data quality checks in a soak test report and manifests. diff --git a/config/polymarket_collector.example.yaml b/config/polymarket_collector.example.yaml new file mode 100644 index 0000000..3446dfc --- /dev/null +++ b/config/polymarket_collector.example.yaml @@ -0,0 +1,20 @@ +# Example config for the bounded Checkpoint 4 Polymarket order-book sample. +# This file contains no secrets. The collector reads only public endpoints. + +discovery_path: data/discovery/polymarket_btc_markets_latest.json +output_dir: data/live_sample +manifest_path: data/manifests/orderbook_collector_sample_manifest.json + +# Keep the default sample deliberately small to avoid unnecessary endpoint load. +market_limit: 2 +interval_seconds: 30 +duration_seconds: 300 + +clob_books_url: https://clob.polymarket.com/books +request_timeout_seconds: 15 +max_retries: 2 +backoff_seconds: 2 + +# Do not start tracking markets too close to their end time. Default covers +# the 5-minute sample duration plus a 2-minute buffer. +market_end_safety_seconds: 420 diff --git a/config/polymarket_collector.vps.example.yaml b/config/polymarket_collector.vps.example.yaml new file mode 100644 index 0000000..09353ce --- /dev/null +++ b/config/polymarket_collector.vps.example.yaml @@ -0,0 +1,17 @@ +# Checkpoint 6 VPS example config for the raw Polymarket order-book collector. +# Copy to /etc/orderbooks/polymarket_collector.vps.yaml on a VPS and edit paths +# if the service uses a different data directory. 
+ +discovery_path: /var/lib/orderbooks/discovery/polymarket_btc_markets_latest.json +output_dir: /var/lib/orderbooks/raw_orderbooks +manifest_path: /var/lib/orderbooks/manifests/polymarket_orderbook_collector_latest.json + +market_limit: 2 +interval_seconds: 30 +duration_seconds: 300 +market_end_safety_seconds: 420 + +clob_books_url: https://clob.polymarket.com/books +request_timeout_seconds: 15 +max_retries: 2 +backoff_seconds: 2 diff --git a/config/rclone.example.md b/config/rclone.example.md new file mode 100644 index 0000000..70bd06e --- /dev/null +++ b/config/rclone.example.md @@ -0,0 +1,76 @@ +# rclone Configuration Example + +Status: valid + +This file documents the expected `rclone` setup for Checkpoint 7. It is not an +`rclone.conf` file and must not be copied into the repository with private auth +material. + +## Remote Name + +The examples use this remote path: + +```text +gdrive:orderbooks/polymarket +``` + +You may choose another remote name or folder. The uploader reads the destination +from: + +```text +ORDERBOOKS_RCLONE_DEST +``` + +For the systemd service, set it in: + +```text +/etc/orderbooks/orderbook-uploader.env +``` + +Example: + +```text +ORDERBOOKS_RCLONE_DEST=gdrive:orderbooks/polymarket +``` + +Do not place private auth files, browser tokens, API keys, wallet material, or +session material in this repository. + +## Configure Google Drive Outside The Repo + +Install `rclone` on the VPS, then configure the remote as the service user or +with a root-managed config path that the service can read: + +```sh +sudo apt-get install -y rclone +sudo -u orderbooks rclone config +sudo -u orderbooks rclone lsd gdrive: +``` + +If the service user uses the default rclone config path, keep that file outside +the repository under the service user's home/config directory. 
+ +## Uploader Environment File + +Create: + +```text +/etc/orderbooks/orderbook-uploader.env +``` + +Minimal example: + +```text +ORDERBOOKS_RCLONE_DEST=gdrive:orderbooks/polymarket +``` + +Optional overrides: + +```text +ORDERBOOKS_UPLOAD_DATA_DIR=/var/lib/orderbooks +ORDERBOOKS_UPLOAD_MIN_AGE_SECONDS=600 +ORDERBOOKS_UPLOAD_RETENTION_DAYS=7 +ORDERBOOKS_RCLONE_BIN=/usr/bin/rclone +``` + +The environment file belongs on the VPS. Do not commit a machine-local version. diff --git a/deploy/k8s/base/configmap.yaml b/deploy/k8s/base/configmap.yaml new file mode 100644 index 0000000..8b33cf3 --- /dev/null +++ b/deploy/k8s/base/configmap.yaml @@ -0,0 +1,25 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: orderbooks-collector-config + namespace: orderbooks + labels: + app.kubernetes.io/name: orderbooks + app.kubernetes.io/part-of: orderbooks + app.kubernetes.io/component: collector + app.kubernetes.io/managed-by: kustomize +data: + polymarket_collector.yaml: | + discovery_path: /var/lib/orderbooks/discovery/polymarket_btc_markets_latest.json + output_dir: /var/lib/orderbooks/raw_orderbooks + manifest_path: /var/lib/orderbooks/manifests/polymarket_orderbook_collector_latest.json + + market_limit: 2 + interval_seconds: 30 + duration_seconds: 300 + market_end_safety_seconds: 420 + + clob_books_url: https://clob.polymarket.com/books + request_timeout_seconds: 15 + max_retries: 2 + backoff_seconds: 2 diff --git a/deploy/k8s/base/cronjob-uploader.yaml b/deploy/k8s/base/cronjob-uploader.yaml new file mode 100644 index 0000000..2a11c4a --- /dev/null +++ b/deploy/k8s/base/cronjob-uploader.yaml @@ -0,0 +1,92 @@ +apiVersion: batch/v1 +kind: CronJob +metadata: + name: orderbooks-uploader + namespace: orderbooks + labels: + app.kubernetes.io/name: orderbooks + app.kubernetes.io/part-of: orderbooks + app.kubernetes.io/component: uploader +spec: + schedule: "*/15 * * * *" + concurrencyPolicy: Forbid + successfulJobsHistoryLimit: 3 + failedJobsHistoryLimit: 3 + jobTemplate: + spec: 
+ backoffLimit: 0 + ttlSecondsAfterFinished: 86400 + template: + metadata: + labels: + app.kubernetes.io/name: orderbooks + app.kubernetes.io/part-of: orderbooks + app.kubernetes.io/component: uploader + spec: + restartPolicy: Never + imagePullSecrets: + - name: orderbooks-registry-creds + securityContext: + runAsNonRoot: true + runAsUser: 10001 + runAsGroup: 10001 + fsGroup: 10001 + fsGroupChangePolicy: OnRootMismatch + containers: + - name: uploader + image: registry.doran.133011.xyz/orderbooks:bootstrap + imagePullPolicy: IfNotPresent + command: + - /bin/bash + - /app/scripts/upload_archive_rclone.sh + - --execute + env: + - name: ORDERBOOKS_DATA_DIR + value: /var/lib/orderbooks + - name: ORDERBOOKS_UPLOAD_DATA_DIR + value: /var/lib/orderbooks + - name: ORDERBOOKS_UPLOAD_RAW_DIR + value: /var/lib/orderbooks/raw_orderbooks + - name: ORDERBOOKS_UPLOAD_SOURCE_MANIFEST_DIR + value: /var/lib/orderbooks/manifests + - name: ORDERBOOKS_UPLOAD_MANIFEST_DIR + value: /var/lib/orderbooks/manifests + - name: ORDERBOOKS_UPLOAD_MIN_AGE_SECONDS + value: "600" + - name: ORDERBOOKS_UPLOAD_RETENTION_DAYS + value: "7" + - name: ORDERBOOKS_RCLONE_BIN + value: /usr/bin/rclone + - name: ORDERBOOKS_RCLONE_DEST + value: gdrive:orderbooks/polymarket + - name: RCLONE_CONFIG + value: /etc/rclone/rclone.conf + volumeMounts: + - name: orderbooks-data + mountPath: /var/lib/orderbooks + - name: rclone-config + mountPath: /etc/rclone/rclone.conf + subPath: rclone.conf + readOnly: true + resources: + requests: + cpu: 50m + memory: 128Mi + limits: + cpu: 500m + memory: 512Mi + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + volumes: + - name: orderbooks-data + persistentVolumeClaim: + claimName: orderbooks-data + - name: rclone-config + secret: + secretName: orderbooks-rclone-config + items: + - key: rclone.conf + path: rclone.conf diff --git a/deploy/k8s/base/deployment-collector.yaml b/deploy/k8s/base/deployment-collector.yaml new file mode 100644 index 
0000000..7878469 --- /dev/null +++ b/deploy/k8s/base/deployment-collector.yaml @@ -0,0 +1,86 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: orderbooks-collector + namespace: orderbooks + labels: + app.kubernetes.io/name: orderbooks + app.kubernetes.io/part-of: orderbooks + app.kubernetes.io/component: collector +spec: + replicas: 1 + strategy: + type: Recreate + selector: + matchLabels: + app.kubernetes.io/name: orderbooks + app.kubernetes.io/component: collector + template: + metadata: + labels: + app.kubernetes.io/name: orderbooks + app.kubernetes.io/part-of: orderbooks + app.kubernetes.io/component: collector + spec: + terminationGracePeriodSeconds: 120 + imagePullSecrets: + - name: orderbooks-registry-creds + securityContext: + runAsNonRoot: true + runAsUser: 10001 + runAsGroup: 10001 + fsGroup: 10001 + fsGroupChangePolicy: OnRootMismatch + containers: + - name: collector + image: registry.doran.133011.xyz/orderbooks:bootstrap + imagePullPolicy: IfNotPresent + command: + - /bin/bash + - /app/scripts/run_polymarket_collector_loop.sh + env: + - name: ORDERBOOKS_APP_DIR + value: /app + - name: ORDERBOOKS_PYTHON + value: python3 + - name: ORDERBOOKS_DATA_DIR + value: /var/lib/orderbooks + - name: ORDERBOOKS_COLLECTOR_CONFIG + value: /etc/orderbooks/polymarket_collector.yaml + - name: ORDERBOOKS_DISCOVERY_DIR + value: /var/lib/orderbooks/discovery + - name: ORDERBOOKS_OUTPUT_DIR + value: /var/lib/orderbooks/raw_orderbooks + - name: ORDERBOOKS_MANIFEST_DIR + value: /var/lib/orderbooks/manifests + - name: ORDERBOOKS_LOOP_SLEEP_SECONDS + value: "15" + volumeMounts: + - name: orderbooks-data + mountPath: /var/lib/orderbooks + - name: collector-config + mountPath: /etc/orderbooks/polymarket_collector.yaml + subPath: polymarket_collector.yaml + readOnly: true + resources: + requests: + cpu: 50m + memory: 128Mi + limits: + cpu: 500m + memory: 512Mi + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + volumes: + - name: 
orderbooks-data + persistentVolumeClaim: + claimName: orderbooks-data + - name: collector-config + configMap: + name: orderbooks-collector-config + items: + - key: polymarket_collector.yaml + path: polymarket_collector.yaml diff --git a/deploy/k8s/base/kustomization.yaml b/deploy/k8s/base/kustomization.yaml new file mode 100644 index 0000000..010b89a --- /dev/null +++ b/deploy/k8s/base/kustomization.yaml @@ -0,0 +1,9 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +namespace: orderbooks +resources: + - namespace.yaml + - configmap.yaml + - pvc.yaml + - deployment-collector.yaml + - cronjob-uploader.yaml diff --git a/deploy/k8s/base/namespace.yaml b/deploy/k8s/base/namespace.yaml new file mode 100644 index 0000000..fbd6526 --- /dev/null +++ b/deploy/k8s/base/namespace.yaml @@ -0,0 +1,7 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: orderbooks + labels: + app.kubernetes.io/name: orderbooks + app.kubernetes.io/part-of: orderbooks diff --git a/deploy/k8s/base/pvc.yaml b/deploy/k8s/base/pvc.yaml new file mode 100644 index 0000000..678b6a5 --- /dev/null +++ b/deploy/k8s/base/pvc.yaml @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: orderbooks-data + namespace: orderbooks + labels: + app.kubernetes.io/name: orderbooks + app.kubernetes.io/part-of: orderbooks +spec: + accessModes: + - ReadWriteOnce + storageClassName: local-path + resources: + requests: + storage: 10Gi diff --git a/docs/DATA_CONTRACT.md b/docs/DATA_CONTRACT.md new file mode 100644 index 0000000..fffe7f1 --- /dev/null +++ b/docs/DATA_CONTRACT.md @@ -0,0 +1,168 @@ +# Data Contract + +The archive is raw-first. Raw market data must be preserved before normalization, aggregation, upload, or analysis. + +## Storage Principles + +- Store the raw response payload exactly as received whenever practical. +- Add collector metadata beside the raw payload, not inside it. +- Use UTC timestamps in ISO 8601 format with a `Z` suffix. 
+- Use gzip JSONL for high-frequency snapshot data. +- Rotate live collection files by hour or run. +- Include checksums in manifests for all closed files. +- Keep normalized files derived and traceable back to raw files. +- Never store secrets, cookies, private keys, wallet material, or authenticated session state. + +## Directory Layout + +Initial expected layout: + +```text +data/ + probes/ + discovery/ + live_sample/ + normalized_sample/ + manifests/ +reports/ + checkpoints/ +``` + +Future sustained collection layout: + +```text +data/ + raw/ + polymarket/ + orderbooks/ + YYYY/ + MM/ + DD/ + HH/ + polymarket_orderbooks_YYYYMMDDTHHMMSSZ.jsonl.gz + normalized/ + polymarket/ + orderbooks/ + YYYY/ + MM/ + DD/ + polymarket_orderbooks_normalized_YYYYMMDD.jsonl.gz + manifests/ +``` + +Do not create a database until compressed file archives are proven painful. + +## Raw Orderbook Snapshot Envelope + +Checkpoint 4 should store one JSON object per line using this envelope or a documented successor: + +```json +{ + "schema_name": "raw_orderbook_snapshot", + "schema_version": 1, + "collector": { + "name": "polymarket_orderbook_collector", + "version": "0.1.0" + }, + "market": { + "market_name": "polymarket", + "market_slug": "example-slug", + "condition_id": "0x...", + "token_id": "123", + "outcome": "Yes" + }, + "collection": { + "collected_at_utc": "2026-04-14T20:53:49Z", + "sequence": 1 + }, + "request": { + "method": "GET", + "url": "https://example.invalid/orderbook", + "params": { + "token_id": "123" + }, + "status_code": 200, + "duration_ms": 123 + }, + "raw": {} +} +``` + +`raw` is the unmodified response payload. If the endpoint returns text or bytes, record encoding and store a lossless representation. 
+ +## Discovery Record Fields + +Checkpoint 3 normalized market records should include: + +- `market_name` +- `market_slug` +- `title` or `question` +- `condition_id` +- `tokens` +- `outcomes` +- `start_time_utc`, if available +- `end_time_utc`, if available +- `active` +- `closed` +- `endpoint_source` +- `fetched_at_utc` +- `raw_ref` + +`tokens` should preserve the mapping between outcome labels and token IDs. + +## Normalized Snapshot Fields + +Checkpoint 5 normalized records should include: + +- `market_name` +- `market_slug` +- `condition_id` +- `token_id` +- `outcome` +- `collected_at_utc` +- `best_bid` +- `best_ask` +- `spread` +- `midpoint` +- `bid_depth_total` +- `ask_depth_total` +- `bid_depth_within_1c` +- `ask_depth_within_1c` +- `bid_depth_within_2c` +- `ask_depth_within_2c` +- `bid_depth_within_5c` +- `ask_depth_within_5c` +- `raw_file` +- `raw_line_number`, when feasible + +Normalized data is invalid if it cannot reference the raw source record. + +## Manifest Requirements + +Collection and transformation manifests should include: + +- manifest schema name and version +- checkpoint or process name +- start and end timestamps +- market names and market IDs tracked +- input files +- output files +- request counts +- success and failure counts +- status-code counts +- row counts +- checksums for closed files +- command used +- config path or config digest +- warnings and known gaps +- gate status + +Checksums should use SHA-256 unless a later report explains why another hash is used. + +## Timestamp Policy + +- `collected_at_utc`: local collector timestamp taken as close as possible to receipt of data. +- `fetched_at_utc`: timestamp for metadata or discovery fetches. +- Endpoint-provided timestamps must be preserved under their original field names in `raw`. +- If endpoint timestamp semantics are unclear, write the ambiguity into the probe report. 
+ diff --git a/docs/GOOGLE_DRIVE_OFFLOAD.md b/docs/GOOGLE_DRIVE_OFFLOAD.md new file mode 100644 index 0000000..a61dbcd --- /dev/null +++ b/docs/GOOGLE_DRIVE_OFFLOAD.md @@ -0,0 +1,294 @@ +# Google Drive Offload + +Status: valid + +This document covers Checkpoint 7: offloading closed raw collector files and +manifests to Google Drive with `rclone`. + +This checkpoint does not prove production readiness or 24/7 reliability. A real +small upload must be run with a configured remote, and the later 24h soak test +must still pass. + +## Scope + +Included: + +- `scripts/upload_archive_rclone.sh` +- `systemd/polymarket-orderbook-uploader.service` +- `systemd/polymarket-orderbook-uploader.timer` +- dry-run mode by default +- real upload only with `--execute` +- rclone verification with `rclone check` +- per-run upload manifests +- optional local cleanup only after successful verification + +Excluded: + +- dashboards +- databases +- strategies or backtests +- trading, signing, order placement, or wallet logic +- hardcoded private auth material + +## Install rclone + +On Ubuntu or Debian: + +```sh +sudo apt-get update +sudo apt-get install -y rclone +``` + +Confirm: + +```sh +rclone version +``` + +## Configure A Google Drive Remote + +Configure the remote outside this repository. For a service-user setup: + +```sh +sudo -u orderbooks rclone config +sudo -u orderbooks rclone lsd gdrive: +``` + +The example remote path is: + +```text +gdrive:orderbooks/polymarket +``` + +Any valid `rclone` destination may be used. The uploader reads it from: + +```text +ORDERBOOKS_RCLONE_DEST +``` + +For systemd, create: + +```text +/etc/orderbooks/orderbook-uploader.env +``` + +Example: + +```text +ORDERBOOKS_RCLONE_DEST=gdrive:orderbooks/polymarket +``` + +Do not commit the machine-local rclone config or any private auth material. 
+ +## What Gets Uploaded + +By default the script targets: + +| Source | Default path | +| --- | --- | +| raw collector files | `/var/lib/orderbooks/raw_orderbooks` | +| collector manifests | `/var/lib/orderbooks/manifests` | + +It does not target normalized sample files by default. + +Files modified within the last 10 minutes are skipped to avoid active collector +files: + +```text +ORDERBOOKS_UPLOAD_MIN_AGE_SECONDS=600 +``` + +The script preserves repository/data-directory relative paths on the remote. For +example: + +```text +/var/lib/orderbooks/raw_orderbooks/polymarket/orderbooks/<run_id>/file.jsonl.gz +``` + +uploads to: + +```text +<dest>/raw_orderbooks/polymarket/orderbooks/<run_id>/file.jsonl.gz +``` + +## Dry Run + +Dry-run is the default. It plans files, stages a temporary copy, invokes +`rclone copy --dry-run`, and writes an upload manifest. + +Example for a VPS: + +```sh +/opt/orderbooks/scripts/upload_archive_rclone.sh \ + --data-dir /var/lib/orderbooks \ + --dest "$ORDERBOOKS_RCLONE_DEST" +``` + +Example against the repository sample data: + +```sh +scripts/upload_archive_rclone.sh \ + --data-dir data \ + --dest gdrive:orderbooks/polymarket/checkpoint7-test \ + --manifest-path data/manifests/upload_archive_real_test_dry_run_manifest.json \ + --min-age-seconds 0 \ + --rclone-bin /usr/bin/rclone +``` + +Dry-run does not prove remote write access. + +## Execute Upload + +Run a real upload only after the remote is configured and the dry-run plan looks +right: + +```sh +/opt/orderbooks/scripts/upload_archive_rclone.sh \ + --execute \ + --data-dir /var/lib/orderbooks \ + --dest "$ORDERBOOKS_RCLONE_DEST" +``` + +The script runs: + +```text +rclone copy --checksum +rclone check --one-way --checksum +``` + +The upload gate is `PASS` only when the copy succeeds and verification succeeds. + +## Retention And Cleanup + +Local files are kept by default, even after upload verification.
+ +Cleanup requires an explicit flag: + +```sh +/opt/orderbooks/scripts/upload_archive_rclone.sh \ + --execute \ + --cleanup-after-verify \ + --retention-days 7 \ + --data-dir /var/lib/orderbooks \ + --dest "$ORDERBOOKS_RCLONE_DEST" +``` + +Cleanup deletes only files that were selected for upload, uploaded, verified, and +older than the retention window. The default retention window is 7 days. + +## Upload Manifest + +Each run writes a manifest such as: + +```text +/var/lib/orderbooks/manifests/upload_archive_YYYYMMDDTHHMMSSZ.json +``` + +The manifest records: + +- planned files +- attempted files +- dry-run files +- uploaded files +- verified files +- skipped open or recent files +- retained local files +- deleted local files +- SHA-256 checksums +- command mode +- start/end time +- rclone copy/check exit codes +- gate status + +For this repository, the sample manifest path is: + +```text +data/manifests/upload_archive_sample_manifest.json +``` + +The verified Checkpoint 7 real-test manifest is: + +```text +data/manifests/upload_archive_real_test_manifest.json +``` + +## systemd Timer + +Install the unit files: + +```sh +sudo install -o root -g root -m 0644 /opt/orderbooks/systemd/polymarket-orderbook-uploader.service /etc/systemd/system/polymarket-orderbook-uploader.service +sudo install -o root -g root -m 0644 /opt/orderbooks/systemd/polymarket-orderbook-uploader.timer /etc/systemd/system/polymarket-orderbook-uploader.timer +sudo systemctl daemon-reload +``` + +Create the environment file: + +```sh +sudo install -o root -g orderbooks -m 0640 /dev/null /etc/orderbooks/orderbook-uploader.env +sudo editor /etc/orderbooks/orderbook-uploader.env +``` + +At minimum, set: + +```text +ORDERBOOKS_RCLONE_DEST=gdrive:orderbooks/polymarket +``` + +Enable the timer: + +```sh +sudo systemctl enable --now polymarket-orderbook-uploader.timer +``` + +Run one upload immediately: + +```sh +sudo systemctl start polymarket-orderbook-uploader.service +``` + +## Logs + +Use the 
systemd journal: + +```sh +sudo systemctl status polymarket-orderbook-uploader.service +sudo journalctl -u polymarket-orderbook-uploader.service -f +sudo systemctl list-timers polymarket-orderbook-uploader.timer +``` + +## Current Checkpoint 7 Result + +Initial local validation was blocked when `rclone` was unavailable. That blocked +manifest remains at: + +```text +data/manifests/upload_archive_sample_manifest.json +``` + +After `rclone` was configured as `/usr/bin/rclone` with remote `gdrive:`, a dry +run and one tiny real upload were run against: + +```text +gdrive:orderbooks/polymarket/checkpoint7-test +``` + +The real upload manifest records `rclone copy` exit code 0 and `rclone check` +exit code 0: + +```text +data/manifests/upload_archive_real_test_manifest.json +``` + +Current gate: + +```text +PASS +``` + +## What Remains Unproven + +- Long-run upload reliability. +- Interaction between hourly uploads and a 24h collector soak test. +- Retention cleanup after verified upload. +- Production readiness. diff --git a/docs/KUBERNETES_DEPLOYMENT.md b/docs/KUBERNETES_DEPLOYMENT.md new file mode 100644 index 0000000..2a0a697 --- /dev/null +++ b/docs/KUBERNETES_DEPLOYMENT.md @@ -0,0 +1,148 @@ +# Kubernetes Deployment + +Status: draft runtime package for Checkpoint 8G + +This document describes the Kubernetes package for the Polymarket raw +order-book collector. It follows the shared Hetzner k3s cluster model from +`../nuri/unrip3`: application code, Dockerfile, manifests, and Forgejo workflow +live in this repository; platform services, the shared registry, and the shared +Forgejo runner remain platform-owned. + +This package does not claim production readiness. Production readiness still +requires a real Kubernetes runtime smoke run with preserved evidence. 
+ +## Cluster Decisions + +- Namespace: `orderbooks` +- Workstation kubeconfig for validation: `../nuri/unrip3/.state/hetzner/kubeconfig.yaml` +- Shared registry and shared Forgejo runner +- Existing rclone Secret: `orderbooks/orderbooks-rclone-config` +- Secret key mounted by the uploader: `rclone.conf` + +Do not commit or print rclone config contents. + +## Runtime Layout + +The collector and uploader share one PVC: + +```text +PVC: orderbooks-data +mount: /var/lib/orderbooks +raw files: /var/lib/orderbooks/raw_orderbooks +manifests: /var/lib/orderbooks/manifests +discovery: /var/lib/orderbooks/discovery +``` + +The collector uses one Deployment with one replica. The container runs +`/app/scripts/run_polymarket_collector_loop.sh`, which repeatedly executes the +existing bounded collector cycle and records loop failure/interruption manifests +instead of relying on Kubernetes crash loops for normal operation. + +The uploader uses one CronJob. It runs the existing rclone uploader in execute +mode, mounts the same PVC, mounts `orderbooks-rclone-config` read-only at +`/etc/rclone/rclone.conf`, sets `RCLONE_CONFIG` to that file, and uploads only +closed/aged files. + + +## Bootstrap This App Repo + +Run the orderbooks-specific bootstrap from this repository: + +```sh +scripts/deploy/bootstrap_orderbooks_k8s.sh +``` + +The bootstrap loads platform defaults and resolved secrets from the local +platform state without printing secret values. It ensures namespace `orderbooks`, +creates or updates `orderbooks-registry-creds`, verifies the existing +`orderbooks-rclone-config` secret has key `rclone.conf`, creates or updates the +Forgejo repo `philipp/orderbooks`, and upserts the required Actions secret and +variables. + +After bootstrap, push a clean source tree to Forgejo `main`. Do not push local +`data/`, `artifacts/`, `reports/`, `orchestration/`, kubeconfigs, rclone config, +`.env`, private keys, or other local evidence/secrets. 
+ +## Image Build And Deploy + +The Forgejo workflow is `.forgejo/workflows/deploy.yml`. It follows the shared +runner pattern: + +1. load `KUBECONFIG_B64` from Forgejo secrets; +2. clone this repo inside the runner; +3. create an in-cluster Kaniko Job; +4. build and push `REGISTRY_HOST/orderbooks:<commit-sha>`; +5. apply `deploy/k8s/base` with the built image; +6. wait for `deployment/orderbooks-collector` rollout. + +Required Forgejo repo secret: + +```text +KUBECONFIG_B64 +``` + +Required Forgejo repo variable: + +```text +REGISTRY_HOST +``` + +Project defaults used by the workflow: + +```text +PROJECT_NAME=orderbooks +PROJECT_NAMESPACE=orderbooks +PROJECT_DEPLOYMENTS=orderbooks-collector +PROJECT_REGISTRY_SECRET_NAME=orderbooks-registry-creds +``` + +The registry pull/build secret `orderbooks-registry-creds` must exist in the +`orderbooks` namespace before the workflow builds and deploys. + +## Pre-Deploy Validation + +From this repository: + +```sh +bash -n scripts/run_polymarket_collector_loop.sh +bash -n scripts/k8s_runtime_smoke_check.sh +kubectl kustomize deploy/k8s/base +KUBECONFIG=../nuri/unrip3/.state/hetzner/kubeconfig.yaml kubectl apply -k deploy/k8s/base --dry-run=server +KUBECONFIG=../nuri/unrip3/.state/hetzner/kubeconfig.yaml kubectl -n orderbooks get secret orderbooks-rclone-config -o go-template='{{if index .data "rclone.conf"}}rclone_secret_key_present{{else}}rclone_secret_key_missing{{end}}{{"\n"}}' +``` + +The last command checks only whether the key exists. It must not print secret +data.
+ +## Runtime Smoke Gate + +After the image is built and the workload is actually deployed, run: + +```sh +KUBECONFIG=../nuri/unrip3/.state/hetzner/kubeconfig.yaml scripts/k8s_runtime_smoke_check.sh --namespace orderbooks --deployment orderbooks-collector --cronjob orderbooks-uploader --raw-dir /var/lib/orderbooks/raw_orderbooks --manifest-dir /var/lib/orderbooks/manifests --wait-seconds 1800 \ + --upload-min-age-seconds 600 +``` + +The smoke gate uses `kubectl`, not systemd. It writes local JSON evidence under +`data/manifests/k8s_runtime_smoke_<timestamp>.json` by default. It verifies: + +- collector pod is running; +- latest collector manifest has `gate_status: PASS`, `rows_written > 0`, and + `failure_count: 0`; +- raw gzip JSONL parses and is under `/var/lib/orderbooks/raw_orderbooks`; +- deleting the collector pod does not corrupt the old raw file checksum or row + count; +- a later post-restart collector cycle writes valid rows; +- an uploader Job created from the CronJob completes; +- the latest upload manifest records a verified rclone upload with at least one + verified file. + +A failed smoke run still writes JSON evidence and exits nonzero. Preserve failed +manifests, raw files, upload manifests, and pod logs for review. + +## Not Included + +- No trading, signing, wallets, private keys, or API keys. +- No dashboard, database, strategy, backtest, or second-market connector. +- No websocket rewrite. +- No rclone config contents in this repository. diff --git a/docs/METHODOLOGY.md b/docs/METHODOLOGY.md new file mode 100644 index 0000000..f342405 --- /dev/null +++ b/docs/METHODOLOGY.md @@ -0,0 +1,104 @@ +# Methodology + +This project uses checkpoint-driven compound engineering. The point is to preserve useful data and operational learning, not to accumulate scaffolding. + +## Checkpoint Cycle + +Every checkpoint follows the same loop: + +1. Define the smallest useful checkpoint. +2. Build only what is required for that checkpoint. +3.
Validate with real commands and real data when applicable. +4. Write durable artifacts: code or docs, config or run instructions, manifest/report, and validation evidence. +5. State `PASS`, `FAIL`, or `BLOCKED`. +6. Identify the strongest fake-progress risk. +7. Recommend the next smallest step. +8. Stop and ask only when a real decision is needed. + +## Gate States + +- `PASS`: the checkpoint pass condition is met and evidence is on disk. +- `FAIL`: the checkpoint was attempted but did not meet its pass condition. +- `BLOCKED`: work cannot continue without a decision, credential, service, or unavailable dependency. +- `PARTIAL`: useful artifacts exist, but the checkpoint should not be treated as passed. + +## Evidence Rules + +- Evidence must be reproducible from files and commands, not just chat. +- If a command was used to validate behavior, record the command and summarize the result in a report or manifest. +- If data was collected, preserve raw data and include checksums. +- If synthetic or sample data is used, label it explicitly. +- If a claim depends on a public endpoint, record the endpoint, request parameters, response fields, status codes, timestamps, and fetch time. +- Do not claim reliability from a short sample run. Reliability requires the roadmap soak test. + +## Machine-Readable Manifest Format + +Checkpoint manifests should be JSON and stored under `data/manifests/`. 
Use this shape unless a later checkpoint documents a better schema: + +```json +{ + "checkpoint_id": 1, + "checkpoint_name": "Project Scaffold And Methodology", + "status": "PASS", + "started_at_utc": "2026-04-14T20:53:49Z", + "ended_at_utc": "2026-04-14T20:53:49Z", + "scope": "Durable project rules and roadmap only; no collector implementation.", + "artifacts": [ + { + "path": "AGENTS.md", + "kind": "project_rules", + "status": "valid" + } + ], + "validation": { + "commands": [ + { + "command": "git status --short", + "result": "completed" + } + ], + "summary": "Required files exist and contain checkpoint rules." + }, + "decisions": [], + "assumptions": [], + "fake_progress_risk": "Most progress is documentation until public Polymarket endpoint behavior is proven.", + "next_step": "Run Checkpoint 2 public source probe." +} +``` + +## Markdown Checkpoint Report Format + +Checkpoint reports should be stored under `reports/checkpoints/` and include: + +- active checkpoint +- scope +- files created or changed +- validation commands and results +- project rules or operational lessons added +- pass/fail/gate +- strongest fake-progress risk +- next smallest step + +## Deprecated Or Misleading Artifacts + +Do not delete mistakes. Preserve the original artifact and label it. + +Preferred labels: + +- Add a manifest entry with `status: "deprecated"` or `status: "invalid"`. +- Add a sibling note named `.deprecated.md` or `.invalid.md` when a human explanation is useful. +- Include why the artifact is wrong, when it was labeled, who labeled it, and what replaces it. + +If an artifact is dangerous because it contains secrets, stop and ask the user. Do not spread or copy the secret into reports. + +## Anti-Fake-Progress Rules + +- No dashboard before 24h data reliability. +- No database before plain compressed files become painful. +- No strategy, backtest, optimizer, or trading bot code. +- No private-key or signing code. 
+- No generic multi-market abstraction before a second market exists. +- No "production-ready" claim before a 24h soak test. +- No endpoint assumptions without probe evidence. +- No normalized dataset that cannot trace back to raw records. + diff --git a/docs/OPERATIONS.md b/docs/OPERATIONS.md new file mode 100644 index 0000000..d392da9 --- /dev/null +++ b/docs/OPERATIONS.md @@ -0,0 +1,93 @@ +# Operations + +This document defines operational rules before the collector exists. It should be updated with exact commands as checkpoints add scripts, services, and upload jobs. + +## Current Operational Status + +- Collector implementation: not started. +- Supported market: none yet; Polymarket is the first planned market. +- Deployment target: small VPS. +- Offload target: Google Drive through `rclone`. +- Reliability status: not production-ready until a documented 24h soak test passes. + +## Safety Rules + +- No trading. +- No order placement. +- No wallet signing. +- No private keys. +- No secrets in git. +- No dashboards, databases, ML, or strategy code before the roadmap gate allows them. 
+ +## Local Runtime Principles + +Future scripts should: + +- accept a configurable data directory +- write logs to a predictable location +- write raw gzip JSONL snapshots +- rotate files by hour or run +- close files cleanly on shutdown +- write manifests after runs +- avoid corrupting closed files on restart +- handle public endpoint errors and rate limits conservatively + +## VPS Deployment Principles + +Checkpoint 6 should document: + +- Python version and virtualenv setup +- package installation +- environment variables +- systemd or Docker Compose runtime +- service user and file permissions +- data directory ownership +- log locations +- restart policy +- disk usage checks +- safe upgrade and rollback steps + +## Google Drive Offload Principles + +Checkpoint 7 should use `rclone` and must: + +- avoid hardcoded credentials +- upload only closed or rotated files +- support dry-run mode +- verify upload success +- preserve local files until upload is verified +- maintain checksums +- keep the last N days locally +- write an upload manifest + +## Incident And Bad-Data Handling + +If data looks wrong: + +1. Preserve the raw files. +2. Stop relying on the affected derived files. +3. Label the artifact `invalid` or `deprecated`. +4. Write a short note explaining the issue and replacement, if any. +5. Keep the learning in docs or reports. + +Examples of bad-data conditions: + +- endpoint returned a schema different from expected +- token/outcome mapping was wrong +- timestamps were misunderstood +- rate limits caused large gaps +- gzip file was not closed cleanly +- upload succeeded but checksum did not match + +## Minimum Reliability Claim + +A short sample run can prove that code writes files. It cannot prove 24/7 reliability. 
+ +The project may only claim production readiness after: + +- discovery works +- raw order-book collection works +- offload works +- 24h soak test completes +- data quality and gap metrics are documented + diff --git a/docs/ORDERBOOK_SCHEMA.md b/docs/ORDERBOOK_SCHEMA.md new file mode 100644 index 0000000..a22c45b --- /dev/null +++ b/docs/ORDERBOOK_SCHEMA.md @@ -0,0 +1,102 @@ +# Orderbook Snapshot Schema + +Status: valid + +This document covers the Checkpoint 5 normalized order-book sample. The raw +gzip JSONL files remain the source of truth. Normalized rows are derived records +for quick inspection and later quality checks. + +## Normalized Snapshot + +Schema name: `normalized_orderbook_snapshot` + +Schema version: `1` + +File format: gzip JSONL, one JSON object per line. + +Sample location: + +```text +data/normalized_sample/polymarket/orderbooks//polymarket_orderbooks_normalized_.jsonl.gz +``` + +Every normalized row must reference exactly one raw gzip JSONL source row: + +- `raw_file`: repository-relative path to the raw gzip JSONL file. +- `raw_line_number`: 1-based line number inside that raw gzip JSONL file. + +Derived data is invalid if either lineage field is missing or points to a +missing raw file. + +## Field Contract + +| Field | Type | Meaning | +| --- | --- | --- | +| `schema_name` | string | Always `normalized_orderbook_snapshot`. | +| `schema_version` | number | Schema version, currently `1`. | +| `market_name` | string | Market source name from the raw envelope. | +| `market_slug` | string | Polymarket market slug from the raw envelope. | +| `condition_id` | string | Polymarket condition ID from the raw envelope. | +| `token_id` | string | Polymarket CLOB token ID from the raw envelope. | +| `outcome` | string | Outcome label associated with `token_id`. | +| `collected_at_utc` | string | Collector timestamp from the raw envelope. | +| `best_bid` | string or null | Maximum bid price, or null when no bids exist. 
| +| `best_ask` | string or null | Minimum ask price, or null when no asks exist. | +| `spread` | string or null | `best_ask - best_bid` when both sides exist. | +| `midpoint` | string or null | `(best_bid + best_ask) / 2` when both sides exist. | +| `bid_depth_total` | string | Sum of all bid sizes. | +| `ask_depth_total` | string | Sum of all ask sizes. | +| `bid_depth_within_1c` | string | Sum of bid sizes priced at least `best_bid - 0.01`. | +| `ask_depth_within_1c` | string | Sum of ask sizes priced at most `best_ask + 0.01`. | +| `bid_depth_within_2c` | string | Sum of bid sizes priced at least `best_bid - 0.02`. | +| `ask_depth_within_2c` | string | Sum of ask sizes priced at most `best_ask + 0.02`. | +| `bid_depth_within_5c` | string | Sum of bid sizes priced at least `best_bid - 0.05`. | +| `ask_depth_within_5c` | string | Sum of ask sizes priced at most `best_ask + 0.05`. | +| `raw_file` | string | Repository-relative raw gzip JSONL path. | +| `raw_line_number` | number | 1-based source line number in `raw_file`. | + +## Numeric Encoding + +Prices and sizes are parsed with Python `Decimal`. Derived numeric values are +emitted as exact decimal strings rather than JSON numbers. This keeps precision +visible and avoids binary floating-point rounding. + +Missing price-derived values are emitted as `null`. Depth totals and depth bands +are emitted as decimal strings and use `"0"` when the relevant side is empty. + +## Calculation Rules + +- `best_bid`: maximum bid price. +- `best_ask`: minimum ask price. +- `spread`: `best_ask - best_bid` when both sides exist. +- `midpoint`: `(best_bid + best_ask) / 2` when both sides exist. +- `bid_depth_total`: sum of all bid sizes. +- `ask_depth_total`: sum of all ask sizes. +- `bid_depth_within_1c`: sum bid sizes with price greater than or equal to + `best_bid - 0.01`. +- `ask_depth_within_1c`: sum ask sizes with price less than or equal to + `best_ask + 0.01`. +- The same band rule is used for `0.02` and `0.05`. 
+ +## Sanity Rules + +A normalized file should pass these checks: + +- Output row count equals raw input row count unless skipped rows are recorded. +- Every row has `raw_file` and `raw_line_number`. +- Every referenced raw file exists. +- `spread` is non-negative whenever both sides exist. +- `midpoint` is between `best_bid` and `best_ask` whenever both sides exist. +- Depth totals and band depths are non-negative. +- At least one `Up` row and one `Down` row exist in the sample. +- The gzip JSONL file decompresses and every line parses as JSON. +- The manifest checksum matches the normalized output file. + +## Current Known Gaps + +- This schema covers a derived sample extract only. +- It does not define sustained daily normalized partitions. +- It does not include upload, daemon runtime, dashboards, databases, strategy + code, backtests, trading behavior, or wallet behavior. +- Long-run schema stability still depends on future collection and soak-test + evidence. diff --git a/docs/POLYMARKET_COLLECTOR.md b/docs/POLYMARKET_COLLECTOR.md new file mode 100644 index 0000000..e39c750 --- /dev/null +++ b/docs/POLYMARKET_COLLECTOR.md @@ -0,0 +1,149 @@ +# Polymarket Collector + +Artifact status: `valid` + +## Scope + +This document covers the Checkpoint 4 bounded raw order-book sample collector. + +It does not describe a production service. It does not include normalization, upload, systemd, dashboards, databases, strategies, trading, wallet logic, private keys, API keys, or private endpoints. + +## Inputs + +The collector reads active BTC markets from: + +```text +data/discovery/polymarket_btc_markets_latest.json +``` + +Checkpoint 3 writes normalized market records with `condition_id` and `tokens` preserving the `Up` and `Down` outcome-token mapping. The collector uses only those records and does not perform market discovery itself. 
+ +If the discovery file is stale or contains no usable active markets, run: + +```sh +python3 scripts/discover_polymarket_btc_markets.py +``` + +## Endpoint + +The sample uses the public CLOB batch order-book endpoint: + +```text +POST https://clob.polymarket.com/books +``` + +Request body shape: + +```json +[ + {"token_id": ""}, + {"token_id": ""} +] +``` + +No authentication is used. + +## Running A Bounded Sample + +Default sample command: + +```sh +python3 scripts/collect_polymarket_orderbooks.py +``` + +The default config is: + +```text +config/polymarket_collector.example.yaml +``` + +The example config is deliberately small: + +- `market_limit: 2` +- `interval_seconds: 30` +- `duration_seconds: 300` +- `market_end_safety_seconds: 420` + +This produces a 5-minute sample for at most 2 markets, fetching both `Up` and `Down` outcome tokens by batch request. + +## Outputs + +Raw gzip JSONL snapshots are written under: + +```text +data/live_sample/polymarket/orderbooks// +``` + +The sample manifest is written to: + +```text +data/manifests/orderbook_collector_sample_manifest.json +``` + +Files rotate by run for this checkpoint. Hourly rotation is intentionally left for a later sustained runtime checkpoint. 
+ +## Raw JSONL Envelope + +Each gzip JSONL line is a raw-first envelope: + +```json +{ + "schema_name": "raw_orderbook_snapshot", + "schema_version": 1, + "collector": { + "name": "polymarket_orderbook_collector", + "version": "0.1.0" + }, + "market": { + "market_name": "polymarket", + "market_slug": "example", + "condition_id": "0x...", + "token_id": "123", + "outcome": "Up", + "market_end_time_utc": "2026-04-14T22:00:00Z" + }, + "collection": { + "collected_at_utc": "2026-04-14T21:00:00Z", + "sequence": 1, + "response_index": 0 + }, + "request": { + "method": "POST", + "url": "https://clob.polymarket.com/books", + "params": null, + "json_body": [{"token_id": "123"}], + "status_code": 200, + "duration_ms": 123, + "attempts": [] + }, + "raw": {} +} +``` + +The `raw` object is the unmodified order-book object returned by CLOB for that token. + +## Rate-Limit Handling + +The sample is conservative: + +- Uses a small market cap by default. +- Uses a fixed interval between batch requests. +- Applies request timeout. +- Retries `429` and `5xx` responses with exponential backoff. +- Does not use concurrent requests. + +## Shutdown + +`SIGINT` and `SIGTERM` set a stop flag. The current request, if any, finishes or times out, the gzip file closes, and the manifest is written with a shutdown warning. + +## Known Gaps + +- This is a short run-rotated sample, not a daemon. +- It does not prove 24/7 reliability. +- It does not implement hourly rotation. +- It does not refresh discovery during a run. +- It does not normalize snapshots. +- It does not upload files. +- It does not use websockets. + +The project must not claim production readiness until the later 24h soak test passes with documented quality metrics. 
diff --git a/docs/PRODUCTION_DEFINITION_OF_DONE.md b/docs/PRODUCTION_DEFINITION_OF_DONE.md new file mode 100644 index 0000000..583e340 --- /dev/null +++ b/docs/PRODUCTION_DEFINITION_OF_DONE.md @@ -0,0 +1,54 @@ +# Production Definition Of Done + +Status: ACTIVE + +Defined at UTC: 2026-04-17T09:12:02Z + +This project is done for the first production milestone only when it is reliably +collecting Polymarket BTC order-book data on a small VPS with evidence on disk. +Packaging, docs, local samples, and local soak tests are useful evidence, but +not the finish line. + +## Done Means + +1. The collector runs on the VPS under systemd using `/opt/orderbooks` for code + and `/var/lib/orderbooks` for data. +2. Raw gzip JSONL order-book snapshots are written for active BTC up/down + markets, with manifests beside them. +3. The service survives a forced restart: after restart, a later collection + cycle writes valid raw rows without corrupting prior files. +4. Temporary network/API failure is handled as an operational failure, not data + loss: failures are visible in logs/manifests, and the next successful cycle + resumes writing new files. +5. Google Drive upload runs from the VPS through `rclone`, verifies success, and + leaves local files in place until upload is confirmed. +6. A final production report and machine-readable manifest record exact commands, + timestamps, files, checksums, restart result, upload result, and remaining + risks. + +## Not Required For This Milestone + +- No second market. +- No dashboard. +- No database. +- No strategy or backtest code. +- No websocket rewrite unless polling proves insufficient. +- No generic multi-market abstraction. + +## Maximum Remaining Builder Turns + +The remaining work is capped at three builder turns: + +1. Accept deploy bundle and prepare the minimal VPS reliability gate. +2. Execute or guide the VPS cutover and collect runtime evidence. +3. 
Fix only blocking production issues found by the VPS gate, then write the + final pass/fail report. + +If actual VPS access is unavailable, the gate must be `BLOCKED_NEEDS_VPS_ACCESS`, +not production ready. + +## Current Evidence + +- Deploy bundle gate: `DEPLOY_BUNDLE_READY`. +- Local 24h soak final manifest exists but remains `NEEDS_REVIEW`. +- Production readiness remains false until VPS runtime evidence exists. diff --git a/docs/VPS_CUTOVER_RUNBOOK.md b/docs/VPS_CUTOVER_RUNBOOK.md new file mode 100644 index 0000000..59d66c8 --- /dev/null +++ b/docs/VPS_CUTOVER_RUNBOOK.md @@ -0,0 +1,341 @@ +# VPS Cutover Runbook + +Status: valid + +Checkpoint 8 status is `WAIVED_BY_USER`, not `PASS`. This runbook prepares a +VPS cutover for the existing Polymarket raw order-book collector only. It does +not claim production readiness, second-market support, dashboards, databases, +strategies, or trading. + +## Scope + +Included: + +- VPS prerequisite checks. +- Repository copy/update steps. +- Public Polymarket collector service install. +- Google Drive offload timer install with rclone. +- Liveness, cycle health, and upload verification commands. +- Rollback and stop commands. + +Excluded: + +- Private API access. +- Wallets, keys, mnemonics, signing, order placement, or trading. +- Database, dashboard, strategy, or second-market work. + +## Recommended VPS Layout + +Use the existing package paths unless the VPS has a reason to differ: + +```text +repository: /opt/orderbooks +python virtualenv: /opt/orderbooks/.venv +config: /etc/orderbooks/polymarket_collector.vps.yaml +collector env: /etc/orderbooks/polymarket-orderbook-collector.env +uploader env: /etc/orderbooks/orderbook-uploader.env +data root: /var/lib/orderbooks +raw files: /var/lib/orderbooks/raw_orderbooks +manifests: /var/lib/orderbooks/manifests +discovery: /var/lib/orderbooks/discovery +``` + +The `orderbooks` system user should own `/var/lib/orderbooks`. 
The repository +under `/opt/orderbooks` can be root-owned and world-readable. + +## VPS Prerequisites + +On Ubuntu or Debian: + +```sh +sudo apt-get update +sudo apt-get install -y git python3 python3-venv rclone +sudo useradd --system --home /var/lib/orderbooks --shell /usr/sbin/nologin orderbooks || true +sudo mkdir -p /opt /etc/orderbooks /var/lib/orderbooks/discovery /var/lib/orderbooks/raw_orderbooks /var/lib/orderbooks/manifests /var/log/orderbooks +sudo chown -R orderbooks:orderbooks /var/lib/orderbooks /var/log/orderbooks +``` + +No API keys, private keys, mnemonics, wallets, or trading credentials are +required by this project. rclone credentials are the only machine-local +credential material expected for Google Drive offload, and they must stay +outside the repository. + +## Copy Or Update The Repository + +First install: + +```sh +cd /opt +sudo git clone <repository-url> orderbooks +``` + +Update an existing checkout: + +```sh +cd /opt/orderbooks +sudo git fetch --all --prune +sudo git pull --ff-only +``` + +Prepare repository permissions and the Python virtualenv: + +```sh +cd /opt/orderbooks +sudo chmod +x scripts/run_polymarket_collector_cycle.sh scripts/upload_archive_rclone.sh scripts/vps_preflight_check.sh scripts/vps_runtime_smoke_check.sh +sudo python3 -m venv .venv +sudo .venv/bin/python -m pip install --upgrade pip +sudo chown -R root:root /opt/orderbooks +sudo chmod -R a+rX /opt/orderbooks +``` + +The current collector scripts use the Python standard library. 
+ +## Configure Public Collector Runtime + +Install the example config, then review it: + +```sh +sudo install -o root -g root -m 0644 /opt/orderbooks/config/polymarket_collector.vps.example.yaml /etc/orderbooks/polymarket_collector.vps.yaml +sudo editor /etc/orderbooks/polymarket_collector.vps.yaml +``` + +Optional collector env overrides: + +```sh +sudo install -o root -g orderbooks -m 0640 /dev/null /etc/orderbooks/polymarket-orderbook-collector.env +sudo editor /etc/orderbooks/polymarket-orderbook-collector.env +``` + +Example values: + +```text +ORDERBOOKS_DATA_DIR=/var/lib/orderbooks +ORDERBOOKS_OUTPUT_DIR=/var/lib/orderbooks/raw_orderbooks +ORDERBOOKS_DISCOVERY_MAX_PAGES=3 +``` + +## Configure Rclone + +Configure rclone as the `orderbooks` user. Do not print or commit +`rclone.conf`. + +```sh +sudo -u orderbooks rclone config +sudo -u orderbooks rclone listremotes +sudo -u orderbooks rclone lsf gdrive: --max-depth 1 +``` + +Create the uploader env file: + +```sh +sudo install -o root -g orderbooks -m 0640 /dev/null /etc/orderbooks/orderbook-uploader.env +sudo editor /etc/orderbooks/orderbook-uploader.env +``` + +Example: + +```text +ORDERBOOKS_RCLONE_DEST=gdrive:orderbooks/polymarket +ORDERBOOKS_RCLONE_BIN=/usr/bin/rclone +ORDERBOOKS_UPLOAD_MIN_AGE_SECONDS=600 +``` + +The uploader verifies uploads with `rclone check`. Dry runs do not prove remote +write access. + +## Run VPS Preflight + +Run the preflight before installing or starting services: + +```sh +cd /opt/orderbooks +sudo -u orderbooks /opt/orderbooks/scripts/vps_preflight_check.sh \ + --app-dir /opt/orderbooks \ + --python-bin /opt/orderbooks/.venv/bin/python \ + --rclone-bin /usr/bin/rclone \ + --rclone-remote gdrive:orderbooks/polymarket \ + --data-dir /var/lib/orderbooks \ + --manifest-dir /var/lib/orderbooks/manifests \ + --log-dir /var/log/orderbooks \ + --min-free-gib 5 +``` + +The preflight does not print rclone configuration. 
It checks repository files, +Python compilation, shell syntax, systemd unit parsing when available, rclone +availability, optional remote readability, target directory writability, disk +space, and the absence of required project secrets. + +## Install Systemd Units + +Install collector and uploader units: + +```sh +sudo install -o root -g root -m 0644 /opt/orderbooks/systemd/polymarket-orderbook-collector.service /etc/systemd/system/polymarket-orderbook-collector.service +sudo install -o root -g root -m 0644 /opt/orderbooks/systemd/polymarket-orderbook-uploader.service /etc/systemd/system/polymarket-orderbook-uploader.service +sudo install -o root -g root -m 0644 /opt/orderbooks/systemd/polymarket-orderbook-uploader.timer /etc/systemd/system/polymarket-orderbook-uploader.timer +sudo systemctl daemon-reload +sudo systemd-analyze verify /etc/systemd/system/polymarket-orderbook-collector.service /etc/systemd/system/polymarket-orderbook-uploader.service /etc/systemd/system/polymarket-orderbook-uploader.timer +``` + +Enable and start: + +```sh +sudo systemctl enable --now polymarket-orderbook-collector.service +sudo systemctl enable --now polymarket-orderbook-uploader.timer +``` + +Run one uploader cycle immediately after the collector has produced closed raw +files: + +```sh +sudo systemctl start polymarket-orderbook-uploader.service +``` + +Run the minimal runtime reliability smoke gate after both units are installed, +rclone is configured, and at least one closed raw file is older than the +uploader minimum age (default: 600 seconds): + +```sh +sudo /opt/orderbooks/scripts/vps_runtime_smoke_check.sh \ + --app-dir /opt/orderbooks \ + --data-dir /var/lib/orderbooks \ + --raw-dir /var/lib/orderbooks/raw_orderbooks \ + --manifest-dir /var/lib/orderbooks/manifests \ + --collector-service polymarket-orderbook-collector.service \ + --uploader-service polymarket-orderbook-uploader.service \ + --wait-seconds 900 +``` + +This command is the minimal production reliability 
gate. It records a JSON +evidence manifest under `/var/lib/orderbooks/manifests/`, verifies a valid +collector cycle, forces one collector service restart, verifies the prior raw +gzip file still parses with the same checksum, waits for a later valid cycle, +starts the uploader, and records upload success or failure evidence. Preserve +failed smoke manifests and journal logs for review. + +## Check Liveness + +Collector service: + +```sh +sudo systemctl status polymarket-orderbook-collector.service +sudo journalctl -u polymarket-orderbook-collector.service --since "30 minutes ago" +``` + +Uploader timer and service: + +```sh +sudo systemctl list-timers polymarket-orderbook-uploader.timer +sudo systemctl status polymarket-orderbook-uploader.service +sudo journalctl -u polymarket-orderbook-uploader.service --since "2 hours ago" +``` + +Recent artifacts: + +```sh +find /var/lib/orderbooks/raw_orderbooks -type f -name '*.jsonl.gz' -printf '%TY-%Tm-%TdT%TH:%TM:%TS %s %p\n' | sort | tail +find /var/lib/orderbooks/manifests -type f -name '*.json' -printf '%TY-%Tm-%TdT%TH:%TM:%TS %s %p\n' | sort | tail +``` + +## Check Latest Cycle Health + +Inspect the newest collector manifest: + +```sh +latest_collector="$(find /var/lib/orderbooks/manifests -type f -name 'polymarket_orderbook_collector_*.json' | sort | tail -n 1)" +python3 -m json.tool "$latest_collector" | sed -n '1,180p' +``` + +Minimum healthy signs: + +```text +gate_status: PASS +rows_written: greater than 0 +failure_count: 0 +failures: [] +``` + +Verify the latest raw gzip parses and row count matches its manifest: + +```sh +python3 - "$latest_collector" <<'PY' +import gzip +import json +import sys +from pathlib import Path + +manifest = json.loads(Path(sys.argv[1]).read_text()) +for item in manifest.get("output_files", []): + path = Path(item["path"]) + rows = 0 + with gzip.open(path, "rt", encoding="utf-8") as handle: + for line in handle: + if line.strip(): + json.loads(line) + rows += 1 + print({"path": 
str(path), "rows": rows, "manifest_rows": item.get("rows"), "matches": rows == item.get("rows")}) +PY +``` + +## Verify Uploads + +Inspect the newest upload manifest: + +```sh +latest_upload="$(find /var/lib/orderbooks/manifests -type f -name 'upload_archive_*.json' | sort | tail -n 1)" +python3 -m json.tool "$latest_upload" | sed -n '1,220p' +``` + +Minimum healthy signs: + +```text +operation_status: UPLOAD_VERIFIED +gate_status: PASS +rclone.copy_exit_code: 0 +rclone.check_exit_code: 0 +counts.uploaded equals counts.verified +``` + +Manual remote spot-check without printing config: + +```sh +sudo -u orderbooks rclone lsf "$ORDERBOOKS_RCLONE_DEST" --max-depth 2 | head +``` + +## Rollback Or Stop + +Stop uploader timer first: + +```sh +sudo systemctl disable --now polymarket-orderbook-uploader.timer +sudo systemctl stop polymarket-orderbook-uploader.service +``` + +Stop collector: + +```sh +sudo systemctl stop polymarket-orderbook-collector.service +``` + +Disable collector if needed: + +```sh +sudo systemctl disable polymarket-orderbook-collector.service +``` + +Preserve `/var/lib/orderbooks` and `/var/lib/orderbooks/manifests` for evidence. +If an artifact is wrong, label it as invalid or deprecated in a sibling note +rather than deleting it. + +## Still Not Production Proven + +Because the local 24h soak wait was waived by the user, the following remain +unproven: + +- A completed 24h collector run with reviewed final metrics. +- 24h interaction between collector rotation and uploader timer. +- VPS-specific long-run disk, network, rclone, and systemd behavior. +- Retention cleanup behavior under verified upload load. + +Treat this as cutover preparation. The VPS is not deployed until the commands +are run on the VPS and evidence is written. 
diff --git a/docs/VPS_DEPLOYMENT.md b/docs/VPS_DEPLOYMENT.md new file mode 100644 index 0000000..e4ac708 --- /dev/null +++ b/docs/VPS_DEPLOYMENT.md @@ -0,0 +1,298 @@ +# VPS Deployment + +Status: valid + +This document covers the Checkpoint 6 systemd runtime package for the raw +Polymarket order-book collector. + +It does not claim production readiness or 24/7 reliability. That remains gated +on the later 24h soak test. + +## Scope + +Included: + +- systemd service for the raw collector cycle +- Python virtualenv setup +- service user and directory permissions +- configurable data directory +- discovery refresh before each collector cycle +- journal-based logs +- safe restart model for finite collector runs + +Excluded: + +- Google Drive offload +- `rclone` +- uploader scripts, services, or timers +- normalization changes +- dashboards +- databases +- strategies or backtests +- trading, order placement, signing, or wallet logic + +Uploader service and timer units are intentionally deferred to Checkpoint 7. + +## Runtime Model + +The systemd service runs: + +```text +/opt/orderbooks/scripts/run_polymarket_collector_cycle.sh +``` + +Each cycle: + +1. Refreshes BTC market discovery into the configured data directory. +2. Runs `scripts/collect_polymarket_orderbooks.py` once. +3. Writes run-rotated raw gzip JSONL files. +4. Writes a per-cycle collector manifest. +5. Exits after the configured finite duration. + +The unit uses `Restart=always`, so systemd starts the next cycle after the prior +cycle exits or fails. + +The example config uses a 300 second collection cycle. This is deliberately +short because current BTC up/down markets are short-lived and the collector +refreshes discovery only before a cycle starts. Do not increase the cycle beyond +the practical market horizon unless the collector later learns to refresh market +selection during a run. 
+ +## Paths + +Default VPS paths: + +| Purpose | Path | +| --- | --- | +| Application checkout | `/opt/orderbooks` | +| Python virtualenv | `/opt/orderbooks/.venv` | +| Service config | `/etc/orderbooks/polymarket_collector.vps.yaml` | +| Optional env override file | `/etc/orderbooks/polymarket-orderbook-collector.env` | +| Data directory | `/var/lib/orderbooks` | +| Discovery artifacts | `/var/lib/orderbooks/discovery` | +| Raw order-book output base | `/var/lib/orderbooks/raw_orderbooks` | +| Per-cycle manifests | `/var/lib/orderbooks/manifests` | + +Adjust these paths if the repository is installed somewhere other than +`/opt/orderbooks`. + +## Environment Variables + +The service defines safe defaults and can load overrides from: + +```text +/etc/orderbooks/polymarket-orderbook-collector.env +``` + +Supported variables: + +| Variable | Default | Meaning | +| --- | --- | --- | +| `ORDERBOOKS_APP_DIR` | `/opt/orderbooks` | Repository checkout path. | +| `ORDERBOOKS_DATA_DIR` | `/var/lib/orderbooks` | Base directory for data files. | +| `ORDERBOOKS_PYTHON` | `/opt/orderbooks/.venv/bin/python` | Python interpreter. | +| `ORDERBOOKS_COLLECTOR_CONFIG` | `/etc/orderbooks/polymarket_collector.vps.yaml` | Collector config path. | +| `ORDERBOOKS_DISCOVERY_DIR` | `$ORDERBOOKS_DATA_DIR/discovery` | Discovery artifact directory. | +| `ORDERBOOKS_OUTPUT_DIR` | `$ORDERBOOKS_DATA_DIR/raw_orderbooks` | Collector output base directory. | +| `ORDERBOOKS_MANIFEST_DIR` | `$ORDERBOOKS_DATA_DIR/manifests` | Per-cycle manifest directory. | +| `ORDERBOOKS_DISCOVERY_LIMIT` | `100` | Gamma event page limit per discovery page. | +| `ORDERBOOKS_DISCOVERY_MAX_PAGES` | `3` | Discovery page cap per cycle. | +| `ORDERBOOKS_DISCOVERY_TIMEOUT` | `15` | Discovery request timeout in seconds. | + +Example override file: + +```text +ORDERBOOKS_DATA_DIR=/var/lib/orderbooks +ORDERBOOKS_DISCOVERY_MAX_PAGES=3 +``` + +No API keys are required for this checkpoint. 
+ +## Install On Ubuntu Or Debian + +Run package and account setup as root or with `sudo`: + +```sh +sudo apt-get update +sudo apt-get install -y git python3 python3-venv +sudo useradd --system --home /var/lib/orderbooks --shell /usr/sbin/nologin orderbooks +sudo mkdir -p /opt /etc/orderbooks /var/lib/orderbooks/discovery /var/lib/orderbooks/raw_orderbooks /var/lib/orderbooks/manifests +``` + +Install or update the repository under `/opt/orderbooks`. One option is: + +```sh +cd /opt +sudo git clone <repository-url> orderbooks +``` + +If the checkout already exists: + +```sh +cd /opt/orderbooks +sudo git pull --ff-only +``` + +Prepare permissions: + +```sh +sudo chown -R root:root /opt/orderbooks +sudo chmod -R a+rX /opt/orderbooks +sudo chmod +x /opt/orderbooks/scripts/run_polymarket_collector_cycle.sh +sudo chown -R orderbooks:orderbooks /var/lib/orderbooks +``` + +Create the virtualenv: + +```sh +cd /opt/orderbooks +sudo python3 -m venv .venv +sudo .venv/bin/python -m pip install --upgrade pip +sudo chown -R root:root .venv +sudo chmod -R a+rX .venv +``` + +The current Checkpoint 6 scripts use only the Python standard library. + +Install the VPS config and service unit: + +```sh +sudo install -o root -g root -m 0644 /opt/orderbooks/config/polymarket_collector.vps.example.yaml /etc/orderbooks/polymarket_collector.vps.yaml +sudo install -o root -g root -m 0644 /opt/orderbooks/systemd/polymarket-orderbook-collector.service /etc/systemd/system/polymarket-orderbook-collector.service +``` + +Review `/etc/orderbooks/polymarket_collector.vps.yaml` before starting the +service. The example writes under `/var/lib/orderbooks`. 
+ +Enable and start: + +```sh +sudo systemctl daemon-reload +sudo systemctl enable --now polymarket-orderbook-collector.service +``` + +## Logs And Status + +Use the systemd journal: + +```sh +sudo systemctl status polymarket-orderbook-collector.service +sudo journalctl -u polymarket-orderbook-collector.service -f +``` + +Recent logs without following: + +```sh +sudo journalctl -u polymarket-orderbook-collector.service --since "1 hour ago" +``` + +## Output Files + +Raw gzip JSONL files are written under: + +```text +/var/lib/orderbooks/raw_orderbooks/polymarket/orderbooks/<run-id>/ +``` + +Per-cycle manifests are written under: + +```text +/var/lib/orderbooks/manifests/polymarket_orderbook_collector_<run-id>.json +``` + +Discovery artifacts are refreshed under: + +```text +/var/lib/orderbooks/discovery/ +``` + +## Restart And Stop Behavior + +The unit uses: + +```text +Restart=always +RestartSec=30s +TimeoutStopSec=90s +KillSignal=SIGTERM +KillMode=control-group +``` + +The collector handles `SIGTERM` by finishing or timing out the current request, +closing the gzip output, and writing the manifest. Every cycle writes to a new +run directory, so closed files are not reopened by the next cycle. 
+ +Stop the service with: + +```sh +sudo systemctl stop polymarket-orderbook-collector.service +``` + +Start it again with: + +```sh +sudo systemctl start polymarket-orderbook-collector.service +``` + +## Local Validation Without Starting The Service + +These checks do not require root: + +```sh +python3 -m py_compile scripts/discover_polymarket_btc_markets.py scripts/collect_polymarket_orderbooks.py +bash -n scripts/run_polymarket_collector_cycle.sh +python3 - <<'PY' +from pathlib import Path +from scripts.collect_polymarket_orderbooks import load_flat_yaml +cfg = load_flat_yaml(Path('config/polymarket_collector.vps.example.yaml')) +required = { + 'discovery_path', + 'output_dir', + 'manifest_path', + 'market_limit', + 'interval_seconds', + 'duration_seconds', +} +missing = sorted(required - set(cfg)) +assert not missing, missing +assert cfg['duration_seconds'] > 0 +print('config parse ok') +PY +``` + +If systemd tools are available locally: + +```sh +systemd-analyze verify systemd/polymarket-orderbook-collector.service +``` + +The local machine may not have `/opt/orderbooks` or the `orderbooks` service +user. Treat missing VPS path or user messages as deployment-environment warnings, +not collector syntax failures. + +## Safe Upgrade + +Stop the service, update files, rerun validation, then start the service: + +```sh +sudo systemctl stop polymarket-orderbook-collector.service +cd /opt/orderbooks +sudo git pull --ff-only +sudo .venv/bin/python -m py_compile scripts/discover_polymarket_btc_markets.py scripts/collect_polymarket_orderbooks.py +sudo systemctl daemon-reload +sudo systemctl start polymarket-orderbook-collector.service +``` + +Do not remove existing data files during an upgrade. If a bad artifact is found, +preserve it and label it invalid or deprecated with a replacement path when one +exists. + +## Current Limits + +- This package runs the existing raw collector; it does not add a daemon inside + Python. 
+- The systemd loop is a restart model around finite collector cycles. +- It does not upload files. +- It does not prove long-run reliability. +- Production readiness remains blocked until discovery, raw collection, offload, + and a documented 24h soak test all pass. diff --git a/scripts/build_vps_deploy_bundle.sh b/scripts/build_vps_deploy_bundle.sh new file mode 100755 index 0000000..b0a9cab --- /dev/null +++ b/scripts/build_vps_deploy_bundle.sh @@ -0,0 +1,366 @@ +#!/usr/bin/env bash +set -euo pipefail + +APP_DIR="${ORDERBOOKS_APP_DIR:-$(pwd)}" +OUTPUT_DIR="${ORDERBOOKS_VPS_BUNDLE_OUTPUT_DIR:-artifacts/vps}" +TIMESTAMP="${ORDERBOOKS_VPS_BUNDLE_TIMESTAMP:-$(date -u +%Y%m%dT%H%M%SZ)}" +BUNDLE_BASENAME="orderbooks_vps_deploy_${TIMESTAMP}" +TARBALL="${OUTPUT_DIR%/}/${BUNDLE_BASENAME}.tar.gz" +MANIFEST="${OUTPUT_DIR%/}/${BUNDLE_BASENAME}_manifest.json" + +usage() { + cat <<'EOF' +Usage: scripts/build_vps_deploy_bundle.sh [options] + +Build a deployable VPS bundle from the current working tree. The bundle is +intended to be copied to a VPS and unpacked under /opt/orderbooks. + +Options: + --app-dir DIR Source working tree. Default: ORDERBOOKS_APP_DIR or current directory. + --output-dir DIR Bundle output directory. Default: artifacts/vps. + --timestamp TS Override UTC timestamp used in artifact names. + --help Show this help. + +The bundle uses a narrow allowlist and excludes live data, caches, git metadata, +virtualenvs, rclone config, private keys, wallets, mnemonics, and generated +artifacts. It does not print secrets and does not write Python bytecode. 
+EOF +} + +while [[ $# -gt 0 ]]; do + case "$1" in + --app-dir) + APP_DIR="$2" + shift 2 + ;; + --output-dir) + OUTPUT_DIR="$2" + TARBALL="${OUTPUT_DIR%/}/${BUNDLE_BASENAME}.tar.gz" + MANIFEST="${OUTPUT_DIR%/}/${BUNDLE_BASENAME}_manifest.json" + shift 2 + ;; + --timestamp) + TIMESTAMP="$2" + BUNDLE_BASENAME="orderbooks_vps_deploy_${TIMESTAMP}" + TARBALL="${OUTPUT_DIR%/}/${BUNDLE_BASENAME}.tar.gz" + MANIFEST="${OUTPUT_DIR%/}/${BUNDLE_BASENAME}_manifest.json" + shift 2 + ;; + --help) + usage + exit 0 + ;; + *) + echo "Unknown argument: $1" >&2 + usage >&2 + exit 2 + ;; + esac +done + +APP_DIR="${APP_DIR%/}" +if [[ ! -d "${APP_DIR}" ]]; then + echo "Source app directory does not exist: ${APP_DIR}" >&2 + exit 1 +fi + +cd "${APP_DIR}"  # Enter the source tree first: TARBALL and MANIFEST are used relative to it from here on. +mkdir -p "${OUTPUT_DIR}"  # Created after cd so a relative --output-dir exists where the bundle is actually written. + +if [[ -e "${TARBALL}" || -e "${MANIFEST}" ]]; then + echo "Refusing to overwrite existing bundle artifact: ${TARBALL} or ${MANIFEST}" >&2 + exit 1 +fi + +FILELIST="$(mktemp)" +trap 'rm -f "${FILELIST}"' EXIT + +PYTHONDONTWRITEBYTECODE=1 python3 - "${FILELIST}" "${MANIFEST}" "${TARBALL}" "${TIMESTAMP}" <<'PY_BUNDLE_SELECT' +import datetime as dt +import fnmatch +import hashlib +import json +import os +import sys +from pathlib import Path + +filelist_path = Path(sys.argv[1]) +manifest_path = Path(sys.argv[2]) +tarball_path = Path(sys.argv[3]) +timestamp = sys.argv[4] +root = Path.cwd() + +allowed_files = [ + Path("AGENTS.md"), + Path("ROADMAP.md"), +] +allowed_dirs = [ + Path("config"), + Path("docs"), + Path("scripts"), + Path("systemd"), + Path("reports/checkpoints"), +] +allowed_globs = [ + "data/manifests/checkpoint_*.json", +] +excluded_patterns = [ + ".git/", + ".venv/", + "artifacts/", + "data/soak_test/", + "data/live_sample/", + "data/normalized_sample/", + "**/__pycache__/", + "**/*.pyc", + "**/*.pyo", + "**/.pytest_cache/", + "**/.mypy_cache/", + "**/.ruff_cache/", + "**/rclone.conf", + "**/.env", + "**/*.pem", + "**/*.key", + "**/*.p12", + "**/*.pfx", + "**/id_rsa*", + "**/id_ed25519*", + 
"**/*mnemonic*", + "**/*wallet*", + "**/*credential*", + "**/*secret*", +] +required_files = [ + "AGENTS.md", + "ROADMAP.md", + "config/polymarket_collector.vps.example.yaml", + "config/rclone.example.md", + "docs/VPS_CUTOVER_RUNBOOK.md", + "docs/VPS_DEPLOYMENT.md", + "docs/GOOGLE_DRIVE_OFFLOAD.md", + "scripts/build_vps_deploy_bundle.sh", + "scripts/vps_preflight_check.sh", + "scripts/vps_runtime_smoke_check.sh", + "scripts/run_polymarket_collector_cycle.sh", + "scripts/upload_archive_rclone.sh", + "scripts/discover_polymarket_btc_markets.py", + "scripts/collect_polymarket_orderbooks.py", + "scripts/normalize_polymarket_orderbooks.py", + "systemd/polymarket-orderbook-collector.service", + "systemd/polymarket-orderbook-uploader.service", + "systemd/polymarket-orderbook-uploader.timer", +] + +forbidden_path_fragments = [ + "/.git/", + "/.venv/", + "/__pycache__/", + "/data/soak_test/", + "/data/live_sample/", + "/data/normalized_sample/", + "/artifacts/", +] +forbidden_names = { + "rclone.conf", + ".env", + "id_rsa", + "id_ed25519", +} +forbidden_suffixes = { + ".pyc", + ".pyo", + ".pem", + ".key", + ".p12", + ".pfx", +} +secretish_name_tokens = [ + "mnemonic", + "wallet", + "credential", + "secret", +] + +def as_posix(path: Path) -> str: + return path.as_posix() + +def is_forbidden(path: Path) -> tuple[bool, str | None]: + rel = as_posix(path) + wrapped = f"/{rel}/" if path.is_dir() else f"/{rel}" + if path.is_absolute() or ".." 
in path.parts: + return True, "absolute_or_parent_path" + for fragment in forbidden_path_fragments: + if fragment in wrapped: + return True, f"forbidden_fragment:{fragment}" + if any(part in {".git", ".venv", "__pycache__", ".pytest_cache", ".mypy_cache", ".ruff_cache"} for part in path.parts): + return True, "forbidden_cache_or_metadata_dir" + lower_name = path.name.lower() + if lower_name in forbidden_names: + return True, f"forbidden_name:{path.name}" + if path.suffix.lower() in forbidden_suffixes: + return True, f"forbidden_suffix:{path.suffix}" + if any(token in lower_name for token in secretish_name_tokens): + return True, f"secretish_name:{path.name}" + if rel.startswith(("data/soak_test/", "data/live_sample/", "data/normalized_sample/", "artifacts/")): + return True, "forbidden_prefix" + return False, None + +def iter_allowed_files(): + seen = set() + for path in allowed_files: + if path.is_file() and path not in seen: + seen.add(path) + yield path + for directory in allowed_dirs: + if not directory.exists(): + continue + for path in sorted(directory.rglob("*")): + if path.is_file() and path not in seen: + seen.add(path) + yield path + for pattern in allowed_globs: + for path in sorted(root.glob(pattern)): + if path.is_file() and path not in seen: + seen.add(path) + yield path + +def sha256_file(path: Path) -> str: + digest = hashlib.sha256() + with path.open("rb") as handle: + for chunk in iter(lambda: handle.read(1024 * 1024), b""): + digest.update(chunk) + return digest.hexdigest() + +included = [] +excluded = [] +for path in iter_allowed_files(): + forbidden, reason = is_forbidden(path) + if forbidden: + excluded.append({"path": as_posix(path), "reason": reason}) + continue + stat = path.stat() + included.append({ + "path": as_posix(path), + "bytes": stat.st_size, + "sha256": sha256_file(path), + }) + +included_paths = sorted(item["path"] for item in included) +missing_required = sorted(path for path in required_files if path not in included_paths) +if 
missing_required: + raise SystemExit(f"missing required bundle files: {missing_required}") +if not included: + raise SystemExit("bundle file list is empty") + +filelist_path.write_bytes(b"".join(path.encode("utf-8") + b"\0" for path in included_paths)) +created_at = dt.datetime.now(dt.UTC).replace(microsecond=0).isoformat().replace("+00:00", "Z") +manifest = { + "schema_name": "vps_deploy_bundle_manifest", + "schema_version": 1, + "created_at_utc": created_at, + "timestamp": timestamp, + "tarball_path": as_posix(tarball_path), + "manifest_path": as_posix(manifest_path), + "source_root": str(root), + "bundle_intent": "Copy to a VPS and unpack under /opt/orderbooks; VPS execution remains pending.", + "production_ready": False, + "vps_deployed": False, + "included_roots": [str(path) for path in allowed_files + allowed_dirs] + allowed_globs, + "excluded_patterns": excluded_patterns, + "required_files": required_files, + "included_file_count": len(included), + "included_files": included, + "excluded_selected_files": excluded, + "missing_required_files": missing_required, + "validation": { + "required_files_present_before_tar": not missing_required, + "forbidden_paths_absent_before_tar": True, + "tarball_validation_completed": False, + }, +} +manifest_path.write_text(json.dumps(manifest, indent=2, sort_keys=True) + "\n", encoding="utf-8") +PY_BUNDLE_SELECT + +tar --create --gzip --file "${TARBALL}" --null --files-from "${FILELIST}" --owner=0 --group=0 --numeric-owner + +PYTHONDONTWRITEBYTECODE=1 python3 - "${TARBALL}" "${MANIFEST}" <<'PY_BUNDLE_VALIDATE' +import hashlib +import json +import sys +import tarfile +from pathlib import Path + +tarball_path = Path(sys.argv[1]) +manifest_path = Path(sys.argv[2]) +manifest = json.loads(manifest_path.read_text(encoding="utf-8")) +required_files = set(manifest["required_files"]) + +def sha256_file(path: Path) -> str: + digest = hashlib.sha256() + with path.open("rb") as handle: + for chunk in iter(lambda: handle.read(1024 * 1024), 
b""): + digest.update(chunk) + return digest.hexdigest() + +def forbidden_reason(name: str) -> str | None: + parts = name.split("/") + lower_name = parts[-1].lower() + if name.startswith("/") or any(part == ".." for part in parts): + return "absolute_or_parent_path" + if parts[0] in {".git", ".venv", "artifacts"}: + return f"forbidden_top_level:{parts[0]}" + if len(parts) >= 2 and parts[0] == "data" and parts[1] in {"soak_test", "live_sample", "normalized_sample"}: + return f"forbidden_data_dir:data/{parts[1]}" + if any(part in {".git", ".venv", "__pycache__", ".pytest_cache", ".mypy_cache", ".ruff_cache"} for part in parts): + return "forbidden_cache_or_metadata_dir" + if lower_name in {"rclone.conf", ".env", "id_rsa", "id_ed25519"}: + return f"forbidden_name:{lower_name}" + if any(lower_name.endswith(suffix) for suffix in (".pyc", ".pyo", ".pem", ".key", ".p12", ".pfx")): + return "forbidden_suffix" + if any(token in lower_name for token in ("mnemonic", "wallet", "credential", "secret")): + return "secretish_name" + return None + +with tarfile.open(tarball_path, "r:gz") as archive: + members = [member for member in archive.getmembers() if member.isfile()] + names = sorted(member.name for member in members) + +forbidden = [{"path": name, "reason": forbidden_reason(name)} for name in names if forbidden_reason(name)] +missing_required = sorted(required_files - set(names)) +if forbidden or missing_required: + manifest["validation"].update({ + "tarball_validation_completed": True, + "forbidden_paths_absent_in_tarball": not forbidden, + "required_files_present_in_tarball": not missing_required, + "forbidden_paths_in_tarball": forbidden, + "missing_required_files_in_tarball": missing_required, + }) + manifest_path.write_text(json.dumps(manifest, indent=2, sort_keys=True) + "\n", encoding="utf-8") + raise SystemExit(f"bundle validation failed forbidden={forbidden} missing_required={missing_required}") + +manifest["tarball_bytes"] = tarball_path.stat().st_size 
+manifest["tarball_sha256"] = sha256_file(tarball_path) +manifest["tarball_content_count"] = len(names) +manifest["tarball_contents"] = names +manifest["validation"].update({ + "tarball_validation_completed": True, + "forbidden_paths_absent_in_tarball": True, + "required_files_present_in_tarball": True, + "forbidden_paths_in_tarball": [], + "missing_required_files_in_tarball": [], +}) +manifest["gate_status"] = "PASS" +manifest_path.write_text(json.dumps(manifest, indent=2, sort_keys=True) + "\n", encoding="utf-8") +PY_BUNDLE_VALIDATE + +printf 'BUNDLE_TARBALL=%s\n' "${TARBALL}" +printf 'BUNDLE_MANIFEST=%s\n' "${MANIFEST}" +python3 - <<'PY_PRINT' "${MANIFEST}" +import json +import sys +from pathlib import Path +m = json.loads(Path(sys.argv[1]).read_text(encoding="utf-8")) +print(f"BUNDLE_SHA256={m['tarball_sha256']}") +print(f"BUNDLE_BYTES={m['tarball_bytes']}") +print(f"BUNDLE_FILE_COUNT={m['tarball_content_count']}") +PY_PRINT diff --git a/scripts/collect_polymarket_orderbooks.py b/scripts/collect_polymarket_orderbooks.py new file mode 100755 index 0000000..c37727c --- /dev/null +++ b/scripts/collect_polymarket_orderbooks.py @@ -0,0 +1,668 @@ +#!/usr/bin/env python3 +"""Minimal raw Polymarket order-book snapshot sample collector. + +Checkpoint 4 scope: finite sample run only. This script reads the BTC discovery +artifact, fetches public CLOB batch order books for a small market set, writes +raw gzip JSONL envelopes, and closes with a manifest. It is not a daemon and it +does not trade. 
+""" + +from __future__ import annotations + +import argparse +import datetime as dt +import gzip +import hashlib +import json +import signal +import sys +import time +import urllib.error +import urllib.request +from pathlib import Path +from typing import Any + + +COLLECTOR_NAME = "polymarket_orderbook_collector" +COLLECTOR_VERSION = "0.1.0" +SCHEMA_NAME = "raw_orderbook_snapshot" +SCHEMA_VERSION = 1 +CLOB_BOOKS_URL = "https://clob.polymarket.com/books" + +DEFAULT_CONFIG_PATH = Path("config/polymarket_collector.example.yaml") +DEFAULT_DISCOVERY_PATH = Path("data/discovery/polymarket_btc_markets_latest.json") +DEFAULT_OUTPUT_DIR = Path("data/live_sample") +DEFAULT_MANIFEST_PATH = Path("data/manifests/orderbook_collector_sample_manifest.json") + +SAFE_RESPONSE_HEADERS = { + "cache-control", + "cf-cache-status", + "cf-ray", + "content-length", + "content-type", + "date", + "retry-after", + "server", + "x-ratelimit-limit", + "x-ratelimit-remaining", + "x-ratelimit-reset", + "ratelimit-limit", + "ratelimit-remaining", + "ratelimit-reset", +} + +STOP_REQUESTED = False +STOP_SIGNAL: str | None = None + + +def handle_stop(signum: int, _frame: Any) -> None: + global STOP_REQUESTED, STOP_SIGNAL + STOP_REQUESTED = True + STOP_SIGNAL = signal.Signals(signum).name + + +def utc_now() -> dt.datetime: + return dt.datetime.now(dt.UTC) + + +def iso_z(value: dt.datetime | None = None) -> str: + value = value or utc_now() + return value.astimezone(dt.UTC).replace(microsecond=0).isoformat().replace("+00:00", "Z") + + +def compact_timestamp(value: dt.datetime | None = None) -> str: + value = value or utc_now() + return value.astimezone(dt.UTC).strftime("%Y%m%dT%H%M%SZ") + + +def parse_iso(value: Any) -> dt.datetime | None: + if not isinstance(value, str) or not value.strip(): + return None + text = value.strip() + if text.endswith("Z"): + text = text[:-1] + "+00:00" + try: + parsed = dt.datetime.fromisoformat(text) + except ValueError: + return None + if parsed.tzinfo is None: + parsed 
= parsed.replace(tzinfo=dt.UTC) + return parsed.astimezone(dt.UTC) + + +def sha256_file(path: Path) -> str: + digest = hashlib.sha256() + with path.open("rb") as handle: + for chunk in iter(lambda: handle.read(1024 * 1024), b""): + digest.update(chunk) + return digest.hexdigest() + + +def parse_scalar(value: str) -> Any: + value = value.strip() + if not value: + return "" + if value[0] in {"'", '"'} and value[-1:] == value[0]: + return value[1:-1] + lower = value.lower() + if lower in {"true", "false"}: + return lower == "true" + if lower in {"null", "none"}: + return None + try: + return int(value) + except ValueError: + pass + try: + return float(value) + except ValueError: + return value + + +def load_flat_yaml(path: Path) -> dict[str, Any]: + """Parse the flat YAML subset used by the example config.""" + config: dict[str, Any] = {} + if not path.exists(): + return config + for line_number, raw_line in enumerate(path.read_text(encoding="utf-8").splitlines(), 1): + line = raw_line.split("#", 1)[0].strip() + if not line: + continue + if ":" not in line: + raise ValueError(f"Unsupported config line {line_number}: {raw_line}") + key, value = line.split(":", 1) + key = key.strip() + if not key: + raise ValueError(f"Missing config key on line {line_number}") + config[key] = parse_scalar(value) + return config + + +def config_digest(path: Path | None) -> str | None: + if path is None or not path.exists(): + return None + return sha256_file(path) + + +def filter_headers(headers: Any) -> dict[str, str]: + safe: dict[str, str] = {} + for key, value in dict(headers).items(): + if key.lower() in SAFE_RESPONSE_HEADERS: + safe[key] = value + return safe + + +def http_post_json( + *, + url: str, + json_body: Any, + timeout_seconds: float, + max_retries: int, + backoff_seconds: float, +) -> dict[str, Any]: + body_bytes = json.dumps(json_body, separators=(",", ":")).encode("utf-8") + attempts: list[dict[str, Any]] = [] + final_json: Any | None = None + final_text_preview: str | 
None = None + final_json_error: str | None = None + final_status_code: int | None = None + final_headers: dict[str, str] = {} + + for attempt_index in range(max_retries + 1): + started_at = iso_z() + started_monotonic = time.monotonic() + status_code: int | None = None + response_headers: dict[str, str] = {} + response_text = "" + error: str | None = None + try: + request = urllib.request.Request( + url, + data=body_bytes, + headers={ + "Accept": "application/json", + "Content-Type": "application/json", + "User-Agent": "orderbooks-checkpoint-4-sample/0.1.0", + }, + method="POST", + ) + with urllib.request.urlopen(request, timeout=timeout_seconds) as response: + status_code = response.status + response_headers = filter_headers(response.headers) + response_text = response.read().decode("utf-8", errors="replace") + except urllib.error.HTTPError as exc: + status_code = exc.code + response_headers = filter_headers(exc.headers) + response_text = exc.read().decode("utf-8", errors="replace") + error = f"HTTPError: {exc}" + except Exception as exc: # noqa: BLE001 - preserve request failure evidence + error = f"{type(exc).__name__}: {exc}" + + duration_ms = round((time.monotonic() - started_monotonic) * 1000, 3) + parsed_json = None + json_error = None + if response_text: + try: + parsed_json = json.loads(response_text) + except json.JSONDecodeError as exc: + json_error = str(exc) + + attempts.append( + { + "attempt": attempt_index + 1, + "started_at_utc": started_at, + "ended_at_utc": iso_z(), + "duration_ms": duration_ms, + "status_code": status_code, + "headers": response_headers, + "error": error, + "json_error": json_error, + } + ) + final_json = parsed_json + final_json_error = json_error + final_text_preview = response_text[:1000] if parsed_json is None else None + final_status_code = status_code + final_headers = response_headers + + retryable = status_code == 429 or (status_code is not None and 500 <= status_code <= 599) + if error is None and status_code is not 
None and 200 <= status_code < 300: + break + if not retryable or attempt_index >= max_retries or STOP_REQUESTED: + break + retry_after = response_headers.get("Retry-After") or response_headers.get("retry-after") + sleep_seconds = backoff_seconds * (2**attempt_index) + if retry_after: + try: + sleep_seconds = max(sleep_seconds, float(retry_after)) + except ValueError: + pass + time.sleep(sleep_seconds) + + return { + "request": { + "method": "POST", + "url": url, + "json_body": json_body, + }, + "response": { + "status_code": final_status_code, + "headers": final_headers, + "json": final_json, + "json_error": final_json_error, + "text_preview": final_text_preview, + }, + "attempts": attempts, + "duration_ms": round(sum(attempt["duration_ms"] for attempt in attempts), 3), + "ok": final_status_code is not None and 200 <= final_status_code < 300 and final_json_error is None, + } + + +def load_discovery(path: Path) -> dict[str, Any]: + return json.loads(path.read_text(encoding="utf-8")) + + +def market_is_usable(market: dict[str, Any], now: dt.datetime, safety_seconds: int) -> tuple[bool, list[str]]: + reasons: list[str] = [] + if market.get("active") is not True: + reasons.append("not_active") + if market.get("closed") is not False: + reasons.append("closed") + if market.get("accepting_orders") is not True: + reasons.append("not_accepting_orders") + if market.get("enable_order_book") is not True: + reasons.append("order_book_not_enabled") + end_time = parse_iso(market.get("end_time_utc")) + if end_time is None: + reasons.append("missing_end_time") + elif end_time <= now + dt.timedelta(seconds=safety_seconds): + reasons.append("too_close_to_end_or_expired") + tokens = market.get("tokens") + if not isinstance(tokens, list) or len(tokens) < 2: + reasons.append("missing_two_tokens") + else: + outcomes = [token.get("outcome") for token in tokens if isinstance(token, dict)] + token_ids = [token.get("token_id") for token in tokens if isinstance(token, dict)] + if outcomes[:2] 
!= ["Up", "Down"] or not all(token_ids[:2]): + reasons.append("bad_up_down_token_mapping") + return not reasons, reasons + + +def select_markets( + discovery: dict[str, Any], + *, + market_limit: int, + market_end_safety_seconds: int, +) -> tuple[list[dict[str, Any]], dict[str, int]]: + now = utc_now() + selected: list[dict[str, Any]] = [] + rejection_counts: dict[str, int] = {} + markets = discovery.get("normalized_markets") or [] + for market in markets: + if not isinstance(market, dict): + rejection_counts["not_object"] = rejection_counts.get("not_object", 0) + 1 + continue + usable, reasons = market_is_usable(market, now, market_end_safety_seconds) + if not usable: + for reason in reasons: + rejection_counts[reason] = rejection_counts.get(reason, 0) + 1 + continue + selected.append(market) + if len(selected) >= market_limit: + break + return selected, dict(sorted(rejection_counts.items())) + + +def flatten_tokens(markets: list[dict[str, Any]]) -> list[dict[str, Any]]: + tokens: list[dict[str, Any]] = [] + for market in markets: + for token in market.get("tokens", [])[:2]: + tokens.append( + { + "market_name": market.get("market_name"), + "market_slug": market.get("market_slug"), + "condition_id": market.get("condition_id"), + "token_id": str(token.get("token_id")), + "outcome": token.get("outcome"), + "market_end_time_utc": market.get("end_time_utc"), + } + ) + return tokens + + +def build_snapshot_envelope( + *, + raw_book: dict[str, Any], + token_meta: dict[str, Any], + collected_at_utc: str, + sequence: int, + request_record: dict[str, Any], + response_index: int, +) -> dict[str, Any]: + return { + "schema_name": SCHEMA_NAME, + "schema_version": SCHEMA_VERSION, + "collector": { + "name": COLLECTOR_NAME, + "version": COLLECTOR_VERSION, + }, + "market": { + "market_name": token_meta.get("market_name"), + "market_slug": token_meta.get("market_slug"), + "condition_id": token_meta.get("condition_id"), + "token_id": token_meta.get("token_id"), + "outcome": 
token_meta.get("outcome"), + "market_end_time_utc": token_meta.get("market_end_time_utc"), + }, + "collection": { + "collected_at_utc": collected_at_utc, + "sequence": sequence, + "response_index": response_index, + }, + "request": { + "method": request_record["request"]["method"], + "url": request_record["request"]["url"], + "params": None, + "json_body": request_record["request"]["json_body"], + "status_code": request_record["response"]["status_code"], + "duration_ms": request_record["duration_ms"], + "attempts": request_record["attempts"], + }, + "raw": raw_book, + } + + +def summarize_output_file(path: Path, rows_written: int) -> dict[str, Any]: + return { + "path": path.as_posix(), + "status": "valid" if path.exists() and path.stat().st_size > 0 else "missing", + "bytes": path.stat().st_size if path.exists() else 0, + "rows": rows_written, + "sha256": sha256_file(path) if path.exists() else None, + } + + +def write_manifest(path: Path, manifest: dict[str, Any]) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(json.dumps(manifest, indent=2, sort_keys=True) + "\n", encoding="utf-8") + + +def config_value(config: dict[str, Any], args: argparse.Namespace, key: str, default: Any) -> Any: + cli_value = getattr(args, key) + if cli_value is not None: + return cli_value + return config.get(key, default) + + +def build_runtime_config(args: argparse.Namespace) -> dict[str, Any]: + config_path = args.config + file_config = load_flat_yaml(config_path) if config_path else {} + runtime = { + "discovery_path": Path(config_value(file_config, args, "discovery_path", DEFAULT_DISCOVERY_PATH)), + "output_dir": Path(config_value(file_config, args, "output_dir", DEFAULT_OUTPUT_DIR)), + "manifest_path": Path(config_value(file_config, args, "manifest_path", DEFAULT_MANIFEST_PATH)), + "market_limit": int(config_value(file_config, args, "market_limit", 2)), + "interval_seconds": float(config_value(file_config, args, "interval_seconds", 30.0)), + 
"duration_seconds": float(config_value(file_config, args, "duration_seconds", 300.0)), + "request_timeout_seconds": float(config_value(file_config, args, "request_timeout_seconds", 15.0)), + "max_retries": int(config_value(file_config, args, "max_retries", 2)), + "backoff_seconds": float(config_value(file_config, args, "backoff_seconds", 2.0)), + "market_end_safety_seconds": int(config_value(file_config, args, "market_end_safety_seconds", 420)), + "clob_books_url": str(config_value(file_config, args, "clob_books_url", CLOB_BOOKS_URL)), + "config_path": config_path, + "config_sha256": config_digest(config_path), + "config_snapshot": file_config, + } + if runtime["market_limit"] < 1: + raise ValueError("market_limit must be >= 1") + if runtime["interval_seconds"] <= 0: + raise ValueError("interval_seconds must be > 0") + if runtime["duration_seconds"] <= 0: + raise ValueError("duration_seconds must be > 0") + return runtime + + +def run_collection(runtime: dict[str, Any], command: str) -> tuple[dict[str, Any], Path]: + signal.signal(signal.SIGINT, handle_stop) + signal.signal(signal.SIGTERM, handle_stop) + + started = utc_now() + started_at_utc = iso_z(started) + discovery_path: Path = runtime["discovery_path"] + discovery = load_discovery(discovery_path) + selected_markets, rejection_counts = select_markets( + discovery, + market_limit=runtime["market_limit"], + market_end_safety_seconds=runtime["market_end_safety_seconds"], + ) + warnings: list[str] = [] + failures: list[dict[str, Any]] = [] + if not selected_markets: + warnings.append("No usable active BTC markets found in discovery input.") + + tokens = flatten_tokens(selected_markets) + run_id = compact_timestamp(started) + output_dir = runtime["output_dir"] / "polymarket" / "orderbooks" / run_id + output_dir.mkdir(parents=True, exist_ok=True) + output_file = output_dir / f"polymarket_orderbooks_{run_id}.jsonl.gz" + + request_count = 0 + success_count = 0 + failure_count = 0 + status_code_counts: dict[str, int] 
= {} + rows_written = 0 + sequence = 0 + token_row_counts = {token["token_id"]: 0 for token in tokens} + + deadline = time.monotonic() + runtime["duration_seconds"] + token_by_id = {token["token_id"]: token for token in tokens} + request_body = [{"token_id": token["token_id"]} for token in tokens] + + with gzip.open(output_file, "wt", encoding="utf-8") as handle: + while tokens and not STOP_REQUESTED and time.monotonic() < deadline: + loop_started = time.monotonic() + collected_at_utc = iso_z() + request_count += 1 + request_record = http_post_json( + url=runtime["clob_books_url"], + json_body=request_body, + timeout_seconds=runtime["request_timeout_seconds"], + max_retries=runtime["max_retries"], + backoff_seconds=runtime["backoff_seconds"], + ) + status_code = request_record["response"]["status_code"] + status_key = str(status_code) + status_code_counts[status_key] = status_code_counts.get(status_key, 0) + 1 + if request_record["ok"] and isinstance(request_record["response"]["json"], list): + success_count += 1 + for response_index, raw_book in enumerate(request_record["response"]["json"]): + if not isinstance(raw_book, dict): + failure_count += 1 + failures.append( + { + "collected_at_utc": collected_at_utc, + "reason": "book_response_item_not_object", + "response_index": response_index, + } + ) + continue + asset_id = str(raw_book.get("asset_id") or "") + token_meta = token_by_id.get(asset_id) + if token_meta is None: + failure_count += 1 + failures.append( + { + "collected_at_utc": collected_at_utc, + "reason": "unknown_asset_id_in_book_response", + "asset_id": asset_id, + } + ) + continue + sequence += 1 + envelope = build_snapshot_envelope( + raw_book=raw_book, + token_meta=token_meta, + collected_at_utc=collected_at_utc, + sequence=sequence, + request_record=request_record, + response_index=response_index, + ) + handle.write(json.dumps(envelope, separators=(",", ":"), sort_keys=True) + "\n") + rows_written += 1 + token_row_counts[asset_id] = 
token_row_counts.get(asset_id, 0) + 1 + handle.flush() + else: + failure_count += 1 + failures.append( + { + "collected_at_utc": collected_at_utc, + "reason": "request_failed_or_non_json_list", + "status_code": status_code, + "attempts": request_record["attempts"], + "json_error": request_record["response"]["json_error"], + "text_preview": request_record["response"]["text_preview"], + } + ) + + remaining_interval = runtime["interval_seconds"] - (time.monotonic() - loop_started) + while remaining_interval > 0 and not STOP_REQUESTED and time.monotonic() < deadline: + sleep_for = min(remaining_interval, deadline - time.monotonic(), 1.0) + if sleep_for <= 0: + break + time.sleep(sleep_for) + remaining_interval = runtime["interval_seconds"] - (time.monotonic() - loop_started) + + ended = utc_now() + ended_at_utc = iso_z(ended) + duration_seconds_actual = round((ended - started).total_seconds(), 3) + if STOP_REQUESTED: + warnings.append(f"Graceful shutdown requested by {STOP_SIGNAL}.") + if runtime["duration_seconds"] < 300: + warnings.append("Configured run duration was shorter than the roadmap 5-minute sample target.") + if not failures and request_count > 0: + failures = [] + output_summary = summarize_output_file(output_file, rows_written) + gate_status = "PASS" if rows_written > 0 and all(count > 0 for count in token_row_counts.values()) else "FAIL" + if not tokens: + gate_status = "BLOCKED" + if request_count == 0: + gate_status = "FAIL" if tokens else "BLOCKED" + manifest = { + "schema_name": "orderbook_collector_sample_manifest", + "schema_version": 1, + "checkpoint_id": 4, + "checkpoint_name": "Minimal Orderbook Snapshot Collector", + "gate_status": gate_status, + "collector": { + "name": COLLECTOR_NAME, + "version": COLLECTOR_VERSION, + }, + "started_at_utc": started_at_utc, + "ended_at_utc": ended_at_utc, + "run_duration_seconds": duration_seconds_actual, + "configured_duration_seconds": runtime["duration_seconds"], + "interval_seconds": 
runtime["interval_seconds"], + "command": command, + "config": { + "path": runtime["config_path"].as_posix() if runtime["config_path"] else None, + "sha256": runtime["config_sha256"], + "snapshot": runtime["config_snapshot"], + "effective": { + "discovery_path": discovery_path.as_posix(), + "output_dir": runtime["output_dir"].as_posix(), + "manifest_path": runtime["manifest_path"].as_posix(), + "market_limit": runtime["market_limit"], + "interval_seconds": runtime["interval_seconds"], + "duration_seconds": runtime["duration_seconds"], + "request_timeout_seconds": runtime["request_timeout_seconds"], + "max_retries": runtime["max_retries"], + "backoff_seconds": runtime["backoff_seconds"], + "market_end_safety_seconds": runtime["market_end_safety_seconds"], + "clob_books_url": runtime["clob_books_url"], + }, + }, + "discovery": { + "path": discovery_path.as_posix(), + "fetched_at_utc": discovery.get("fetched_at_utc"), + "source_summary": discovery.get("summary"), + "rejection_counts_before_selection": rejection_counts, + }, + "markets_tracked": [ + { + "market_name": market.get("market_name"), + "market_slug": market.get("market_slug"), + "condition_id": market.get("condition_id"), + "end_time_utc": market.get("end_time_utc"), + } + for market in selected_markets + ], + "tokens_tracked": tokens, + "request_count": request_count, + "success_count": success_count, + "failure_count": failure_count, + "status_code_counts": dict(sorted(status_code_counts.items())), + "rows_written": rows_written, + "token_row_counts": token_row_counts, + "output_files": [output_summary], + "failures": failures, + "warnings": warnings, + "known_gaps": [ + "This is a short run-rotated sample, not a daemon.", + "Hourly rotation is documented but not implemented in this checkpoint.", + "No websocket capture, normalization, upload, systemd unit, dashboard, database, or trading behavior is included.", + "A 5-minute sample proves file-writing behavior only; it does not prove 24/7 reliability.", + 
], + "fake_progress_risk": "A small successful sample can still hide long-run gaps, stale discovery, endpoint schema drift, and missed intervals. Reliability remains gated on the future 24h soak test.", + "next_step": "Checkpoint 5 should normalize this raw sample while preserving raw file references, or rerun a fresh short sample if the orchestrator wants more raw evidence first.", + } + return manifest, output_file + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Collect a bounded raw gzip JSONL sample of Polymarket BTC order books." + ) + parser.add_argument("--config", type=Path, default=DEFAULT_CONFIG_PATH) + parser.add_argument("--discovery-path", type=Path, default=None) + parser.add_argument("--output-dir", type=Path, default=None) + parser.add_argument("--manifest-path", type=Path, default=None) + parser.add_argument("--market-limit", type=int, default=None) + parser.add_argument("--interval-seconds", type=float, default=None) + parser.add_argument("--duration-seconds", type=float, default=None) + parser.add_argument("--request-timeout-seconds", type=float, default=None) + parser.add_argument("--max-retries", type=int, default=None) + parser.add_argument("--backoff-seconds", type=float, default=None) + parser.add_argument("--market-end-safety-seconds", type=int, default=None) + parser.add_argument("--clob-books-url", type=str, default=None) + return parser.parse_args() + + +def main() -> int: + args = parse_args() + command = " ".join([Path(sys.argv[0]).as_posix(), *sys.argv[1:]]) + runtime = build_runtime_config(args) + manifest, output_file = run_collection(runtime, command) + write_manifest(runtime["manifest_path"], manifest) + print( + json.dumps( + { + "gate_status": manifest["gate_status"], + "manifest_path": runtime["manifest_path"].as_posix(), + "output_file": output_file.as_posix(), + "markets_tracked": manifest["markets_tracked"], + "tokens_tracked": len(manifest["tokens_tracked"]), + 
"request_count": manifest["request_count"], + "success_count": manifest["success_count"], + "failure_count": manifest["failure_count"], + "rows_written": manifest["rows_written"], + "warnings": manifest["warnings"], + }, + indent=2, + sort_keys=True, + ) + ) + return 0 if manifest["gate_status"] == "PASS" else 1 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/scripts/deploy/bootstrap_orderbooks_k8s.sh b/scripts/deploy/bootstrap_orderbooks_k8s.sh new file mode 100755 index 0000000..a807022 --- /dev/null +++ b/scripts/deploy/bootstrap_orderbooks_k8s.sh @@ -0,0 +1,146 @@ +#!/usr/bin/env bash +set -euo pipefail + +ROOT_DIR="$(cd "$(dirname "$0")/../.." && pwd)" +PLATFORM_REPO_DIR="${PLATFORM_REPO_DIR:-/home/philipp/dev/ae/nuri/unrip3}" +PLATFORM_ENV_FILE="${PLATFORM_ENV_FILE:-$PLATFORM_REPO_DIR/scripts/hetzner/bootstrap-secrets.env}" +PLATFORM_RESOLVED_ENV_FILE="${PLATFORM_RESOLVED_ENV_FILE:-$PLATFORM_REPO_DIR/.state/hetzner/bootstrap-secrets.resolved.env}" +KUBECONFIG_PATH="${KUBECONFIG_PATH:-$PLATFORM_REPO_DIR/.state/hetzner/kubeconfig.yaml}" +CI_KUBECONFIG_PATH="${CI_KUBECONFIG_PATH:-$PLATFORM_REPO_DIR/.state/hetzner/kubeconfig.incluster.yaml}" + +PROJECT_NAME="${PROJECT_NAME:-orderbooks}" +PROJECT_NAMESPACE="${PROJECT_NAMESPACE:-orderbooks}" +PROJECT_DEPLOYMENTS="${PROJECT_DEPLOYMENTS:-orderbooks-collector}" +PROJECT_REGISTRY_SECRET_NAME="${PROJECT_REGISTRY_SECRET_NAME:-orderbooks-registry-creds}" +RCLONE_SECRET_NAME="${RCLONE_SECRET_NAME:-orderbooks-rclone-config}" +RCLONE_SECRET_KEY="${RCLONE_SECRET_KEY:-rclone.conf}" +FORGEJO_REPO_OWNER="${FORGEJO_REPO_OWNER:-philipp}" +FORGEJO_REPO_NAME="${FORGEJO_REPO_NAME:-orderbooks}" +FORGEJO_REPO_PRIVATE="${FORGEJO_REPO_PRIVATE:-0}" + +require() { + command -v "$1" >/dev/null 2>&1 || { + echo "missing required command: $1" >&2 + exit 1 + } +} + +load_env_defaults() { + local file="$1" + [[ -f "$file" ]] || return 0 + eval "$( + python3 - "$file" <<'PY_LOAD_ENV' +import os +import shlex +import sys + 
+for raw in open(sys.argv[1], 'r', encoding='utf-8'): + line = raw.strip() + if not line or line.startswith('#'): + continue + if line.startswith('export '): + line = line[len('export '):] + if '=' not in line: + continue + key, value = line.split('=', 1) + key = key.strip() + value = value.strip() + if len(value) >= 2 and value[0] == value[-1] and value[0] in {'\"', "'"}: + value = value[1:-1] + if key in os.environ: + continue + print(f'export {key}={shlex.quote(value)}') +PY_LOAD_ENV + )" +} + +require kubectl +require python3 +require base64 + +load_env_defaults "$PLATFORM_ENV_FILE" +load_env_defaults "$PLATFORM_RESOLVED_ENV_FILE" + +# Force orderbooks app identity after loading platform defaults. The platform +# env file may describe the platform repo itself, not this app repo. +PROJECT_NAME="${ORDERBOOKS_PROJECT_NAME:-orderbooks}" +PROJECT_NAMESPACE="${ORDERBOOKS_PROJECT_NAMESPACE:-orderbooks}" +PROJECT_DEPLOYMENTS="${ORDERBOOKS_PROJECT_DEPLOYMENTS:-orderbooks-collector}" +PROJECT_REGISTRY_SECRET_NAME="${ORDERBOOKS_PROJECT_REGISTRY_SECRET_NAME:-orderbooks-registry-creds}" +RCLONE_SECRET_NAME="${ORDERBOOKS_RCLONE_SECRET_NAME:-orderbooks-rclone-config}" +RCLONE_SECRET_KEY="${ORDERBOOKS_RCLONE_SECRET_KEY:-rclone.conf}" +FORGEJO_REPO_OWNER="${ORDERBOOKS_FORGEJO_REPO_OWNER:-philipp}" +FORGEJO_REPO_NAME="${ORDERBOOKS_FORGEJO_REPO_NAME:-orderbooks}" +FORGEJO_REPO_PRIVATE="${ORDERBOOKS_FORGEJO_REPO_PRIVATE:-0}" + +: "${KUBECONFIG_PATH:?missing kubeconfig path}" +: "${CI_KUBECONFIG_PATH:?missing CI kubeconfig path}" +[[ -f "$KUBECONFIG_PATH" ]] || { echo "missing kubeconfig file" >&2; exit 1; } +[[ -f "$CI_KUBECONFIG_PATH" ]] || { echo "missing in-cluster kubeconfig file" >&2; exit 1; } +export KUBECONFIG="$KUBECONFIG_PATH" + +if [[ -z "${FORGEJO_URL:-}" ]]; then + if [[ -n "${FORGEJO_ROOT_URL:-}" ]]; then + FORGEJO_URL="$FORGEJO_ROOT_URL" + elif [[ -n "${FORGEJO_DOMAIN:-}" ]]; then + FORGEJO_URL="https://${FORGEJO_DOMAIN}" + else + echo "missing Forgejo URL" >&2 + 
exit 1 + fi +fi + +: "${FORGEJO_ADMIN_USERNAME:?missing Forgejo admin username}" +if [[ -z "${FORGEJO_TOKEN:-}" ]]; then + : "${FORGEJO_ADMIN_PASSWORD:?missing Forgejo password or token}" +fi + +if [[ -z "${REGISTRY_HOST:-}" ]]; then + if [[ -n "${REGISTRY_DOMAIN:-}" ]]; then + REGISTRY_HOST="$REGISTRY_DOMAIN" + else + echo "missing registry host" >&2 + exit 1 + fi +fi +: "${REGISTRY_USERNAME:?missing registry username}" +: "${REGISTRY_PASSWORD:?missing registry password}" + +echo "ensuring namespace ${PROJECT_NAMESPACE}" +kubectl create namespace "$PROJECT_NAMESPACE" --dry-run=client -o yaml | kubectl apply -f - + +echo "upserting registry secret ${PROJECT_REGISTRY_SECRET_NAME}" +kubectl -n "$PROJECT_NAMESPACE" create secret docker-registry "$PROJECT_REGISTRY_SECRET_NAME" \ + --docker-server="$REGISTRY_HOST" \ + --docker-username="$REGISTRY_USERNAME" \ + --docker-password="$REGISTRY_PASSWORD" \ + --dry-run=client -o yaml | kubectl apply -f - + +echo "checking rclone secret key presence" +kubectl -n "$PROJECT_NAMESPACE" get secret "$RCLONE_SECRET_NAME" \ + -o "go-template={{if index .data \"${RCLONE_SECRET_KEY}\"}}rclone_secret_key_present{{else}}rclone_secret_key_missing{{end}}{{\"\\n\"}}" + +echo "upserting Forgejo repo and Actions settings" +forgejo_args=() +if [[ -n "${FORGEJO_TOKEN:-}" ]]; then + forgejo_args+=(--token "$FORGEJO_TOKEN") +else + forgejo_args+=(--admin-username "$FORGEJO_ADMIN_USERNAME" --admin-password "$FORGEJO_ADMIN_PASSWORD") +fi +if [[ "$FORGEJO_REPO_PRIVATE" == "1" || "$FORGEJO_REPO_PRIVATE" == "true" ]]; then + forgejo_args+=(--repo-private) +fi + +python3 "$ROOT_DIR/scripts/deploy/forgejo_repo_bootstrap.py" \ + --forgejo-url "$FORGEJO_URL" \ + --repo-owner "$FORGEJO_REPO_OWNER" \ + --repo-name "$FORGEJO_REPO_NAME" \ + --ci-kubeconfig "$CI_KUBECONFIG_PATH" \ + --registry-host "$REGISTRY_HOST" \ + --project-name "$PROJECT_NAME" \ + --project-namespace "$PROJECT_NAMESPACE" \ + --project-deployments "$PROJECT_DEPLOYMENTS" \ + 
--project-registry-secret-name "$PROJECT_REGISTRY_SECRET_NAME" \ + "${forgejo_args[@]}" + +echo "bootstrap complete for ${FORGEJO_REPO_OWNER}/${FORGEJO_REPO_NAME} in namespace ${PROJECT_NAMESPACE}" diff --git a/scripts/deploy/forgejo_repo_bootstrap.py b/scripts/deploy/forgejo_repo_bootstrap.py new file mode 100755 index 0000000..7c67c9b --- /dev/null +++ b/scripts/deploy/forgejo_repo_bootstrap.py @@ -0,0 +1,121 @@ +#!/usr/bin/env python3 +"""Orderbooks-specific Forgejo repo bootstrap. + +Creates/updates the Forgejo repository plus Actions settings for the Kubernetes +orderbooks deployment. This script deliberately does not print secret values. +""" + +from __future__ import annotations + +import argparse +import base64 +import json +import ssl +import urllib.error +import urllib.parse +import urllib.request +from pathlib import Path + + +class ForgejoClient: + def __init__(self, base_url: str, username: str | None = None, password: str | None = None, token: str | None = None): + self.base_url = base_url.rstrip('/') + self.username = username or '' + self.headers = {'Accept': 'application/json', 'Content-Type': 'application/json'} + if token: + self.headers['Authorization'] = f'token {token}' + elif username is not None and password is not None: + credentials = base64.b64encode(f'{username}:{password}'.encode()).decode() + self.headers['Authorization'] = f'Basic {credentials}' + else: + raise ValueError('ForgejoClient requires either token auth or username/password auth') + self.ssl_context = ssl.create_default_context() + + def request(self, method: str, path: str, payload=None, expected=(200, 201, 204)): + data = json.dumps(payload).encode() if payload is not None else None + req = urllib.request.Request(f'{self.base_url}{path}', data=data, method=method) + for key, value in self.headers.items(): + req.add_header(key, value) + try: + with urllib.request.urlopen(req, context=self.ssl_context) as response: + body = response.read().decode() if response.length != 0 
else '' + if response.status not in expected: + raise RuntimeError(f'{method} {path} returned {response.status}: {body[:500]}') + return json.loads(body) if body else None + except urllib.error.HTTPError as exc: + body = exc.read().decode() + if exc.code not in expected: + raise RuntimeError(f'{method} {path} returned {exc.code}: {body[:500]}') from exc + return json.loads(body) if body else None + + def get_repo(self, owner: str, repo: str): + try: + return self.request('GET', f'/api/v1/repos/{urllib.parse.quote(owner)}/{urllib.parse.quote(repo)}') + except RuntimeError as exc: + if ' returned 404:' in str(exc): + return None + raise + + def create_repo(self, owner: str, name: str, private: bool): + payload = {'name': name, 'private': private, 'auto_init': False, 'default_branch': 'main'} + if owner == self.username: + return self.request('POST', '/api/v1/user/repos', payload, expected=(201,)) + return self.request('POST', f'/api/v1/orgs/{urllib.parse.quote(owner)}/repos', payload, expected=(201,)) + + def upsert_variable(self, owner: str, repo: str, name: str, value: str): + path = f'/api/v1/repos/{urllib.parse.quote(owner)}/{urllib.parse.quote(repo)}/actions/variables/{urllib.parse.quote(name)}' + try: + self.request('POST', path, {'value': value}, expected=(201, 204)) + except RuntimeError as exc: + if ' returned 409:' not in str(exc) and ' returned 422:' not in str(exc): + raise + self.request('PUT', path, {'value': value}, expected=(201, 204)) + + def upsert_secret(self, owner: str, repo: str, name: str, value: str): + path = f'/api/v1/repos/{urllib.parse.quote(owner)}/{urllib.parse.quote(repo)}/actions/secrets/{urllib.parse.quote(name)}' + self.request('PUT', path, {'data': value}, expected=(201, 204)) + + +def main() -> None: + parser = argparse.ArgumentParser(description='Bootstrap Forgejo Actions settings for orderbooks') + parser.add_argument('--forgejo-url', required=True) + parser.add_argument('--admin-username') + 
parser.add_argument('--admin-password') + parser.add_argument('--token') + parser.add_argument('--repo-owner', required=True) + parser.add_argument('--repo-name', required=True) + parser.add_argument('--repo-private', action='store_true') + parser.add_argument('--ci-kubeconfig', required=True) + parser.add_argument('--registry-host', required=True) + parser.add_argument('--project-name', required=True) + parser.add_argument('--project-namespace', required=True) + parser.add_argument('--project-deployments', required=True) + parser.add_argument('--project-registry-secret-name', required=True) + args = parser.parse_args() + + client = ForgejoClient(args.forgejo_url, args.admin_username, args.admin_password, args.token) + repo = client.get_repo(args.repo_owner, args.repo_name) + if repo is None: + created = client.create_repo(args.repo_owner, args.repo_name, args.repo_private) + print(f'created repo {created["full_name"]}') + else: + print(f'repo already exists: {repo["full_name"]}') + + kubeconfig_b64 = base64.b64encode(Path(args.ci_kubeconfig).read_bytes()).decode() + client.upsert_secret(args.repo_owner, args.repo_name, 'KUBECONFIG_B64', kubeconfig_b64) + print('upserted repo action secret KUBECONFIG_B64') + + variables = { + 'REGISTRY_HOST': args.registry_host, + 'PROJECT_NAME': args.project_name, + 'PROJECT_NAMESPACE': args.project_namespace, + 'PROJECT_DEPLOYMENTS': args.project_deployments, + 'PROJECT_REGISTRY_SECRET_NAME': args.project_registry_secret_name, + } + for name, value in variables.items(): + client.upsert_variable(args.repo_owner, args.repo_name, name, value) + print('upserted repo action variables') + + +if __name__ == '__main__': + main() diff --git a/scripts/discover_polymarket_btc_markets.py b/scripts/discover_polymarket_btc_markets.py new file mode 100755 index 0000000..7d3afa5 --- /dev/null +++ b/scripts/discover_polymarket_btc_markets.py @@ -0,0 +1,752 @@ +#!/usr/bin/env python3 +"""Discover active Polymarket BTC up/down markets. 
+ +Checkpoint 3 scope: fetch bounded public Gamma metadata, preserve raw responses, +and write normalized market records with outcome-token mappings. This is not an +order-book collector. +""" + +from __future__ import annotations + +import argparse +import datetime as dt +import hashlib +import json +import sys +import time +import urllib.error +import urllib.parse +import urllib.request +from pathlib import Path +from typing import Any + + +GAMMA_EVENTS_URL = "https://gamma-api.polymarket.com/events" +BTC_TAG_ID = 235 + +DEFAULT_OUTPUT_JSON = Path("data/discovery/polymarket_btc_markets_latest.json") +DEFAULT_MANIFEST = Path("data/discovery/polymarket_btc_markets_manifest.json") +DEFAULT_MARKDOWN = Path("data/discovery/polymarket_btc_markets.md") + +SAFE_RESPONSE_HEADERS = { + "age", + "cache-control", + "cf-cache-status", + "cf-ray", + "content-encoding", + "content-length", + "content-type", + "date", + "expires", + "last-modified", + "ratelimit-limit", + "ratelimit-remaining", + "ratelimit-reset", + "retry-after", + "server", + "strict-transport-security", + "x-ratelimit-limit", + "x-ratelimit-remaining", + "x-ratelimit-reset", +} + +FILTER_RULES = [ + "Use public Gamma /events with tag_id=235, related_tags=true, active=true, closed=false.", + "Require event.active=true and event.closed=false.", + "Require market.active=true and market.closed=false.", + "Require market.enableOrderBook=true.", + "Require market.acceptingOrders=true unless --allow-non-accepting-orders is used.", + "Require market end time to be after the fetch time unless --allow-expired is used.", + "Require outcomes to resolve to exactly Up and Down.", + "Require clobTokenIds to resolve to exactly two token IDs.", + "Require BTC/up-down evidence from seriesSlug, title/slug text, or tags.", +] + + +def utc_now() -> dt.datetime: + return dt.datetime.now(dt.UTC) + + +def iso_z(value: dt.datetime | None = None) -> str: + value = value or utc_now() + return 
def parse_iso(value: Any) -> dt.datetime | None:
    """Parse an ISO-8601 timestamp string into a timezone-aware UTC datetime.

    Returns None when ``value`` is not a string, is blank, or cannot be parsed.
    A trailing "Z" is accepted, and timestamps without an offset are treated as
    already being in UTC.
    """
    if not isinstance(value, str) or not value.strip():
        return None
    text = value.strip()
    if text.endswith("Z"):
        # Rewrite the "Z" suffix as an explicit offset before parsing.
        text = text[:-1] + "+00:00"
    try:
        parsed = dt.datetime.fromisoformat(text)
    except ValueError:
        return None
    if parsed.tzinfo is None:
        parsed = parsed.replace(tzinfo=dt.timezone.utc)
    return parsed.astimezone(dt.timezone.utc)


def sha256_file(path: Path) -> str:
    """Return the hex SHA-256 digest of the file at ``path``, read in 1 MiB chunks."""
    digest = hashlib.sha256()
    with path.open("rb") as handle:
        for chunk in iter(lambda: handle.read(1024 * 1024), b""):
            digest.update(chunk)
    return digest.hexdigest()


def filter_headers(headers: Any) -> dict[str, str]:
    """Keep only the response headers whose lower-cased name is allow-listed.

    ``headers`` may be any mapping-like object accepted by ``dict()``, or None.
    None yields an empty dict: this function is called from an ``except``
    handler, where a TypeError would replace the failure evidence that handler
    is trying to record.
    """
    if headers is None:
        return {}
    return {
        key: value
        for key, value in dict(headers).items()
        if key.lower() in SAFE_RESPONSE_HEADERS
    }


def normalize_params(params: dict[str, Any]) -> dict[str, Any]:
    """Return a copy of ``params`` with booleans rendered as "true"/"false".

    Every other value is passed through unchanged.
    """
    return {
        key: ("true" if value else "false") if isinstance(value, bool) else value
        for key, value in params.items()
    }


def build_url(url: str, params: dict[str, Any]) -> str:
    """Append ``params`` to ``url`` as a query string.

    Booleans are normalized first; sequence values expand into repeated keys
    (``doseq=True``).
    """
    query = urllib.parse.urlencode(normalize_params(params), doseq=True)
    return f"{url}?{query}"
def coerce_json_array(value: Any) -> list[Any]:
    """Return ``value`` as a list, decoding it first if it is a JSON string.

    Anything that is not a list (before or after decoding) yields ``[]``.
    """
    candidate = value
    if isinstance(candidate, str):
        try:
            candidate = json.loads(candidate)
        except json.JSONDecodeError:
            return []
    return candidate if isinstance(candidate, list) else []


def lower_text(value: Any) -> str:
    """Lower-cased string form of ``value``; falsy values become ""."""
    return ("" if not value else str(value)).lower()


def event_tag_text(event: dict[str, Any]) -> str:
    """Join the slug and label of every dict-shaped tag into one lower-cased string."""
    pieces = [
        str(tag.get(field) or "")
        for tag in (event.get("tags") or [])
        if isinstance(tag, dict)
        for field in ("slug", "label")
    ]
    return " ".join(pieces).lower()


def has_btc_up_down_evidence(event: dict[str, Any], market: dict[str, Any]) -> bool:
    """Report whether the event/market pair looks like a BTC up/down market.

    Any one of three signals is enough: the series slug prefix, free-text
    mentions in event/market fields, or the event tags.
    """
    event_text = " ".join(
        lower_text(event.get(field))
        for field in ("title", "slug", "ticker", "description")
    )
    market_text = " ".join(
        lower_text(market.get(field))
        for field in ("question", "slug", "description")
    )
    haystack = f"{event_text} {market_text}"
    tag_text = event_tag_text(event)

    def mentions_btc(blob: str) -> bool:
        return "bitcoin" in blob or "btc" in blob

    signals = (
        lower_text(event.get("seriesSlug")).startswith("btc-up-or-down"),
        mentions_btc(haystack) and "up" in haystack and "down" in haystack,
        mentions_btc(tag_text) and "up-or-down" in tag_text,
    )
    return any(signals)


def is_up_down_outcomes(outcomes: list[str]) -> bool:
    """True when ``outcomes`` is exactly the pair Up/Down, in any order or case."""
    if len(outcomes) != 2:
        return False
    return sorted(label.lower() for label in outcomes) == ["down", "up"]
def rejection_reasons(
    *,
    event: dict[str, Any],
    market: dict[str, Any],
    fetched_at: dt.datetime,
    require_accepting_orders: bool,
    require_future_end: bool,
) -> list[str]:
    """List every filter rule the market fails; an empty list means it passes.

    Args:
        event: Gamma event object that owns the market.
        market: Gamma market object under evaluation.
        fetched_at: Reference time used for the future-end check.
        require_accepting_orders: Reject markets whose ``acceptingOrders`` is not True.
        require_future_end: Reject markets whose end time is missing or not after
            ``fetched_at``.

    Returns:
        Reason codes, in evaluation order.
    """
    reasons: list[str] = []
    outcomes = [str(item) for item in coerce_json_array(market.get("outcomes"))]
    token_ids = [str(item) for item in coerce_json_array(market.get("clobTokenIds"))]
    end_time = parse_iso(market.get("endDate") or event.get("endDate"))

    # Flags are compared with `is`, so only real booleans count; strings such
    # as "true" are rejected.
    if event.get("active") is not True:
        reasons.append("event_not_active")
    if event.get("closed") is not False:
        reasons.append("event_closed")
    if market.get("active") is not True:
        reasons.append("market_not_active")
    if market.get("closed") is not False:
        reasons.append("market_closed")
    if market.get("enableOrderBook") is not True:
        reasons.append("order_book_not_enabled")
    if require_accepting_orders and market.get("acceptingOrders") is not True:
        reasons.append("not_accepting_orders")
    if require_future_end and (end_time is None or end_time <= fetched_at):
        reasons.append("not_future_end")
    if not is_up_down_outcomes(outcomes):
        reasons.append("not_up_down_outcomes")
    if len(token_ids) != 2:
        reasons.append("missing_two_clob_token_ids")
    # The PASS status reason promises a condition ID on every discovered
    # market. Without this check a market lacking one slips through under the
    # empty dedup key "".
    if not str(market.get("conditionId") or "").strip():
        reasons.append("missing_condition_id")
    if not has_btc_up_down_evidence(event, market):
        reasons.append("missing_btc_up_down_evidence")
    return reasons


def discover(args: argparse.Namespace) -> dict[str, Any]:
    """Fetch bounded Gamma event pages and build the discovery artifact.

    Pages are requested in ``endDate`` order until a request fails, a response
    is not a JSON list, a short page signals the end of pagination, or
    ``args.max_pages`` is reached. Every raw page envelope is kept; markets that
    pass ``rejection_reasons`` and are not duplicates by condition ID are
    normalized.

    Args:
        args: Parsed CLI namespace; reads ``limit``, ``max_pages``, ``timeout``,
            ``min_markets``, ``allow_expired``, ``allow_non_accepting_orders``
            and ``output_json``.

    Returns:
        The discovery document (normalized markets, raw pages, summary with
        PASS/FAIL status, rejection counts and warnings).
    """
    started_at_utc = iso_z()
    fetched_at = utc_now()
    fetched_at_utc = iso_z(fetched_at)
    raw_pages: list[dict[str, Any]] = []
    normalized: list[dict[str, Any]] = []
    rejected_counts: dict[str, int] = {}
    warnings: list[str] = []
    seen_conditions: set[str] = set()

    def reject(reason: str) -> None:
        rejected_counts[reason] = rejected_counts.get(reason, 0) + 1

    for page_index in range(args.max_pages):
        offset = page_index * args.limit
        params = {
            "tag_id": BTC_TAG_ID,
            "related_tags": True,
            "active": True,
            "closed": False,
            "limit": args.limit,
            "offset": offset,
            "order": "endDate",
            "ascending": True,
        }
        page = fetch_json_page(
            name=f"gamma_events_bitcoin_tag_page_{page_index}",
            url=GAMMA_EVENTS_URL,
            params=params,
            timeout_seconds=args.timeout,
        )
        # Keep the envelope even when the request failed, so the failure is auditable.
        raw_pages.append(page)
        payload = page["response"]["json"]
        if not page["ok"]:
            warnings.append(
                f"Page {page_index} request failed with status {page['response']['status_code']}: {page['error']}"
            )
            break
        if not isinstance(payload, list):
            warnings.append(f"Page {page_index} response was not a JSON list.")
            break

        for event_index, event in enumerate(payload):
            if not isinstance(event, dict):
                reject("event_not_object")
                continue
            markets = event.get("markets") or []
            if not isinstance(markets, list) or not markets:
                reject("missing_markets")
                continue
            for market_index, market in enumerate(markets):
                if not isinstance(market, dict):
                    reject("market_not_object")
                    continue
                reasons = rejection_reasons(
                    event=event,
                    market=market,
                    fetched_at=fetched_at,
                    require_accepting_orders=not args.allow_non_accepting_orders,
                    require_future_end=not args.allow_expired,
                )
                if reasons:
                    for reason in reasons:
                        reject(reason)
                    continue
                condition_id = str(market.get("conditionId") or "")
                if condition_id in seen_conditions:
                    reject("duplicate_condition_id")
                    continue
                seen_conditions.add(condition_id)
                normalized.append(
                    normalize_market(
                        event=event,
                        market=market,
                        page_index=page_index,
                        event_index=event_index,
                        market_index=market_index,
                        fetched_at_utc=fetched_at_utc,
                        output_json_path=args.output_json,
                    )
                )

        # A short page means the listing is exhausted.
        if len(payload) < args.limit:
            break

    normalized.sort(key=lambda item: (item.get("end_time_utc") or "", item.get("market_slug") or ""))
    if raw_pages:
        last_payload = raw_pages[-1]["response"].get("json")
        # A full last page at the page cap means more results may exist upstream.
        if isinstance(last_payload, list) and len(last_payload) == args.limit and len(raw_pages) >= args.max_pages:
            warnings.append(
                "Discovery stopped at max_pages before exhausting Gamma pagination; output is bounded to the fetched pages."
            )
    if len(normalized) < args.min_markets:
        warnings.append(
            f"Only {len(normalized)} markets passed filters; min_markets={args.min_markets}."
        )

    status = "PASS" if len(normalized) >= args.min_markets else "FAIL"
    status_reason = (
        f"Discovered {len(normalized)} active BTC up/down markets with condition IDs and two token IDs."
        if status == "PASS"
        else "Did not discover enough active BTC up/down markets with condition IDs and two token IDs."
    )
    return {
        "schema_name": "polymarket_btc_market_discovery",
        "schema_version": 1,
        "artifact_status": "valid" if status == "PASS" else "partial",
        "checkpoint_id": 3,
        "checkpoint_name": "Minimal BTC Market Discovery",
        "started_at_utc": started_at_utc,
        "ended_at_utc": iso_z(),
        "fetched_at_utc": fetched_at_utc,
        "scope": "Bounded public Gamma metadata discovery only; no order-book collector.",
        "endpoint_basis": {
            "source_checkpoint": "Checkpoint 2",
            "source_report": "reports/checkpoints/checkpoint_002_polymarket_public_sources.md",
            "endpoint": GAMMA_EVENTS_URL,
            "method": "GET",
            "base_params": {
                "tag_id": BTC_TAG_ID,
                "related_tags": True,
                "active": True,
                "closed": False,
                "limit": args.limit,
                "order": "endDate",
                "ascending": True,
            },
        },
        # Record the condition-ID requirement alongside the shared rule list so
        # the artifact documents every filter that was applied.
        "filter_rules": [*FILTER_RULES, "Require market.conditionId to be non-empty."],
        "normalized_markets": normalized,
        "raw": {
            "gamma_events_pages": raw_pages,
        },
        "summary": {
            "status": status,
            "status_reason": status_reason,
            "raw_pages_fetched": len(raw_pages),
            "raw_events_fetched": sum(
                len(page["response"].get("json") or [])
                for page in raw_pages
                if isinstance(page["response"].get("json"), list)
            ),
            "normalized_market_count": len(normalized),
            "rejected_counts": dict(sorted(rejected_counts.items())),
            "warnings": warnings,
        },
        "fake_progress_risk": "Discovery can appear successful while silently missing markets if filters rely on stale text assumptions or bounded pagination. Raw pages and rejection counts are preserved so missed-market risk can be audited.",
        "next_step": "Checkpoint 4 should use this discovery output as input for a short, raw-first order-book snapshot sample; do not claim reliability until the later 24h soak test.",
    }


def markdown_table_row(values: list[Any]) -> str:
    """Render ``values`` as one Markdown table row.

    Newlines become spaces and literal pipes are backslash-escaped, so a cell
    value cannot break the row or add columns.
    """
    cells = (str(value).replace("\n", " ").replace("|", "\\|") for value in values)
    return "| " + " | ".join(cells) + " |"
def write_manifest(
    *,
    discovery: dict[str, Any],
    manifest_path: Path,
    output_json: Path,
    markdown_path: Path,
    command: str,
) -> None:
    """Write the checkpoint manifest that indexes the discovery artifacts.

    The manifest records the gate status, request and row counts, the
    discovered market identifiers, and a SHA-256 entry for each output file
    (plus the discovery script itself when it exists relative to the current
    working directory).

    Args:
        discovery: Document returned by ``discover``.
        manifest_path: Destination of the manifest JSON; parent directories are created.
        output_json: Path of the already-written discovery JSON.
        markdown_path: Path of the already-written Markdown report.
        command: Command line recorded verbatim in the manifest.
    """
    summary = discovery["summary"]

    def artifact_entry(path: Path, kind: str) -> dict[str, Any]:
        # An existing but empty file is reported as "missing", yet still hashed.
        present = path.exists()
        return {
            "path": path.as_posix(),
            "kind": kind,
            "status": "valid" if present and path.stat().st_size else "missing",
            "sha256": sha256_file(path) if present else None,
        }

    output_files = [
        artifact_entry(output_json, "latest_discovery_json"),
        artifact_entry(markdown_path, "discovery_markdown"),
    ]
    script_path = Path("scripts/discover_polymarket_btc_markets.py")
    if script_path.exists():
        output_files.append(
            {
                "path": script_path.as_posix(),
                "kind": "discovery_script",
                "status": "valid",
                "sha256": sha256_file(script_path),
            }
        )

    # Tally responses per HTTP status; a failed request is counted under "None".
    status_codes: dict[str, int] = {}
    for page in discovery["raw"]["gamma_events_pages"]:
        label = str(page["response"].get("status_code"))
        status_codes[label] = status_codes.setdefault(label, 0) + 1

    market_ids = []
    for row in discovery["normalized_markets"]:
        market_ids.append(
            {
                "market_slug": row.get("market_slug"),
                "condition_id": row.get("condition_id"),
                "token_ids": [token.get("token_id") for token in row.get("tokens", [])],
            }
        )

    manifest = {
        "schema_name": "polymarket_btc_markets_manifest",
        "schema_version": 1,
        "checkpoint_id": 3,
        "checkpoint_name": "Minimal BTC Market Discovery",
        "status": summary["status"],
        "started_at_utc": discovery["started_at_utc"],
        "ended_at_utc": discovery["ended_at_utc"],
        "scope": discovery["scope"],
        "command": command,
        "endpoint": discovery["endpoint_basis"],
        "request_counts": {
            "gamma_events_pages": summary["raw_pages_fetched"],
            "status_code_counts": dict(sorted(status_codes.items())),
        },
        "row_counts": {
            "raw_events_fetched": summary["raw_events_fetched"],
            "normalized_markets": summary["normalized_market_count"],
        },
        "market_ids": market_ids,
        "output_files": output_files,
        "warnings": summary["warnings"],
        "validation": {
            "summary": summary["status_reason"],
            "required_record_fields": [
                "market_name",
                "market_slug",
                "question",
                "condition_id",
                "tokens",
                "outcomes",
                "start_time_utc",
                "end_time_utc",
                "active",
                "closed",
                "accepting_orders",
                "enable_order_book",
                "endpoint_source",
                "fetched_at_utc",
                "raw_ref",
            ],
        },
        "fake_progress_risk": discovery["fake_progress_risk"],
        "next_step": discovery["next_step"],
    }
    manifest_path.parent.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(manifest, indent=2, sort_keys=True) + "\n"
    manifest_path.write_text(serialized, encoding="utf-8")
def parse_args() -> argparse.Namespace:
    """Build the CLI parser and parse ``sys.argv``.

    Options cover the three output paths, the pagination bounds (``--limit``,
    ``--max-pages``), the request timeout, the minimum market count for a PASS,
    and two switches that relax the expiry and accepting-orders filters.
    """
    parser = argparse.ArgumentParser(
        description="Discover active BTC up/down Polymarket markets from public Gamma metadata."
    )
    path_options = (
        ("--output-json", DEFAULT_OUTPUT_JSON),
        ("--manifest", DEFAULT_MANIFEST),
        ("--markdown", DEFAULT_MARKDOWN),
    )
    for flag, default in path_options:
        parser.add_argument(flag, type=Path, default=default)
    numeric_options = (
        ("--limit", int, 100),
        ("--max-pages", int, 3),
        ("--timeout", float, 15.0),
        ("--min-markets", int, 1),
    )
    for flag, caster, default in numeric_options:
        parser.add_argument(flag, type=caster, default=default)
    for flag in ("--allow-expired", "--allow-non-accepting-orders"):
        parser.add_argument(flag, action="store_true")
    return parser.parse_args()


def main() -> int:
    """Run discovery, write all artifacts, and print a JSON run summary.

    Returns:
        Process exit code: 0 when the discovery gate is PASS, otherwise 1.
    """
    args = parse_args()
    discovery = discover(args)
    write_outputs(args, discovery)

    summary = discovery["summary"]
    market_overview = []
    for row in discovery["normalized_markets"]:
        market_overview.append(
            {
                "market_slug": row.get("market_slug"),
                "condition_id": row.get("condition_id"),
                "token_ids": [token.get("token_id") for token in row.get("tokens", [])],
                "end_time_utc": row.get("end_time_utc"),
            }
        )
    report = {
        "status": summary["status"],
        "status_reason": summary["status_reason"],
        "output_json": args.output_json.as_posix(),
        "manifest": args.manifest.as_posix(),
        "markdown": args.markdown.as_posix(),
        "normalized_market_count": summary["normalized_market_count"],
        "markets": market_overview,
        "warnings": summary["warnings"],
    }
    print(json.dumps(report, indent=2, sort_keys=True))
    if summary["status"] == "PASS":
        return 0
    return 1


if __name__ == "__main__":
    raise SystemExit(main())
diff --git a/scripts/k8s_runtime_smoke_check.sh b/scripts/k8s_runtime_smoke_check.sh new file mode 100755 index 0000000..9bff5d5 --- /dev/null +++ b/scripts/k8s_runtime_smoke_check.sh @@ -0,0 +1,466 @@ +#!/usr/bin/env bash +set -uo pipefail + +NAMESPACE="${ORDERBOOKS_K8S_NAMESPACE:-orderbooks}" +DEPLOYMENT="${ORDERBOOKS_K8S_COLLECTOR_DEPLOYMENT:-orderbooks-collector}" +CRONJOB="${ORDERBOOKS_K8S_UPLOADER_CRONJOB:-orderbooks-uploader}" +RAW_DIR="${ORDERBOOKS_K8S_RAW_DIR:-/var/lib/orderbooks/raw_orderbooks}" +MANIFEST_DIR="${ORDERBOOKS_K8S_MANIFEST_DIR:-/var/lib/orderbooks/manifests}" +WAIT_SECONDS="${ORDERBOOKS_K8S_SMOKE_WAIT_SECONDS:-1200}" +UPLOAD_MIN_AGE_SECONDS="${ORDERBOOKS_UPLOAD_MIN_AGE_SECONDS:-600}" +KUBECTL_BIN="${ORDERBOOKS_KUBECTL:-kubectl}" +RUN_ID="$(date -u +%Y%m%dT%H%M%SZ)" +EVIDENCE_PATH="${ORDERBOOKS_K8S_SMOKE_EVIDENCE_PATH:-data/manifests/k8s_runtime_smoke_${RUN_ID}.json}" + +usage() { + cat <<'EOF' +Usage: scripts/k8s_runtime_smoke_check.sh [options] + +Run after the orderbooks Kubernetes workload is deployed. The script uses +kubectl, writes local JSON evidence, deletes one collector pod to force a +Deployment restart, verifies raw gzip JSONL files and manifests on the PVC, +then triggers the uploader CronJob and requires a verified upload manifest. + +Options: + --namespace NAME Namespace. Default: orderbooks. + --deployment NAME Collector deployment. Default: orderbooks-collector. + --cronjob NAME Uploader CronJob. Default: orderbooks-uploader. + --raw-dir PATH Raw path inside collector pod. Default: /var/lib/orderbooks/raw_orderbooks. + --manifest-dir PATH Manifest path inside collector pod. Default: /var/lib/orderbooks/manifests. + --wait-seconds N Max wait for collector/upload evidence. Default: 1200. + --upload-min-age-seconds N + Wait for at least one raw/manifest file to be this old before upload. Default: 600. + --evidence-path PATH Local JSON evidence path. + --kubectl PATH kubectl binary. Default: kubectl. + --help Show this help. 
+ +This script does not read or print rclone config contents. +EOF +} + +while [[ $# -gt 0 ]]; do + case "$1" in + --namespace) NAMESPACE="$2"; shift 2 ;; + --deployment) DEPLOYMENT="$2"; shift 2 ;; + --cronjob) CRONJOB="$2"; shift 2 ;; + --raw-dir) RAW_DIR="$2"; shift 2 ;; + --manifest-dir) MANIFEST_DIR="$2"; shift 2 ;; + --wait-seconds) WAIT_SECONDS="$2"; shift 2 ;; + --upload-min-age-seconds) UPLOAD_MIN_AGE_SECONDS="$2"; shift 2 ;; + --evidence-path) EVIDENCE_PATH="$2"; shift 2 ;; + --kubectl) KUBECTL_BIN="$2"; shift 2 ;; + --help) usage; exit 0 ;; + *) echo "Unknown argument: $1" >&2; usage >&2; exit 2 ;; + esac +done + +mkdir -p "$(dirname "${EVIDENCE_PATH}")" + +PYTHONDONTWRITEBYTECODE=1 python3 - "$KUBECTL_BIN" "$NAMESPACE" "$DEPLOYMENT" "$CRONJOB" "$RAW_DIR" "$MANIFEST_DIR" "$WAIT_SECONDS" "$UPLOAD_MIN_AGE_SECONDS" "$EVIDENCE_PATH" <<'PY_SMOKE' +import datetime as dt +import json +import subprocess +import sys +import time +from pathlib import Path + +kubectl = sys.argv[1] +namespace = sys.argv[2] +deployment = sys.argv[3] +cronjob = sys.argv[4] +raw_dir = sys.argv[5] +manifest_dir = sys.argv[6] +wait_seconds = int(sys.argv[7]) +upload_min_age_seconds = int(sys.argv[8]) +evidence_path = Path(sys.argv[9]) +started_at = dt.datetime.now(dt.UTC).replace(microsecond=0).isoformat().replace('+00:00', 'Z') +checks = [] +failures = [] + +def iso_now(): + return dt.datetime.now(dt.UTC).replace(microsecond=0).isoformat().replace('+00:00', 'Z') + + +def capture(command, input_text=None, timeout=None): + proc = subprocess.run(command, input=input_text, text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=timeout) + item = { + 'command': command, + 'exit_code': proc.returncode, + 'stdout_tail': proc.stdout[-6000:], + 'stderr_tail': proc.stderr[-6000:], + 'ran_at_utc': iso_now(), + } + checks.append(item) + return proc, item + + +def run(command, input_text=None, timeout=None): + _proc, item = capture(command, input_text=input_text, timeout=timeout) + return 
def run_json(command, input_text=None, timeout=None):
    """Run ``command`` via ``capture`` and return its stdout parsed as JSON.

    Raises RuntimeError when the command exits non-zero.
    """
    proc, item = capture(command, input_text=input_text, timeout=timeout)
    if item['exit_code'] != 0:
        raise RuntimeError(f"command failed: {' '.join(command)}")
    return json.loads(proc.stdout)


def pod_ready(pod):
    """True when the pod is Running and every reported container is ready.

    A pod with no container statuses is treated as not ready.
    """
    if pod.get('status', {}).get('phase') != 'Running':
        return False
    statuses = pod.get('status', {}).get('containerStatuses') or []
    return bool(statuses) and all(status.get('ready') for status in statuses)


def get_collector_pod():
    """Poll until a ready collector pod exists; return ``(name, pod_object)``.

    When several pods are ready, the one with the latest creationTimestamp is
    chosen. Polls every 10 seconds and raises TimeoutError after
    ``wait_seconds``.
    """
    selector = 'app.kubernetes.io/name=orderbooks,app.kubernetes.io/component=collector'
    deadline = time.time() + wait_seconds
    last = None
    while time.time() <= deadline:
        pods = run_json([kubectl, '-n', namespace, 'get', 'pods', '-l', selector, '-o', 'json'])
        items = pods.get('items', [])
        ready = [pod for pod in items if pod_ready(pod)]
        if ready:
            ready.sort(key=lambda pod: pod.get('metadata', {}).get('creationTimestamp', ''))
            return ready[-1]['metadata']['name'], ready[-1]
        last = items
        time.sleep(10)
    raise TimeoutError(f'no ready collector pod found; last pods={last}')


def exec_python(pod, code, args):
    """Pipe ``code`` into ``python3 -`` inside ``pod`` and return its JSON stdout.

    Raises RuntimeError when the in-pod script exits non-zero. The error carries
    both the stderr and stdout tails: the validation scripts print their JSON
    verdict on stdout before exiting with status 2, so reporting stderr alone
    would discard the actual reason.
    """
    command = [kubectl, '-n', namespace, 'exec', '-i', pod, '--', 'python3', '-', *args]
    proc, item = capture(command, input_text=code, timeout=wait_seconds + 60)
    if item['exit_code'] != 0:
        raise RuntimeError(
            f"pod python command failed in {pod} (exit {item['exit_code']}): "
            f"stderr={item['stderr_tail'].strip()!r} stdout={item['stdout_tail'].strip()!r}"
        )
    return json.loads(proc.stdout)


def wait_for_valid_collector(pod, after_mtime, label):
    """Poll the pod until a valid collector manifest newer than ``after_mtime`` exists.

    Returns the validation result, tagged with ``wait_label``. Errors from a
    single attempt are remembered and retried every 15 seconds. Raises
    TimeoutError, including the last recorded error, after ``wait_seconds``.
    """
    deadline = time.time() + wait_seconds
    last_error = None
    while time.time() <= deadline:
        try:
            result = exec_python(pod, collector_validation_code, [manifest_dir, raw_dir, str(after_mtime)])
            if result.get('valid'):
                result['wait_label'] = label
                return result
            last_error = result
        except Exception as exc:
            # Deliberately broad: any failure here just means "not ready yet".
            last_error = repr(exc)
        time.sleep(15)
    raise TimeoutError(f'no valid {label} collector manifest found before timeout: {last_error}')
wait_for_upload_eligible_files(pod): + deadline = time.time() + wait_seconds + last = None + while time.time() <= deadline: + result = exec_python(pod, upload_eligibility_code, [raw_dir, manifest_dir, str(upload_min_age_seconds)]) + if result.get('eligible'): + return result + last = result + time.sleep(15) + raise TimeoutError(f'no upload-eligible raw/manifest files before timeout: {last}') + +collector_validation_code = r''' +import gzip +import hashlib +import json +import sys +from pathlib import Path + +manifest_dir = Path(sys.argv[1]) +raw_dir = Path(sys.argv[2]) +after_mtime = float(sys.argv[3]) + +def sha256(path): + digest = hashlib.sha256() + with path.open('rb') as handle: + for chunk in iter(lambda: handle.read(1024 * 1024), b''): + digest.update(chunk) + return digest.hexdigest() + + +def parse_raw(path): + rows = 0 + first_keys = [] + with gzip.open(path, 'rt', encoding='utf-8') as handle: + for line in handle: + if not line.strip(): + continue + obj = json.loads(line) + if rows == 0: + first_keys = sorted(obj.keys()) + rows += 1 + return rows, first_keys + + +def validate(path): + manifest = json.loads(path.read_text(encoding='utf-8')) + output_files = [] + for item in manifest.get('output_files', []): + raw_path = Path(item['path']) + rows, first_keys = parse_raw(raw_path) + actual_sha = sha256(raw_path) + output_files.append({ + 'path': str(raw_path), + 'bytes': raw_path.stat().st_size, + 'mtime': raw_path.stat().st_mtime, + 'manifest_rows': item.get('rows'), + 'rows_parsed': rows, + 'row_count_matches_manifest': rows == item.get('rows'), + 'manifest_sha256': item.get('sha256'), + 'actual_sha256': actual_sha, + 'sha256_matches_manifest': actual_sha == item.get('sha256'), + 'under_raw_dir': raw_path.resolve().is_relative_to(raw_dir.resolve()), + 'first_row_keys': first_keys, + }) + valid = ( + manifest.get('gate_status') == 'PASS' + and manifest.get('rows_written', 0) > 0 + and manifest.get('failure_count') == 0 + and not manifest.get('failures') + 
and bool(output_files) + and all(item['rows_parsed'] > 0 and item['row_count_matches_manifest'] and item['sha256_matches_manifest'] and item['under_raw_dir'] for item in output_files) + ) + return { + 'path': str(path), + 'mtime': path.stat().st_mtime, + 'manifest_summary': { + 'gate_status': manifest.get('gate_status'), + 'rows_written': manifest.get('rows_written'), + 'failure_count': manifest.get('failure_count'), + 'failures_present': bool(manifest.get('failures')), + 'output_file_count': len(manifest.get('output_files', [])), + 'started_at_utc': manifest.get('started_at_utc'), + 'ended_at_utc': manifest.get('ended_at_utc'), + }, + 'output_files': output_files, + 'valid': valid, + } + +candidates = sorted(manifest_dir.glob('polymarket_orderbook_collector_*.json'), key=lambda p: p.stat().st_mtime) +candidates = [path for path in candidates if path.stat().st_mtime > after_mtime] +latest = None +for path in reversed(candidates): + try: + result = validate(path) + except Exception as exc: + latest = {'path': str(path), 'valid': False, 'error': repr(exc)} + continue + latest = result + if result['valid']: + print(json.dumps(result, sort_keys=True)) + sys.exit(0) +print(json.dumps(latest or {'valid': False, 'error': 'no collector manifest candidates'}, sort_keys=True)) +sys.exit(2) +''' + +raw_check_code = r''' +import gzip +import hashlib +import json +import sys +from pathlib import Path + +path = Path(sys.argv[1]) +expected_sha = sys.argv[2] +expected_rows = int(sys.argv[3]) + +def sha256(path): + digest = hashlib.sha256() + with path.open('rb') as handle: + for chunk in iter(lambda: handle.read(1024 * 1024), b''): + digest.update(chunk) + return digest.hexdigest() + +rows = 0 +with gzip.open(path, 'rt', encoding='utf-8') as handle: + for line in handle: + if line.strip(): + json.loads(line) + rows += 1 +actual_sha = sha256(path) +print(json.dumps({ + 'path': str(path), + 'expected_sha256': expected_sha, + 'actual_sha256': actual_sha, + 'sha256_matches': 
actual_sha == expected_sha, + 'expected_rows': expected_rows, + 'actual_rows': rows, + 'row_count_matches': rows == expected_rows, +}, sort_keys=True)) +''' + +upload_validation_code = r''' +import json +import sys +from pathlib import Path + +manifest_dir = Path(sys.argv[1]) +after_mtime = float(sys.argv[2]) +candidates = sorted(manifest_dir.glob('upload_archive_*.json'), key=lambda p: p.stat().st_mtime) +candidates = [path for path in candidates if path.stat().st_mtime >= after_mtime] +if not candidates: + print(json.dumps({'valid': False, 'error': 'no upload manifest candidates'}, sort_keys=True)) + sys.exit(2) +path = candidates[-1] +manifest = json.loads(path.read_text(encoding='utf-8')) +verified_count = manifest.get('counts', {}).get('verified', len(manifest.get('verified_files', []))) +valid = ( + manifest.get('operation_status') == 'UPLOAD_VERIFIED' + and manifest.get('gate_status') == 'PASS' + and manifest.get('rclone', {}).get('copy_exit_code') == 0 + and manifest.get('rclone', {}).get('check_exit_code') == 0 + and verified_count > 0 +) +verified_files = manifest.get('verified_files', []) +print(json.dumps({ + 'path': str(path), + 'mtime': path.stat().st_mtime, + 'manifest_summary': { + 'operation_status': manifest.get('operation_status'), + 'gate_status': manifest.get('gate_status'), + 'counts': manifest.get('counts', {}), + 'planned_file_count': len(manifest.get('planned_files', [])), + 'attempted_file_count': len(manifest.get('attempted_files', [])), + 'uploaded_file_count': len(manifest.get('uploaded_files', [])), + 'verified_file_count': verified_count, + 'rclone_copy_exit_code': manifest.get('rclone', {}).get('copy_exit_code'), + 'rclone_check_exit_code': manifest.get('rclone', {}).get('check_exit_code'), + 'started_at_utc': manifest.get('started_at_utc'), + 'ended_at_utc': manifest.get('ended_at_utc'), + }, + 'verified_count': verified_count, + 'verified_file_samples': [ + { + 'relative_path': item.get('relative_path'), + 'bytes': 
item.get('bytes'), + 'sha256': item.get('sha256'), + 'kind': item.get('kind'), + } + for item in verified_files[:5] + ], + 'valid': valid, +}, sort_keys=True)) +if not valid: + sys.exit(2) +''' + +upload_eligibility_code = r''' +import json +import sys +import time +from pathlib import Path + +raw_dir = Path(sys.argv[1]) +manifest_dir = Path(sys.argv[2]) +min_age_seconds = int(sys.argv[3]) +now = time.time() + +def eligible_files(root, pattern): + if not root.exists(): + return [] + items = [] + for path in sorted(root.rglob(pattern)): + if not path.is_file(): + continue + age = max(0, int(now - path.stat().st_mtime)) + if age >= min_age_seconds: + items.append({'path': str(path), 'bytes': path.stat().st_size, 'age_seconds': age}) + return items + +raw_files = eligible_files(raw_dir, '*.jsonl.gz') +manifest_files = eligible_files(manifest_dir, 'polymarket_orderbook_collector_*.json') +print(json.dumps({ + 'eligible': bool(raw_files) and bool(manifest_files), + 'min_age_seconds': min_age_seconds, + 'raw_eligible_count': len(raw_files), + 'manifest_eligible_count': len(manifest_files), + 'raw_sample': raw_files[:3], + 'manifest_sample': manifest_files[:3], +}, sort_keys=True)) +''' + +summary = { + 'schema_name': 'k8s_runtime_smoke_result', + 'schema_version': 1, + 'started_at_utc': started_at, + 'ended_at_utc': None, + 'gate_status': 'ERROR', + 'production_ready': False, + 'namespace': namespace, + 'deployment': deployment, + 'cronjob': cronjob, + 'raw_dir': raw_dir, + 'manifest_dir': manifest_dir, + 'upload_min_age_seconds': upload_min_age_seconds, + 'checks': checks, + 'failures': failures, +} + +try: + rollout = run([kubectl, '-n', namespace, 'rollout', 'status', f'deployment/{deployment}', f'--timeout={wait_seconds}s']) + if rollout['exit_code'] != 0: + raise RuntimeError('collector deployment rollout is not healthy') + pod_name, pod_obj = get_collector_pod() + before = wait_for_valid_collector(pod_name, 0, 'initial') + before_mtime = before['mtime'] + old_file 
= before['output_files'][0] + + delete_pod = run([kubectl, '-n', namespace, 'delete', 'pod', pod_name, '--wait=false']) + if delete_pod['exit_code'] != 0: + raise RuntimeError('failed to delete collector pod for restart test') + rollout_after = run([kubectl, '-n', namespace, 'rollout', 'status', f'deployment/{deployment}', f'--timeout={wait_seconds}s']) + if rollout_after['exit_code'] != 0: + raise RuntimeError('collector deployment did not recover after pod delete') + new_pod, new_pod_obj = get_collector_pod() + old_check = exec_python(new_pod, raw_check_code, [old_file['path'], old_file['actual_sha256'], str(old_file['rows_parsed'])]) + if not old_check.get('sha256_matches') or not old_check.get('row_count_matches'): + raise RuntimeError('old raw file changed or stopped parsing after pod restart') + + after = wait_for_valid_collector(new_pod, before_mtime, 'post_restart') + upload_eligibility = wait_for_upload_eligible_files(new_pod) + + upload_start_mtime = time.time() - 2 + job_name = 'orderbooks-uploader-smoke-' + dt.datetime.now(dt.UTC).strftime('%Y%m%dt%H%M%Sz').lower() + run([kubectl, '-n', namespace, 'delete', 'job', job_name, '--ignore-not-found=true']) + create_job = run([kubectl, '-n', namespace, 'create', 'job', job_name, f'--from=cronjob/{cronjob}']) + if create_job['exit_code'] != 0: + raise RuntimeError('failed to create uploader smoke job from CronJob') + wait_upload = run([kubectl, '-n', namespace, 'wait', '--for=condition=Complete', f'--timeout={wait_seconds}s', f'job/{job_name}']) + logs = run([kubectl, '-n', namespace, 'logs', f'job/{job_name}']) + if wait_upload['exit_code'] != 0: + raise RuntimeError('uploader smoke job did not complete') + upload = exec_python(new_pod, upload_validation_code, [manifest_dir, str(upload_start_mtime)]) + if not upload.get('valid'): + raise RuntimeError('upload manifest did not verify at least one file') + + summary.update({ + 'initial_collector_pod': pod_name, + 'post_restart_collector_pod': new_pod, + 
'before_restart_collector': before, + 'old_raw_file_after_restart': old_check, + 'after_restart_collector': after, + 'upload_eligibility': upload_eligibility, + 'uploader_job': job_name, + 'upload_result': upload, + 'uploader_log_check_exit_code': logs['exit_code'], + }) + summary['gate_status'] = 'PASS' +except Exception as exc: + failures.append(str(exc)) + summary['exception'] = repr(exc) + summary['gate_status'] = 'FAIL' +finally: + summary['ended_at_utc'] = iso_now() + evidence_path.parent.mkdir(parents=True, exist_ok=True) + evidence_path.write_text(json.dumps(summary, indent=2, sort_keys=True) + '\n', encoding='utf-8') + +print(f'K8S_SMOKE_EVIDENCE={evidence_path}') +print(f'K8S_SMOKE_GATE={summary["gate_status"]}') +if summary['gate_status'] != 'PASS': + sys.exit(1) +PY_SMOKE diff --git a/scripts/normalize_polymarket_orderbooks.py b/scripts/normalize_polymarket_orderbooks.py new file mode 100644 index 0000000..5af88eb --- /dev/null +++ b/scripts/normalize_polymarket_orderbooks.py @@ -0,0 +1,496 @@ +#!/usr/bin/env python3 +"""Normalize raw Polymarket order-book snapshots from the sample collector. + +Checkpoint 5 scope: derive a bounded normalized gzip JSONL sample from the raw +Checkpoint 4 sample. Raw files remain the source of truth; every normalized row +keeps the raw file path and gzip JSONL line number. 
def sha256_file(path: Path) -> str:
    """Return the hexadecimal SHA-256 digest of the file at ``path``.

    The file is opened in binary mode and consumed in 1 MiB reads so the whole
    content is never held in memory at once.
    """
    hasher = hashlib.sha256()
    with path.open("rb") as stream:
        while True:
            block = stream.read(1024 * 1024)
            if block == b"":
                # An empty read marks end-of-file.
                break
            hasher.update(block)
    return hasher.hexdigest()
ValueError(f"{field_name} is not finite: {value!r}") + return parsed + + +def decimal_to_json(value: Decimal | None) -> str | None: + if value is None: + return None + if value == 0: + return "0" + return format(value.normalize(), "f") + + +def load_json(path: Path) -> dict[str, Any]: + with path.open("r", encoding="utf-8") as handle: + data = json.load(handle) + if not isinstance(data, dict): + raise ValueError(f"{path} did not contain a JSON object") + return data + + +def resolve_repo_path(path_text: str) -> Path: + path = Path(path_text) + if path.is_absolute(): + return path + return Path.cwd() / path + + +def normalize_side(levels: Any, side_name: str) -> list[tuple[Decimal, Decimal]]: + if not isinstance(levels, list): + raise ValueError(f"raw.{side_name} is not a list") + normalized: list[tuple[Decimal, Decimal]] = [] + for index, level in enumerate(levels): + if not isinstance(level, dict): + raise ValueError(f"raw.{side_name}[{index}] is not an object") + price = decimal_from_raw(level.get("price"), f"raw.{side_name}[{index}].price") + size = decimal_from_raw(level.get("size"), f"raw.{side_name}[{index}].size") + if size < 0: + raise ValueError(f"raw.{side_name}[{index}].size is negative") + normalized.append((price, size)) + return normalized + + +def sum_sizes(levels: list[tuple[Decimal, Decimal]]) -> Decimal: + return sum((size for _, size in levels), Decimal("0")) + + +def normalize_raw_row(raw_row: dict[str, Any], raw_file: str, raw_line_number: int) -> dict[str, Any]: + raw_book = raw_row.get("raw") + market = raw_row.get("market") + collection = raw_row.get("collection") + if not isinstance(raw_book, dict): + raise ValueError("raw is not an object") + if not isinstance(market, dict): + raise ValueError("market is not an object") + if not isinstance(collection, dict): + raise ValueError("collection is not an object") + + bids = normalize_side(raw_book.get("bids"), "bids") + asks = normalize_side(raw_book.get("asks"), "asks") + + best_bid = 
def summarize_output(path: Path, rows: int) -> dict[str, Any]:
    """Describe a written output file for the normalization manifest.

    Args:
        path: Location of the output file. It must exist, because its size and
            SHA-256 digest are read from disk.
        rows: Row count to record for the file, stored exactly as given.

    Returns:
        A dict with ``path``, ``rows``, ``bytes``, ``sha256`` and a fixed
        ``status`` of ``"valid"``. An absolute ``path`` is reported relative to
        the current working directory when it lies inside it, and as the
        absolute path otherwise.
    """
    display_path = path
    if path.is_absolute():
        try:
            display_path = path.relative_to(Path.cwd())
        except ValueError:
            # relative_to() raises when the path is not below the working
            # directory; keep the absolute form instead of aborting the run
            # after the output has already been written.
            display_path = path
    return {
        "path": str(display_path),
        "rows": rows,
        "bytes": path.stat().st_size,
        "sha256": sha256_file(path),
        "status": "valid",
    }
not files: + raise ValueError("input manifest has no output_files") + summaries: list[dict[str, Any]] = [] + for file_entry in files: + if not isinstance(file_entry, dict): + raise ValueError("input manifest output_files entry is not an object") + path_text = file_entry.get("path") + if not isinstance(path_text, str) or not path_text: + raise ValueError("input manifest output_files entry lacks path") + path = resolve_repo_path(path_text) + if not path.exists(): + raise FileNotFoundError(path) + actual_sha = sha256_file(path) + expected_sha = file_entry.get("sha256") + checksum_match = expected_sha == actual_sha + summaries.append( + { + "path": path_text, + "rows_expected": file_entry.get("rows"), + "bytes": path.stat().st_size, + "sha256": actual_sha, + "input_manifest_sha256": expected_sha, + "checksum_match": checksum_match, + "status": "valid" if checksum_match else "invalid", + } + ) + return summaries + + +def read_and_normalize( + input_files: list[dict[str, Any]], + output_path: Path, +) -> tuple[int, int, list[dict[str, Any]], dict[str, Any]]: + raw_rows_read = 0 + normalized_rows_written = 0 + errors: list[dict[str, Any]] = [] + sanity = { + "raw_file_refs_present": True, + "raw_files_exist": True, + "spread_non_negative": True, + "midpoint_between_bid_ask": True, + "depth_totals_non_negative": True, + "outcomes_seen": [], + "gzip_jsonl_parseable": True, + "row_count_match": None, + } + outcomes_seen: set[str] = set() + + output_path.parent.mkdir(parents=True, exist_ok=True) + with gzip.open(output_path, "wt", encoding="utf-8", compresslevel=9) as output: + for file_entry in input_files: + raw_file = file_entry["path"] + raw_path = resolve_repo_path(raw_file) + if not raw_path.exists(): + sanity["raw_files_exist"] = False + errors.append({"raw_file": raw_file, "error": "raw file missing"}) + continue + + with gzip.open(raw_path, "rt", encoding="utf-8") as raw_handle: + for raw_line_number, line in enumerate(raw_handle, 1): + raw_rows_read += 1 + try: + 
raw_row = json.loads(line) + normalized = normalize_raw_row(raw_row, raw_file, raw_line_number) + output.write(json.dumps(normalized, sort_keys=True, separators=(",", ":")) + "\n") + normalized_rows_written += 1 + + if not normalized.get("raw_file") or not normalized.get("raw_line_number"): + sanity["raw_file_refs_present"] = False + if not resolve_repo_path(str(normalized["raw_file"])).exists(): + sanity["raw_files_exist"] = False + outcome = normalized.get("outcome") + if isinstance(outcome, str): + outcomes_seen.add(outcome) + + best_bid = Decimal(normalized["best_bid"]) if normalized["best_bid"] is not None else None + best_ask = Decimal(normalized["best_ask"]) if normalized["best_ask"] is not None else None + spread = Decimal(normalized["spread"]) if normalized["spread"] is not None else None + midpoint = Decimal(normalized["midpoint"]) if normalized["midpoint"] is not None else None + if best_bid is not None and best_ask is not None: + if spread is None or spread < 0: + sanity["spread_non_negative"] = False + if midpoint is None or midpoint < best_bid or midpoint > best_ask: + sanity["midpoint_between_bid_ask"] = False + depth_fields = [ + "bid_depth_total", + "ask_depth_total", + "bid_depth_within_1c", + "ask_depth_within_1c", + "bid_depth_within_2c", + "ask_depth_within_2c", + "bid_depth_within_5c", + "ask_depth_within_5c", + ] + for field in depth_fields: + if Decimal(normalized[field]) < 0: + sanity["depth_totals_non_negative"] = False + except Exception as exc: # noqa: BLE001 - preserve row-level failure evidence. 
def validate_output_gzip_jsonl(path: Path) -> tuple[bool, int, list[str]]:
    """Check that ``path`` decompresses as gzip and that each line is JSON.

    Returns:
        ``(ok, rows, messages)`` where ``ok`` is True when nothing failed,
        ``rows`` is the number of lines parsed before any failure, and
        ``messages`` holds the text of the first exception encountered (reading
        stops there), or is empty on success.
    """
    problems: list[str] = []
    good_rows = 0
    try:
        with gzip.open(path, "rt", encoding="utf-8") as stream:
            for text in stream:
                json.loads(text)
                # Count only after the line parsed cleanly.
                good_rows += 1
    except Exception as exc:  # noqa: BLE001 - validation result belongs in manifest.
        problems.append(str(exc))
    return len(problems) == 0, good_rows, problems
def main(argv: list[str]) -> int:
    """Run one normalization pass and write its manifest.

    Loads the collector manifest, normalizes every listed raw file into a single
    timestamped gzip JSONL output, re-validates that output, evaluates the gate
    checks, writes the normalization manifest and prints a JSON run summary.

    Args:
        argv: Command-line arguments without the program name.

    Returns:
        0 when the gate status is PASS, otherwise 1.
    """
    options = parse_args(argv)
    started = utc_now()
    collector_manifest = load_json(options.input_manifest)
    input_files = build_input_file_summary(collector_manifest)

    # The run id names both the output directory and the output file.
    run_id = compact_timestamp(started)
    output_name = f"polymarket_orderbooks_normalized_{run_id}.jsonl.gz"
    output_path = options.output_dir / "polymarket" / "orderbooks" / run_id / output_name

    rows_in, rows_out, failed_rows, sanity = read_and_normalize(input_files, output_path)
    gzip_ok, gzip_rows, gzip_errors = validate_output_gzip_jsonl(output_path)
    output_summary = summarize_output(output_path, rows_out)

    counts_agree = rows_out == rows_in
    sanity["output_row_count_equals_raw_input_row_count"] = counts_agree and not failed_rows
    sanity["gzip_jsonl_decompresses_and_parses"] = gzip_ok
    sanity["gzip_jsonl_rows_parsed"] = gzip_rows
    sanity["gzip_jsonl_errors"] = gzip_errors
    # Re-hash the file on disk and compare it with the digest recorded above.
    sanity["manifest_checksum_matches_output"] = sha256_file(output_path) == output_summary["sha256"]
    inputs_intact = all(entry["checksum_match"] for entry in input_files)
    sanity["all_input_file_checksums_match"] = inputs_intact

    secret_scan = scan_for_secret_terms([Path(__file__), output_path])
    sanity["checkpoint5_secret_scan_passed"] = secret_scan["passed"]

    # Every entry must be truthy, and at least one row must have been written.
    gate_inputs = (
        counts_agree,
        not failed_rows,
        sanity["raw_file_refs_present"],
        sanity["raw_files_exist"],
        sanity["spread_non_negative"],
        sanity["midpoint_between_bid_ask"],
        sanity["depth_totals_non_negative"],
        sanity["has_up_and_down"],
        gzip_ok,
        sanity["manifest_checksum_matches_output"],
        secret_scan["passed"],
        inputs_intact,
    )
    if all(gate_inputs) and rows_out > 0:
        gate_status = "PASS"
    else:
        gate_status = "FAIL"
    ended = utc_now()

    input_manifest_section = {
        "path": str(options.input_manifest),
        "sha256": sha256_file(options.input_manifest),
        "collector_manifest_schema_name": collector_manifest.get("schema_name"),
        "collector_gate_status": collector_manifest.get("gate_status"),
    }
    known_gaps = [
        "This is a derived sample extract only; raw gzip JSONL remains the source of truth.",
        "No upload, daemon runtime, systemd unit, dashboard, database, strategy, backtest, or trading behavior is included.",
        "The sample proves normalization logic on one bounded raw run, not long-run schema stability.",
    ]
    manifest = {
        "schema_name": "orderbook_normalization_sample_manifest",
        "schema_version": 1,
        "checkpoint_id": 5,
        "checkpoint_name": "Normalized Snapshot Extract",
        "normalizer": {
            "name": NORMALIZER_NAME,
            "version": NORMALIZER_VERSION,
        },
        "started_at_utc": iso_z(started),
        "ended_at_utc": iso_z(ended),
        "run_duration_seconds": round((ended - started).total_seconds(), 3),
        "command": "scripts/normalize_polymarket_orderbooks.py",
        "input_manifest": input_manifest_section,
        "input_files": input_files,
        "output_files": [output_summary],
        "raw_rows_read": rows_in,
        "normalized_rows_written": rows_out,
        "skipped_rows": len(failed_rows),
        "error_rows": failed_rows,
        "numeric_encoding": "Exact decimal values are emitted as JSON strings; missing price-derived values are null.",
        "sanity_checks": sanity,
        "secret_scan": secret_scan,
        "warnings": [],
        "known_gaps": known_gaps,
        "fake_progress_risk": "A clean normalized sample can hide raw collection gaps and endpoint schema drift; every row is therefore traceable to raw_file and raw_line_number, and reliability remains gated on later soak testing.",
        "next_step": "Checkpoint 6 should package the raw collector for a VPS runtime, or the orchestrator can request review of this normalized sample first.",
        "gate_status": gate_status,
    }

    options.manifest_path.parent.mkdir(parents=True, exist_ok=True)
    manifest_text = json.dumps(manifest, indent=2, sort_keys=True) + "\n"
    options.manifest_path.write_text(manifest_text, encoding="utf-8")

    run_summary = {
        "gate_status": gate_status,
        "manifest_path": str(options.manifest_path),
        "output_path": str(output_path),
        "raw_rows_read": rows_in,
        "normalized_rows_written": rows_out,
        "skipped_rows": len(failed_rows),
        "sha256": output_summary["sha256"],
    }
    print(json.dumps(run_summary, indent=2, sort_keys=True))
    return 0 if gate_status == "PASS" else 1
+""" + +from __future__ import annotations + +import argparse +import base64 +import datetime as dt +import hashlib +import json +import os +import socket +import ssl +import struct +import sys +import time +import urllib.error +import urllib.parse +import urllib.request +from pathlib import Path +from typing import Any + + +GAMMA_BASE = "https://gamma-api.polymarket.com" +CLOB_BASE = "https://clob.polymarket.com" +DATA_API_BASE = "https://data-api.polymarket.com" +MARKET_WS_URL = "wss://ws-subscriptions-clob.polymarket.com/ws/market" + +DEFAULT_PROBE_JSON = Path("data/probes/polymarket_public_sources_probe_v1.json") +DEFAULT_PROBE_MD = Path("data/probes/polymarket_public_sources_probe_v1.md") +DEFAULT_CHECKPOINT_REPORT = Path( + "reports/checkpoints/checkpoint_002_polymarket_public_sources.md" +) +DEFAULT_CHECKPOINT_MANIFEST = Path( + "data/manifests/checkpoint_002_polymarket_public_sources.json" +) + +OFFICIAL_SOURCES = [ + { + "name": "Fetching Markets", + "url": "https://docs.polymarket.com/market-data/fetching-markets.md", + "finding": "Use Gamma events with active=true&closed=false for active market discovery; events contain markets.", + }, + { + "name": "List markets", + "url": "https://docs.polymarket.com/api-reference/markets/list-markets.md", + "finding": "Gamma /markets supports active/closed, slug, tag_id, condition_ids, clob_token_ids, end_date, limit, offset, and sorting parameters.", + }, + { + "name": "Public search", + "url": "https://docs.polymarket.com/api-reference/search/search-markets-events-and-profiles.md", + "finding": "Gamma /public-search supports q, events_status, limit_per_type, search_tags, recurrence, and tag filters.", + }, + { + "name": "Get order book", + "url": "https://docs.polymarket.com/api-reference/market-data/get-order-book.md", + "finding": "CLOB GET /book takes token_id and returns an order book summary.", + }, + { + "name": "Get order books", + "url": 
"https://docs.polymarket.com/api-reference/market-data/get-order-books-request-body.md", + "finding": "CLOB POST /books takes an array of token_id objects and returns multiple book summaries.", + }, + { + "name": "Market websocket", + "url": "https://docs.polymarket.com/market-data/websocket/market-channel.md", + "finding": "Public websocket supports market subscriptions by outcome token asset IDs.", + }, + { + "name": "Recent trades", + "url": "https://docs.polymarket.com/api-reference/core/get-trades-for-a-user-or-markets.md", + "finding": "Data API GET /trades is public and can filter by condition ID in the market query parameter.", + }, + { + "name": "Authenticated CLOB trades", + "url": "https://docs.polymarket.com/api-reference/trade/get-trades.md", + "finding": "CLOB GET /trades exists but requires API-key authentication, so it is not used for this public-data checkpoint.", + }, + { + "name": "Rate limits", + "url": "https://docs.polymarket.com/api-reference/rate-limits.md", + "finding": "Official rate limits are documented for Gamma, Data API, CLOB market data, and websocket-adjacent endpoints.", + }, +] + +DOCUMENTED_RATE_LIMITS = { + "gamma": { + "base_url": GAMMA_BASE, + "general": "4,000 req / 10s", + "/events": "500 req / 10s", + "/markets": "300 req / 10s", + "/markets + /events listing": "900 req / 10s", + "/public-search": "350 req / 10s", + }, + "data_api": { + "base_url": DATA_API_BASE, + "general": "1,000 req / 10s", + "/trades": "200 req / 10s", + }, + "clob": { + "base_url": CLOB_BASE, + "general": "9,000 req / 10s", + "/book": "1,500 req / 10s", + "/books": "500 req / 10s", + "/price": "1,500 req / 10s", + "/prices": "500 req / 10s", + "/midpoint": "1,500 req / 10s", + "/midpoints": "500 req / 10s", + "/prices-history": "1,000 req / 10s", + }, +} + +SAFE_RESPONSE_HEADERS = { + "age", + "cache-control", + "cf-cache-status", + "cf-ray", + "content-encoding", + "content-length", + "content-type", + "date", + "expires", + "last-modified", + 
"ratelimit-limit", + "ratelimit-remaining", + "ratelimit-reset", + "retry-after", + "server", + "strict-transport-security", + "x-ratelimit-limit", + "x-ratelimit-remaining", + "x-ratelimit-reset", +} + + +def utc_now() -> dt.datetime: + return dt.datetime.now(dt.UTC) + + +def iso_z(value: dt.datetime | None = None) -> str: + value = value or utc_now() + return value.astimezone(dt.UTC).replace(microsecond=0).isoformat().replace("+00:00", "Z") + + +def parse_iso(value: Any) -> dt.datetime | None: + if not value or not isinstance(value, str): + return None + text = value.strip() + if not text: + return None + if text.endswith("Z"): + text = text[:-1] + "+00:00" + try: + parsed = dt.datetime.fromisoformat(text) + except ValueError: + return None + if parsed.tzinfo is None: + parsed = parsed.replace(tzinfo=dt.UTC) + return parsed.astimezone(dt.UTC) + + +def sha256_file(path: Path) -> str: + digest = hashlib.sha256() + with path.open("rb") as handle: + for chunk in iter(lambda: handle.read(1024 * 1024), b""): + digest.update(chunk) + return digest.hexdigest() + + +def normalize_params(params: dict[str, Any] | None) -> dict[str, Any] | None: + if not params: + return None + normalized: dict[str, Any] = {} + for key, value in params.items(): + if isinstance(value, bool): + normalized[key] = "true" if value else "false" + elif isinstance(value, list): + normalized[key] = [ + "true" if item is True else "false" if item is False else item + for item in value + ] + else: + normalized[key] = value + return normalized + + +def filter_headers(headers: Any) -> dict[str, str]: + safe: dict[str, str] = {} + for key, value in dict(headers).items(): + lower = key.lower() + if lower in SAFE_RESPONSE_HEADERS: + safe[key] = value + return safe + + +def rate_limit_headers(headers: dict[str, str]) -> dict[str, str]: + result: dict[str, str] = {} + for key, value in headers.items(): + lower = key.lower() + if "ratelimit" in lower or lower == "retry-after": + result[key] = value + return 
result + + +def decode_json_maybe(text: str) -> tuple[Any | None, str | None]: + try: + return json.loads(text), None + except json.JSONDecodeError as exc: + return None, str(exc) + + +def encode_url(url: str, params: dict[str, Any] | None = None) -> str: + params = normalize_params(params) + if not params: + return url + query = urllib.parse.urlencode(params, doseq=True) + separator = "&" if urllib.parse.urlparse(url).query else "?" + return f"{url}{separator}{query}" + + +def http_json_request( + name: str, + method: str, + url: str, + *, + params: dict[str, Any] | None = None, + json_body: Any | None = None, + timeout_seconds: float = 15.0, +) -> dict[str, Any]: + started_monotonic = time.monotonic() + started_at = iso_z() + full_url = encode_url(url, params) + headers = { + "Accept": "application/json", + "User-Agent": "orderbooks-checkpoint-2-probe/1.0", + } + data = None + if json_body is not None: + data = json.dumps(json_body, separators=(",", ":")).encode("utf-8") + headers["Content-Type"] = "application/json" + request = urllib.request.Request( + full_url, + data=data, + headers=headers, + method=method.upper(), + ) + + status_code: int | None = None + response_headers: dict[str, str] = {} + response_text = "" + error: str | None = None + try: + with urllib.request.urlopen(request, timeout=timeout_seconds) as response: + status_code = response.status + response_headers = filter_headers(response.headers) + response_text = response.read().decode("utf-8", errors="replace") + except urllib.error.HTTPError as exc: + status_code = exc.code + response_headers = filter_headers(exc.headers) + response_text = exc.read().decode("utf-8", errors="replace") + error = f"HTTPError: {exc}" + except Exception as exc: # noqa: BLE001 - preserve probe failure evidence + error = f"{type(exc).__name__}: {exc}" + + duration_ms = round((time.monotonic() - started_monotonic) * 1000, 3) + parsed_json, json_error = decode_json_maybe(response_text) if response_text else (None, None) 
def coerce_json_array(value: Any) -> list[Any]:
    """Return ``value`` as a list, decoding JSON-encoded strings when needed.

    A list is returned as-is (same object). A string is decoded as JSON and
    returned only if the result is a list. Everything else, including
    undecodable strings, yields an empty list.
    """
    if isinstance(value, list):
        return value
    if not isinstance(value, str):
        return []
    try:
        decoded = json.loads(value)
    except json.JSONDecodeError:
        decoded = None
    if isinstance(decoded, list):
        return decoded
    return []
def candidate_record(event: dict[str, Any], now: dt.datetime, min_lead_seconds: int) -> dict[str, Any] | None:
    """Build a scored candidate record from an event's first market.

    Returns ``None`` when the event has no usable first market, or when that
    market lists no CLOB token ids or no outcomes. Otherwise returns a dict
    holding the score, the event and market objects, formatted end times, and
    the flags the score was derived from.

    Args:
        event: Event object as returned by the discovery endpoint.
        now: Reference time used for the lead-time and expiry comparisons.
        min_lead_seconds: Minimum seconds the end time must lie ahead of
            ``now`` for ``has_future_lead`` to be true.
    """
    market = first_market(event)
    if not market:
        return None

    token_ids = [str(entry) for entry in coerce_json_array(market.get("clobTokenIds"))]
    outcomes = [str(entry) for entry in coerce_json_array(market.get("outcomes"))]
    if len(token_ids) < 1 or not outcomes:
        return None

    event_end = parse_iso(event.get("endDate"))
    market_end = parse_iso(market.get("endDate"))
    # The market's own end time wins; the event's is the fallback.
    deadline = market_end or event_end
    has_future_lead = bool(deadline) and deadline >= now + dt.timedelta(seconds=min_lead_seconds)
    accepting_orders = market.get("acceptingOrders") is True
    active_open = all(
        (
            event.get("active") is True,
            event.get("closed") is False,
            market.get("active") is True,
            market.get("closed") is False,
        )
    )
    in_btc_series = str(event.get("seriesSlug") or "").startswith("btc-up-or-down")

    # (points, applies) pairs; the score is the sum of points that apply.
    adjustments = (
        (1000, accepting_orders),
        (500, has_future_lead),
        (200, active_open),
        (100, in_btc_series),
        (50, len(token_ids) >= 2 and len(outcomes) >= 2),
        (-1000, not active_open),
        (-250, bool(deadline) and deadline < now),
    )
    score = sum(points for points, applies in adjustments if applies)

    return {
        "score": score,
        "event": event,
        "market": market,
        "event_end_utc": iso_z(event_end) if event_end else None,
        "market_end_utc": iso_z(market_end) if market_end else None,
        "has_future_lead": has_future_lead,
        "accepting_orders": accepting_orders,
        "active_open": active_open,
        "token_ids": token_ids,
        "outcomes": outcomes,
    }
def field_names(payload: Any) -> list[str]:
    """List the distinct keys found in a payload, sorted.

    Args:
        payload: A dict (its own keys are used), a list (keys of dict items
            among the first 10 elements are used), or anything else.

    Returns:
        Sorted list of the keys converted with ``str``; empty for payloads
        that are neither a dict nor a list.
    """
    if isinstance(payload, dict):
        sampled = [payload]
    elif isinstance(payload, list):
        # Only the first 10 entries are inspected; non-dict entries are ignored.
        sampled = [entry for entry in payload[:10] if isinstance(entry, dict)]
    else:
        sampled = []
    return sorted({str(name) for mapping in sampled for name in mapping.keys()})
def send_ws_frame(sock: ssl.SSLSocket, opcode: int, payload: bytes) -> None:
    """Send one final, masked WebSocket frame over ``sock``.

    Args:
        sock: Connected socket; only ``sendall`` is used.
        opcode: WebSocket opcode, OR-ed with the FIN bit (0x80) in byte one.
        payload: Raw payload bytes; they are XOR-masked with a fresh random
            4-byte key before being sent.
    """
    masking_key = os.urandom(4)
    size = len(payload)

    frame = bytearray()
    frame.append(0x80 | opcode)
    # Byte two always carries the mask bit (0x80) plus the length marker.
    if size < 126:
        frame.append(0x80 | size)
    elif size < 65536:
        frame.append(0x80 | 126)
        frame += struct.pack("!H", size)
    else:
        frame.append(0x80 | 127)
        frame += struct.pack("!Q", size)
    frame += masking_key
    frame += bytes(
        value ^ masking_key[position % 4] for position, value in enumerate(payload)
    )
    sock.sendall(frame)
class _PrefetchedSocket:
    """Socket wrapper that first replays bytes already read past the handshake."""

    def __init__(self, sock: Any, pending: bytes) -> None:
        self._sock = sock
        self._pending = pending

    def recv(self, size: int) -> bytes:
        # Serve buffered bytes before touching the real socket again.
        if self._pending:
            chunk, self._pending = self._pending[:size], self._pending[size:]
            return chunk
        return self._sock.recv(size)

    def sendall(self, data: bytes) -> None:
        self._sock.sendall(data)

    def settimeout(self, value: float | None) -> None:
        self._sock.settimeout(value)

    def close(self) -> None:
        self._sock.close()


def websocket_probe(
    url: str,
    token_ids: list[str],
    *,
    timeout_seconds: float,
    max_messages: int,
) -> dict[str, Any]:
    """Run a bounded probe against a market websocket and record the evidence.

    Opens a TLS connection, performs the HTTP upgrade handshake, sends one
    market subscription for ``token_ids`` and then reads frames until either
    ``max_messages`` entries were recorded, a close frame arrives, or the
    deadline passes. Ping frames are answered with a pong and not recorded.

    Args:
        url: Websocket URL; host, port (default 443), path and query are used.
        token_ids: Asset IDs placed in the subscription's ``assets_ids``.
        timeout_seconds: Connect/handshake timeout and overall read budget.
        max_messages: Maximum number of recorded messages.

    Returns:
        A result dict with request details, handshake status line and headers,
        recorded messages, sorted distinct event types, ``ok`` and ``error``.
        Failures are captured in ``error`` instead of being raised. A URL
        without a host yields ``{"ok": False, "error": ...}`` immediately.
    """
    started_monotonic = time.monotonic()
    started_at = iso_z()
    parsed = urllib.parse.urlparse(url)
    host = parsed.hostname
    if not host:
        return {"ok": False, "error": "missing websocket host"}
    port = parsed.port or 443
    path = parsed.path or "/"
    if parsed.query:
        path = f"{path}?{parsed.query}"
    subscription = {
        "assets_ids": token_ids,
        "type": "market",
        "custom_feature_enabled": True,
    }
    result: dict[str, Any] = {
        "name": "clob_market_websocket",
        "started_at_utc": started_at,
        "request": {
            "url": url,
            "subscription": subscription,
            "max_messages": max_messages,
            "timeout_seconds": timeout_seconds,
        },
        "handshake": {},
        "messages": [],
        "message_event_types": [],
        "ok": False,
        "error": None,
    }
    raw_sock: socket.socket | None = None
    sock: ssl.SSLSocket | None = None
    try:
        raw_sock = socket.create_connection((host, port), timeout=timeout_seconds)
        sock = ssl.create_default_context().wrap_socket(raw_sock, server_hostname=host)
        sock.settimeout(timeout_seconds)
        key = base64.b64encode(os.urandom(16)).decode("ascii")
        request = (
            f"GET {path} HTTP/1.1\r\n"
            f"Host: {host}\r\n"
            "Upgrade: websocket\r\n"
            "Connection: Upgrade\r\n"
            f"Sec-WebSocket-Key: {key}\r\n"
            "Sec-WebSocket-Version: 13\r\n"
            "User-Agent: orderbooks-checkpoint-2-probe/1.0\r\n"
            "\r\n"
        )
        sock.sendall(request.encode("ascii"))
        raw_headers = bytearray()
        while b"\r\n\r\n" not in raw_headers:
            chunk = sock.recv(4096)
            if not chunk:
                # An empty read means the peer closed; without this check the
                # loop would spin forever waiting for the header terminator.
                raise EOFError("websocket connection closed during handshake")
            raw_headers.extend(chunk)
            if len(raw_headers) > 65536:
                raise ValueError("websocket handshake headers exceeded 64 KiB")
        header_bytes, _, leftover = bytes(raw_headers).partition(b"\r\n\r\n")
        header_text = header_bytes.decode("iso-8859-1", errors="replace")
        status_line, response_headers = parse_ws_headers(header_text)
        result["handshake"] = {
            "status_line": status_line,
            "headers": response_headers,
            "observed_rate_limit_headers": rate_limit_headers(response_headers),
        }
        if " 101 " not in status_line:
            raise ValueError(f"websocket upgrade failed: {status_line}")

        # Bytes read past the header terminator already belong to the first
        # frame(s); replay them so frame parsing stays aligned.
        frame_sock: Any = _PrefetchedSocket(sock, leftover) if leftover else sock

        send_ws_frame(frame_sock, 0x1, json.dumps(subscription).encode("utf-8"))
        deadline = time.monotonic() + timeout_seconds
        while time.monotonic() < deadline and len(result["messages"]) < max_messages:
            remaining = max(0.1, deadline - time.monotonic())
            frame_sock.settimeout(remaining)
            opcode, payload_bytes = read_ws_frame(frame_sock)
            if opcode == 0x8:
                # Close frame: record it and stop reading.
                result["messages"].append({"opcode": opcode, "close": True})
                break
            if opcode == 0x9:
                # Ping: answer with a pong carrying the same payload.
                send_ws_frame(frame_sock, 0xA, payload_bytes)
                continue
            if opcode != 0x1:
                # Non-text frames are recorded by size only.
                result["messages"].append(
                    {"opcode": opcode, "payload_length_bytes": len(payload_bytes)}
                )
                continue
            text = payload_bytes.decode("utf-8", errors="replace")
            parsed_payload, json_error = decode_json_maybe(text)
            event_types = classify_ws_payload(parsed_payload)
            result["message_event_types"].extend(event_types)
            result["messages"].append(
                {
                    "opcode": opcode,
                    "payload_length_bytes": len(payload_bytes),
                    "json": parsed_payload,
                    "json_error": json_error,
                    "event_types": event_types,
                    "text_preview": None if parsed_payload is not None else text[:1000],
                }
            )
        result["ok"] = bool(result["messages"])
    except Exception as exc:  # noqa: BLE001 - preserve probe failure evidence
        result["error"] = f"{type(exc).__name__}: {exc}"
    finally:
        if sock is not None:
            try:
                # Best-effort close frame; the connection may already be gone.
                send_ws_frame(sock, 0x8, b"")
            except Exception:
                pass
        # If the TLS wrap failed, only the raw TCP socket exists; close that
        # one instead so the connection is not leaked.
        closable = sock if sock is not None else raw_sock
        if closable is not None:
            try:
                closable.close()
            except Exception:
                pass
    result["ended_at_utc"] = iso_z()
    result["duration_ms"] = round((time.monotonic() - started_monotonic) * 1000, 3)
    result["message_event_types"] = sorted(set(result["message_event_types"]))
    return result
None} + for index, outcome in enumerate(outcomes) + ] + book_payload = request_json_payload(requests["clob_get_book"]) + books_payload = request_json_payload(requests["clob_post_books"]) + trades_payload = request_json_payload(requests["data_api_recent_trades"]) + + return { + "active_market_discovery": { + "endpoint": f"{GAMMA_BASE}/events", + "method": "GET", + "params_used": requests["gamma_events_bitcoin_tag"]["request"]["params"], + "answer": "Use Gamma /events with active=true&closed=false and pagination; events include their markets. /markets can fetch individual market records by slug, condition_ids, or clob_token_ids.", + }, + "btc_up_down_filtering": { + "answer": "Filter Gamma events/markets by Bitcoin tag evidence, seriesSlug beginning btc-up-or-down, text containing BTC/Bitcoin plus Up/Down, and market outcomes exactly ['Up', 'Down'].", + "source_fields": [ + "event.seriesSlug", + "event.tags[].slug or label", + "event.title", + "event.slug", + "market.outcomes", + "market.clobTokenIds", + ], + }, + "condition_and_token_resolution": { + "condition_id_field": "market.conditionId", + "outcomes_field": "market.outcomes", + "token_ids_field": "market.clobTokenIds", + "mapping_rule": "Parse outcomes and clobTokenIds as arrays and map by index.", + "selected_condition_id": market.get("conditionId"), + "selected_outcome_tokens": outcome_tokens, + }, + "single_order_book": { + "endpoint": f"{CLOB_BASE}/book", + "method": "GET", + "params": {"token_id": token_ids[0]}, + "status_code": requests["clob_get_book"]["response"]["status_code"], + "field_count": len(field_names(book_payload)), + "bid_levels": len(book_payload.get("bids", [])) if isinstance(book_payload, dict) else None, + "ask_levels": len(book_payload.get("asks", [])) if isinstance(book_payload, dict) else None, + }, + "batch_order_books": { + "endpoint": f"{CLOB_BASE}/books", + "method": "POST", + "json_body_shape": [{"token_id": ""}], + "status_code": 
requests["clob_post_books"]["response"]["status_code"], + "book_count": len(books_payload) if isinstance(books_payload, list) else None, + }, + "market_websocket": { + "endpoint": MARKET_WS_URL, + "subscription_shape": { + "assets_ids": ["", ""], + "type": "market", + "custom_feature_enabled": True, + }, + "probe_ok": websocket.get("ok"), + "message_count": len(websocket.get("messages", [])), + "event_types_observed": websocket.get("message_event_types", []), + "note": "Initial observed snapshot may arrive as a JSON array of book objects without event_type, followed by event-typed updates.", + }, + "trades": { + "public_recent_trades_endpoint": f"{DATA_API_BASE}/trades", + "public_recent_trades_method": "GET", + "params": {"market": market.get("conditionId"), "limit": 10, "offset": 0}, + "public_recent_trades_status_code": requests["data_api_recent_trades"]["response"]["status_code"], + "public_recent_trade_count": len(trades_payload) if isinstance(trades_payload, list) else None, + "websocket_trade_event": "market websocket documents and can emit last_trade_price for subscribed assets", + "excluded_endpoint": "CLOB GET /trades requires readonly or level 2 API key authentication and was not used.", + }, + "rate_limits": { + "documented": DOCUMENTED_RATE_LIMITS, + "observed_headers": { + name: record["response"].get("observed_rate_limit_headers", {}) + for name, record in requests.items() + } + | { + "clob_market_websocket": websocket.get("handshake", {}).get( + "observed_rate_limit_headers", {} + ) + }, + "observed_note": "The bounded probe did not intentionally approach limits; absence of rate-limit headers is not a limit test.", + }, + "timestamps": { + "gamma_metadata": [ + "startDate", + "creationDate", + "endDate", + "createdAt", + "updatedAt", + "acceptingOrdersTimestamp", + "eventStartTime", + ], + "clob_book": "timestamp string in order-book payload; observed as Unix epoch milliseconds.", + "market_websocket": "timestamp string in websocket book/price/trade 
def markdown_table_row(values: list[Any]) -> str:
    """Render ``values`` as a single Markdown table row.

    Each value is converted with ``str``. Line breaks are flattened to spaces
    and pipe characters are backslash-escaped, because an unescaped ``|``
    inside a cell would be read as a column separator and shift every
    following column.

    Args:
        values: Cell values in column order.

    Returns:
        The row text, e.g. ``"| a | b |"``.
    """
    cells = []
    for value in values:
        text = str(value)
        # Treat CRLF as one break so it collapses to a single space.
        text = text.replace("\r\n", " ").replace("\r", " ").replace("\n", " ")
        cells.append(text.replace("|", "\\|"))
    return "| " + " | ".join(cells) + " |"
filtered?", + endpoint_findings["btc_up_down_filtering"]["answer"], + ] + ), + markdown_table_row( + [ + "How are conditionId and token IDs resolved?", + endpoint_findings["condition_and_token_resolution"]["mapping_rule"], + ] + ), + markdown_table_row( + [ + "How is the current order book fetched?", + f"GET {CLOB_BASE}/book?token_id=", + ] + ), + markdown_table_row( + [ + "Is there a batch order-book endpoint?", + f"Yes: POST {CLOB_BASE}/books with an array of token_id objects.", + ] + ), + markdown_table_row( + [ + "Is there a market websocket?", + f"Yes: {MARKET_WS_URL}; bounded probe ok={endpoint_findings['market_websocket']['probe_ok']}.", + ] + ), + markdown_table_row( + [ + "Is there a trade websocket or recent trades endpoint?", + f"Market websocket can emit last_trade_price; public recent trades are at GET {DATA_API_BASE}/trades?market=.", + ] + ), + markdown_table_row( + [ + "What rate limits are documented or observed?", + "Official docs list Gamma, Data API, and CLOB limits; this bounded probe observed no Retry-After or rate-limit headers.", + ] + ), + markdown_table_row( + [ + "What fields are returned?", + "See field summary in the JSON artifact; key fields include conditionId, outcomes, clobTokenIds, bids, asks, timestamp, hash, price, size.", + ] + ), + markdown_table_row( + [ + "What timestamps exist?", + "Gamma ISO date fields, CLOB/websocket epoch-millisecond strings, Data API trade epoch seconds, and probe request timestamps.", + ] + ), + "", + "## Endpoint Evidence", + "", + markdown_table_row(["Name", "Method", "URL", "Status", "Duration ms"]), + markdown_table_row(["---", "---", "---", "---", "---"]), + ] + for name, record in probe["requests"].items(): + request = record["request"] + response = record["response"] + lines.append( + markdown_table_row( + [ + name, + request["method"], + request["full_url"], + response["status_code"], + record["duration_ms"], + ] + ) + ) + ws = probe["websocket_probe"] + lines.extend( + [ + markdown_table_row( 
+ [ + "clob_market_websocket", + "WSS", + MARKET_WS_URL, + ws.get("handshake", {}).get("status_line"), + ws.get("duration_ms"), + ] + ), + "", + "## Field Summary", + "", + "The full raw JSON payloads and websocket messages are preserved in the JSON probe artifact.", + "", + "```json", + json.dumps(probe["field_summary"], indent=2, sort_keys=True), + "```", + "", + "## Rate Limits", + "", + "Documented limits from official docs:", + "", + "```json", + json.dumps(DOCUMENTED_RATE_LIMITS, indent=2, sort_keys=True), + "```", + "", + "Observed rate-limit headers in this bounded run:", + "", + "```json", + json.dumps(endpoint_findings["rate_limits"]["observed_headers"], indent=2, sort_keys=True), + "```", + "", + "## Validation Evidence", + "", + markdown_table_row(["Check", "Result"]), + markdown_table_row(["---", "---"]), + markdown_table_row(["market_metadata_fetched", validation["market_metadata_fetched"]]), + markdown_table_row(["single_order_book_fetched", validation["single_order_book_fetched"]]), + markdown_table_row(["batch_order_books_fetched", validation["batch_order_books_fetched"]]), + markdown_table_row(["recent_trades_checked", validation["recent_trades_checked"]]), + markdown_table_row(["websocket_checked", validation["websocket_checked"]]), + "", + "## Official Sources", + "", + ] + ) + for source in OFFICIAL_SOURCES: + lines.append(f"- [{source['name']}]({source['url']}): {source['finding']}") + lines.extend( + [ + "", + "## Strongest Fake-Progress Risk", + "", + probe["fake_progress_risk"], + "", + "## Next Smallest Step", + "", + probe["next_step"], + "", + ] + ) + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text("\n".join(lines), encoding="utf-8") + + +def write_checkpoint_report(probe: dict[str, Any], path: Path) -> None: + gate = probe["gate"] + files = [ + "scripts/probe_polymarket_public_sources.py", + str(DEFAULT_PROBE_JSON), + str(DEFAULT_PROBE_MD), + str(DEFAULT_CHECKPOINT_MANIFEST), + str(DEFAULT_CHECKPOINT_REPORT), + ] + 
lines = [ + "# Checkpoint 2: Polymarket Public Data Source Probe", + "", + f"Gate: {gate['status']}", + "", + f"Started at UTC: {probe['started_at_utc']}", + f"Ended at UTC: {probe['ended_at_utc']}", + "", + "## Scope", + "", + "Built a bounded public Polymarket source probe. Explicitly excluded collector implementation, polling, dashboards, databases, trading, order placement, wallet logic, private endpoints, and secrets.", + "", + "## Files Created Or Changed", + "", + markdown_table_row(["Path", "Kind", "Status"]), + markdown_table_row(["---", "---", "---"]), + ] + kind_by_path = { + "scripts/probe_polymarket_public_sources.py": "bounded probe script", + str(DEFAULT_PROBE_JSON): "raw probe evidence", + str(DEFAULT_PROBE_MD): "probe report", + str(DEFAULT_CHECKPOINT_MANIFEST): "checkpoint manifest", + str(DEFAULT_CHECKPOINT_REPORT): "checkpoint report", + } + for file_path in files: + lines.append(markdown_table_row([file_path, kind_by_path[file_path], "valid"])) + lines.extend( + [ + "", + "## Validation", + "", + "Commands run by the builder:", + "", + "```sh", + probe["command"], + "```", + "", + f"Result: {gate['status']} - {gate['reason']}", + "", + "Evidence summary:", + "", + markdown_table_row(["Evidence", "Result"]), + markdown_table_row(["---", "---"]), + markdown_table_row(["selected_condition_id", probe["selected_market"].get("condition_id")]), + markdown_table_row(["selected_tokens", json.dumps(probe["selected_market"].get("clob_token_ids"))]), + markdown_table_row(["GET /book status", probe["requests"]["clob_get_book"]["response"]["status_code"]]), + markdown_table_row(["POST /books status", probe["requests"]["clob_post_books"]["response"]["status_code"]]), + markdown_table_row(["GET /trades status", probe["requests"]["data_api_recent_trades"]["response"]["status_code"]]), + markdown_table_row(["websocket ok", probe["websocket_probe"].get("ok")]), + "", + "## Endpoint Findings", + "", + "- Active discovery: Gamma `GET 
def write_checkpoint_manifest(
    probe: dict[str, Any],
    path: Path,
    artifacts: list[dict[str, Any]],
) -> None:
    """Write the checkpoint manifest JSON file for a finished probe.

    Args:
        probe: Probe result; reads ``gate``, ``started_at_utc``,
            ``ended_at_utc``, ``command``, ``validation_summary``,
            ``fake_progress_risk`` and ``next_step``.
        path: Destination file; missing parent directories are created.
        artifacts: Artifact descriptors stored verbatim under ``artifacts``.
    """
    gate = probe["gate"]
    command_result = "exit_code_0" if gate["status"] == "PASS" else "completed"

    validation_section = {
        "commands": [
            {
                "command": probe["command"],
                "result": command_result,
                "summary": gate["reason"],
            }
        ],
        "summary": probe["validation_summary"],
    }
    decision_log = [
        {
            "decision": "Use Gamma events plus market records for discovery instead of adding a generic discovery framework.",
            "reason": "Checkpoint 2 only needs source identification; Checkpoint 3 can turn this into a small BTC discovery script.",
        },
        {
            "decision": "Use public CLOB market-data endpoints and public Data API trades; exclude authenticated CLOB trade endpoints.",
            "reason": "Project rules require public data only and no secrets.",
        },
    ]
    assumption_notes = [
        "Gamma market outcomes and clobTokenIds arrays align by index.",
        "CLOB/websocket order-book timestamps observed as epoch milliseconds should be preserved raw until later normalization confirms semantics.",
        "Data API public trade timestamps observed as epoch seconds should be preserved raw until later normalization confirms semantics.",
    ]

    manifest: dict[str, Any] = {
        "schema_name": "checkpoint_manifest",
        "schema_version": 1,
        "checkpoint_id": 2,
        "checkpoint_name": "Polymarket Public Data Source Probe",
        "status": gate["status"],
        "started_at_utc": probe["started_at_utc"],
        "ended_at_utc": probe["ended_at_utc"],
        "scope": "Bounded public endpoint probe only; no collector, trading, wallet, private endpoint, database, dashboard, or generic multi-market implementation.",
        "artifacts": artifacts,
        "validation": validation_section,
        "decisions": decision_log,
        "assumptions": assumption_notes,
        "fake_progress_risk": probe["fake_progress_risk"],
        "next_step": probe["next_step"],
    }

    path.parent.mkdir(parents=True, exist_ok=True)
    # Keys are sorted on output, so the literal order above is cosmetic.
    serialized = json.dumps(manifest, indent=2, sort_keys=True)
    path.write_text(serialized + "\n", encoding="utf-8")
event_sources.extend(events_payload) + search_payload = request_json_payload(requests["gamma_public_search_btc_up_down"]) + if isinstance(search_payload, dict) and isinstance(search_payload.get("events"), list): + event_sources.extend(search_payload["events"]) + + selected, candidates = select_btc_up_down_market( + event_sources, now, args.min_future_lead_seconds + ) + if selected is None: + raise RuntimeError("No BTC up/down market candidate with CLOB token IDs found") + + selected_summary = summarize_candidate(selected) + market_slug = selected["market"].get("slug") or selected["event"].get("slug") + requests["gamma_market_by_slug"] = http_json_request( + "gamma_market_by_slug", + "GET", + f"{GAMMA_BASE}/markets", + params={"slug": market_slug}, + timeout_seconds=args.http_timeout, + ) + + token_ids = selected["token_ids"] + condition_id = selected["market"].get("conditionId") + requests["clob_get_book"] = http_json_request( + "clob_get_book", + "GET", + f"{CLOB_BASE}/book", + params={"token_id": token_ids[0]}, + timeout_seconds=args.http_timeout, + ) + requests["clob_post_books"] = http_json_request( + "clob_post_books", + "POST", + f"{CLOB_BASE}/books", + json_body=[{"token_id": token_id} for token_id in token_ids[:2]], + timeout_seconds=args.http_timeout, + ) + requests["data_api_recent_trades"] = http_json_request( + "data_api_recent_trades", + "GET", + f"{DATA_API_BASE}/trades", + params={"market": condition_id, "limit": args.trades_limit, "offset": 0}, + timeout_seconds=args.http_timeout, + ) + + if args.skip_websocket: + ws_probe = { + "name": "clob_market_websocket", + "started_at_utc": iso_z(), + "ended_at_utc": iso_z(), + "duration_ms": 0, + "request": { + "url": MARKET_WS_URL, + "subscription": { + "assets_ids": token_ids[:2], + "type": "market", + "custom_feature_enabled": True, + }, + "max_messages": args.websocket_messages, + "timeout_seconds": args.websocket_timeout, + }, + "handshake": {}, + "messages": [], + "message_event_types": [], + "ok": 
False, + "error": "Skipped by --skip-websocket", + } + else: + ws_probe = websocket_probe( + MARKET_WS_URL, + token_ids[:2], + timeout_seconds=args.websocket_timeout, + max_messages=args.websocket_messages, + ) + + market_payload = request_json_payload(requests["gamma_market_by_slug"]) + book_payload = request_json_payload(requests["clob_get_book"]) + books_payload = request_json_payload(requests["clob_post_books"]) + trades_payload = request_json_payload(requests["data_api_recent_trades"]) + validation_summary = { + "market_metadata_fetched": bool( + requests["gamma_market_by_slug"]["ok"] + and isinstance(market_payload, list) + and len(market_payload) >= 1 + ), + "single_order_book_fetched": bool( + requests["clob_get_book"]["ok"] + and isinstance(book_payload, dict) + and book_payload.get("asset_id") + and isinstance(book_payload.get("bids"), list) + and isinstance(book_payload.get("asks"), list) + ), + "batch_order_books_fetched": bool( + requests["clob_post_books"]["ok"] + and isinstance(books_payload, list) + and len(books_payload) >= 1 + ), + "recent_trades_checked": bool( + requests["data_api_recent_trades"]["ok"] and isinstance(trades_payload, list) + ), + "websocket_checked": bool(ws_probe.get("ok")), + } + pass_condition_met = ( + validation_summary["market_metadata_fetched"] + and validation_summary["single_order_book_fetched"] + ) + gate_status = "PASS" if pass_condition_met else "FAIL" + reason = ( + "Fetched at least one active market metadata record and one current CLOB order book." + if pass_condition_met + else "Did not fetch both required active market metadata and current order book evidence." 
+ ) + + probe: dict[str, Any] = { + "schema_name": "polymarket_public_sources_probe", + "schema_version": 1, + "artifact_status": "valid" if pass_condition_met else "partial", + "checkpoint_id": 2, + "checkpoint_name": "Polymarket Public Data Source Probe", + "started_at_utc": started_at, + "ended_at_utc": iso_z(), + "command": command, + "scope": "Bounded public endpoint probe only; no collector implementation.", + "official_sources": OFFICIAL_SOURCES, + "selected_market": selected_summary, + "candidate_markets_considered": candidates, + "requests": requests, + "websocket_probe": ws_probe, + "validation_summary": validation_summary, + "gate": {"status": gate_status, "reason": reason}, + "fake_progress_risk": "Mistaking one successful short probe for a reliable collector. This checkpoint only proves endpoint availability and payload shape at probe time.", + "next_step": "Checkpoint 3: build a small BTC market discovery script that reliably outputs current active BTC up/down markets with condition IDs and both outcome token IDs.", + } + probe["field_summary"] = top_level_field_summary(requests, ws_probe) + probe["endpoint_findings"] = build_endpoint_findings(selected, requests, ws_probe) + return probe + + +def write_outputs(args: argparse.Namespace, probe: dict[str, Any]) -> None: + args.output_json.parent.mkdir(parents=True, exist_ok=True) + args.output_json.write_text( + json.dumps(probe, indent=2, sort_keys=True) + "\n", encoding="utf-8" + ) + write_probe_markdown(probe, args.output_markdown) + write_checkpoint_report(probe, args.checkpoint_report) + + artifact_paths = [ + ("scripts/probe_polymarket_public_sources.py", "bounded_probe_script"), + (args.output_json.as_posix(), "raw_probe_evidence"), + (args.output_markdown.as_posix(), "probe_report"), + (args.checkpoint_report.as_posix(), "checkpoint_report"), + ] + artifacts = [] + for artifact_path, kind in artifact_paths: + path = Path(artifact_path) + artifacts.append( + { + "path": artifact_path, + "kind": 
def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse the probe's command-line options.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``, which is the behaviour of calling this
            function without arguments; passing a list lets callers and
            tests drive the parser without touching process state.

    Returns:
        Namespace with the output paths, request limits, timeouts, the
        minimum future lead in seconds, and the ``skip_websocket`` flag.
    """
    parser = argparse.ArgumentParser(
        description="Probe public Polymarket data sources for Checkpoint 2."
    )
    parser.add_argument("--output-json", type=Path, default=DEFAULT_PROBE_JSON)
    parser.add_argument("--output-markdown", type=Path, default=DEFAULT_PROBE_MD)
    parser.add_argument("--checkpoint-report", type=Path, default=DEFAULT_CHECKPOINT_REPORT)
    parser.add_argument(
        "--checkpoint-manifest", type=Path, default=DEFAULT_CHECKPOINT_MANIFEST
    )
    parser.add_argument("--events-limit", type=int, default=100)
    parser.add_argument("--search-limit", type=int, default=20)
    parser.add_argument("--trades-limit", type=int, default=10)
    parser.add_argument("--http-timeout", type=float, default=15.0)
    parser.add_argument("--websocket-timeout", type=float, default=8.0)
    parser.add_argument("--websocket-messages", type=int, default=3)
    parser.add_argument("--min-future-lead-seconds", type=int, default=60)
    parser.add_argument("--skip-websocket", action="store_true")
    return parser.parse_args(argv)
probe["gate"]["status"] == "PASS" else 1 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/scripts/run_polymarket_24h_soak.sh b/scripts/run_polymarket_24h_soak.sh new file mode 100755 index 0000000..db1da7a --- /dev/null +++ b/scripts/run_polymarket_24h_soak.sh @@ -0,0 +1,362 @@ +#!/usr/bin/env bash +set -uo pipefail + +APP_DIR="${ORDERBOOKS_APP_DIR:-$(pwd)}" +PYTHON_BIN="${ORDERBOOKS_PYTHON:-python3}" +RCLONE_BIN="${ORDERBOOKS_RCLONE_BIN:-/usr/bin/rclone}" +RCLONE_DEST_BASE="${ORDERBOOKS_RCLONE_DEST:-gdrive:orderbooks/polymarket/soak-test}" + +SOAK_DATE="${ORDERBOOKS_SOAK_DATE:-$(date -u +%F)}" +SOAK_ID="${ORDERBOOKS_SOAK_ID:-soak_test_${SOAK_DATE}}" +SOAK_SECONDS="${ORDERBOOKS_SOAK_SECONDS:-86400}" +CYCLE_SECONDS="${ORDERBOOKS_SOAK_CYCLE_SECONDS:-300}" +INTERVAL_SECONDS="${ORDERBOOKS_SOAK_INTERVAL_SECONDS:-30}" +MARKET_LIMIT="${ORDERBOOKS_SOAK_MARKET_LIMIT:-2}" +MARKET_END_SAFETY_SECONDS="${ORDERBOOKS_SOAK_MARKET_END_SAFETY_SECONDS:-420}" +REQUEST_TIMEOUT_SECONDS="${ORDERBOOKS_SOAK_REQUEST_TIMEOUT_SECONDS:-15}" +MAX_RETRIES="${ORDERBOOKS_SOAK_MAX_RETRIES:-2}" +BACKOFF_SECONDS="${ORDERBOOKS_SOAK_BACKOFF_SECONDS:-2}" +DISCOVERY_LIMIT="${ORDERBOOKS_SOAK_DISCOVERY_LIMIT:-100}" +DISCOVERY_MAX_PAGES="${ORDERBOOKS_SOAK_DISCOVERY_MAX_PAGES:-3}" +DISCOVERY_TIMEOUT="${ORDERBOOKS_SOAK_DISCOVERY_TIMEOUT:-15}" + +LOCAL_ROOT="${ORDERBOOKS_SOAK_LOCAL_ROOT:-data/soak_test/${SOAK_DATE}}" +MANIFEST_ROOT="${ORDERBOOKS_SOAK_MANIFEST_ROOT:-data/manifests/${SOAK_ID}}" +START_MANIFEST="${ORDERBOOKS_SOAK_START_MANIFEST:-data/manifests/${SOAK_ID}_start.json}" +FINAL_MANIFEST="${ORDERBOOKS_SOAK_FINAL_MANIFEST:-data/manifests/${SOAK_ID}_final.json}" + +DISCOVERY_DIR="${LOCAL_ROOT}/discovery" +LIVE_DIR="${LOCAL_ROOT}/live_sample" +LOG_DIR="${LOCAL_ROOT}/logs" +PID_FILE="${LOCAL_ROOT}/soak.pid" +CYCLES_JSONL="${MANIFEST_ROOT}/cycles.jsonl" +LOG_FILE="${LOG_DIR}/soak.log" +REMOTE_DEST="${RCLONE_DEST_BASE%/}/${SOAK_DATE}" + +STOP_REQUESTED=0 +STOP_SIGNAL="" 
+CURRENT_CHILD_PID="" +CURRENT_PHASE="initializing" +CURRENT_CYCLE_ID="" +START_WRITTEN=0 +FINAL_WRITTEN=0 + +cd "${APP_DIR}" || exit 2 +mkdir -p "${DISCOVERY_DIR}" "${LIVE_DIR}" "${LOG_DIR}" "${MANIFEST_ROOT}" "$(dirname "${START_MANIFEST}")" "$(dirname "${FINAL_MANIFEST}")" + +STARTED_AT="$(date -u +%Y-%m-%dT%H:%M:%SZ)" +START_EPOCH="$(date -u +%s)" +END_EPOCH="$((START_EPOCH + SOAK_SECONDS))" +EXPECTED_COMPLETION_AT="$(date -u -d "@${END_EPOCH}" +%Y-%m-%dT%H:%M:%SZ 2>/dev/null || python3 - <> "${LOG_FILE}" 2>/dev/null || true +} + +log() { + printf '%s %s\n' "$(date -u +%Y-%m-%dT%H:%M:%SZ)" "$*" | tee -a "${LOG_FILE}" +} + +handle_signal() { + local signal_name="$1" + STOP_REQUESTED=1 + STOP_SIGNAL="${signal_name}" + safe_log "SIGNAL received=${signal_name} phase=${CURRENT_PHASE} cycle_id=${CURRENT_CYCLE_ID:-none}" + if [[ -n "${CURRENT_CHILD_PID}" ]] && kill -0 "${CURRENT_CHILD_PID}" 2>/dev/null; then + case "${signal_name}" in + SIGINT) kill -INT "${CURRENT_CHILD_PID}" 2>/dev/null || true ;; + SIGTERM) kill -TERM "${CURRENT_CHILD_PID}" 2>/dev/null || true ;; + SIGHUP) kill -HUP "${CURRENT_CHILD_PID}" 2>/dev/null || true ;; + esac + fi +} + +write_start_manifest() { + local tmp_path="${START_MANIFEST}.tmp" + python3 - "$tmp_path" "$START_MANIFEST" <> "${CYCLES_JSONL}" +} + +write_final_manifest() { + local final_status="$1" + local gate_status="$2" + local exit_reason="$3" + local ended_at + local tmp_path + ended_at="$(date -u +%Y-%m-%dT%H:%M:%SZ)" + tmp_path="${FINAL_MANIFEST}.tmp" + python3 - "$tmp_path" "$FINAL_MANIFEST" </dev/null)" == "$$" ]]; then + rm -f "${PID_FILE}" + fi + exit "${rc}" +} + +run_logged() { + "$@" >> "${LOG_FILE}" 2>&1 & + CURRENT_CHILD_PID="$!" + wait "${CURRENT_CHILD_PID}" + local rc=$? + if [[ "${STOP_REQUESTED}" -eq 1 ]] && kill -0 "${CURRENT_CHILD_PID}" 2>/dev/null; then + wait "${CURRENT_CHILD_PID}" + rc=$? 
+ fi + CURRENT_CHILD_PID="" + return "${rc}" +} + +trap 'handle_signal SIGINT' INT +trap 'handle_signal SIGTERM' TERM +trap 'handle_signal SIGHUP' HUP +trap cleanup_on_exit EXIT + +echo "$$" > "${PID_FILE}" +write_start_manifest +test -s "${START_MANIFEST}" || exit 3 + +log "START soak_id=${SOAK_ID} pid=$$ expected_completion=${EXPECTED_COMPLETION_AT}" + +cycle_index=0 +error_seen=0 +while true; do + now_epoch="$(date -u +%s)" + remaining="$((END_EPOCH - now_epoch))" + if [[ "${remaining}" -le 0 ]]; then + break + fi + if [[ "${STOP_REQUESTED}" -eq 1 ]]; then + break + fi + if [[ "${remaining}" -lt 30 ]]; then + log "SKIP final tiny remaining window seconds=${remaining}" + break + fi + + cycle_index="$((cycle_index + 1))" + cycle_id="$(date -u +%Y%m%dT%H%M%SZ)" + CURRENT_CYCLE_ID="${cycle_id}" + run_seconds="${CYCLE_SECONDS}" + if [[ "${remaining}" -lt "${run_seconds}" ]]; then + run_seconds="${remaining}" + fi + + discovery_json="${DISCOVERY_DIR}/polymarket_btc_markets_${cycle_id}.json" + discovery_manifest="${DISCOVERY_DIR}/polymarket_btc_markets_manifest_${cycle_id}.json" + discovery_markdown="${DISCOVERY_DIR}/polymarket_btc_markets_${cycle_id}.md" + collector_manifest="${MANIFEST_ROOT}/collector_${cycle_id}.json" + upload_manifest="${MANIFEST_ROOT}/upload_${cycle_id}.json" + cycle_started_at="$(date -u +%Y-%m-%dT%H:%M:%SZ)" + + log "CYCLE ${cycle_index} start id=${cycle_id} run_seconds=${run_seconds}" + + discovery_exit=0 + CURRENT_PHASE="discovery" + run_logged "${PYTHON_BIN}" scripts/discover_polymarket_btc_markets.py \ + --output-json "${discovery_json}" \ + --manifest "${discovery_manifest}" \ + --markdown "${discovery_markdown}" \ + --limit "${DISCOVERY_LIMIT}" \ + --max-pages "${DISCOVERY_MAX_PAGES}" \ + --timeout "${DISCOVERY_TIMEOUT}" || discovery_exit=$? 
+ + collector_exit=0 + if [[ "${STOP_REQUESTED}" -eq 1 ]]; then + collector_exit=98 + elif [[ "${discovery_exit}" -eq 0 ]]; then + CURRENT_PHASE="collector" + run_logged "${PYTHON_BIN}" scripts/collect_polymarket_orderbooks.py \ + --config config/polymarket_collector.vps.example.yaml \ + --discovery-path "${discovery_json}" \ + --output-dir "${LIVE_DIR}" \ + --manifest-path "${collector_manifest}" \ + --market-limit "${MARKET_LIMIT}" \ + --interval-seconds "${INTERVAL_SECONDS}" \ + --duration-seconds "${run_seconds}" \ + --request-timeout-seconds "${REQUEST_TIMEOUT_SECONDS}" \ + --max-retries "${MAX_RETRIES}" \ + --backoff-seconds "${BACKOFF_SECONDS}" \ + --market-end-safety-seconds "${MARKET_END_SAFETY_SECONDS}" || collector_exit=$? + else + collector_exit=99 + fi + + upload_exit=0 + if [[ "${STOP_REQUESTED}" -eq 1 ]]; then + upload_exit=98 + elif [[ "${collector_exit}" -eq 0 ]]; then + CURRENT_PHASE="upload" + run_logged scripts/upload_archive_rclone.sh \ + --execute \ + --data-dir "${LOCAL_ROOT}" \ + --raw-dir "${LIVE_DIR}" \ + --source-manifest-dir "${MANIFEST_ROOT}" \ + --manifest-dir "${MANIFEST_ROOT}" \ + --manifest-path "${upload_manifest}" \ + --dest "${REMOTE_DEST}" \ + --min-age-seconds 0 \ + --rclone-bin "${RCLONE_BIN}" || upload_exit=$? 
+ else + upload_exit=99 + fi + + cycle_ended_at="$(date -u +%Y-%m-%dT%H:%M:%SZ)" + if [[ "${STOP_REQUESTED}" -eq 1 ]]; then + cycle_status="INTERRUPTED" + elif [[ "${discovery_exit}" -eq 0 && "${collector_exit}" -eq 0 && "${upload_exit}" -eq 0 ]]; then + cycle_status="OK" + else + cycle_status="ERROR" + error_seen=1 + fi + + record="$(python3 - </dev/null 2>&1; then + kill -TERM "${CHILD_PID}" >/dev/null 2>&1 || true + fi +} + +trap request_stop INT TERM + +mkdir -p "${MANIFEST_DIR}" +cd "${APP_DIR}" || exit 1 + +echo "collector loop started at $(utc_iso)" + +while [[ "${STOP_REQUESTED}" -eq 0 ]]; do + cycle_started="$(utc_iso)" + echo "collector cycle starting at ${cycle_started}" + + /bin/bash scripts/run_polymarket_collector_cycle.sh & + CHILD_PID="$!" + wait "${CHILD_PID}" + cycle_exit="$?" + CHILD_PID="" + + if [[ "${STOP_REQUESTED}" -ne 0 ]]; then + write_loop_event "INTERRUPTED" "${cycle_exit}" "collector loop received stop request during or after cycle" + break + fi + + if [[ "${cycle_exit}" -ne 0 ]]; then + write_loop_event "CYCLE_FAILED" "${cycle_exit}" "collector cycle exited nonzero; loop will continue after sleep" + echo "collector cycle failed with exit ${cycle_exit}; continuing after ${LOOP_SLEEP_SECONDS}s" >&2 + else + echo "collector cycle completed at $(utc_iso)" + fi + + for ((i = 0; i < LOOP_SLEEP_SECONDS; i++)); do + if [[ "${STOP_REQUESTED}" -ne 0 ]]; then + break + fi + sleep 1 + done +done + +echo "collector loop stopped at $(utc_iso)" diff --git a/scripts/upload_archive_rclone.sh b/scripts/upload_archive_rclone.sh new file mode 100755 index 0000000..c43d9a7 --- /dev/null +++ b/scripts/upload_archive_rclone.sh @@ -0,0 +1,462 @@ +#!/usr/bin/env bash +set -uo pipefail + +SCRIPT_NAME="orderbooks_rclone_uploader" +SCRIPT_VERSION="0.1.0" + +MODE="dry-run" +CLEANUP_AFTER_VERIFY=0 +DATA_DIR="${ORDERBOOKS_UPLOAD_DATA_DIR:-${ORDERBOOKS_DATA_DIR:-/var/lib/orderbooks}}" +RAW_DIR="${ORDERBOOKS_UPLOAD_RAW_DIR:-}" 
+SOURCE_MANIFEST_DIR="${ORDERBOOKS_UPLOAD_SOURCE_MANIFEST_DIR:-}" +MANIFEST_DIR="${ORDERBOOKS_UPLOAD_MANIFEST_DIR:-}" +MANIFEST_PATH="${ORDERBOOKS_UPLOAD_MANIFEST_PATH:-}" +DEST="${ORDERBOOKS_RCLONE_DEST:-}" +RCLONE_BIN="${ORDERBOOKS_RCLONE_BIN:-rclone}" +MIN_AGE_SECONDS="${ORDERBOOKS_UPLOAD_MIN_AGE_SECONDS:-600}" +RETENTION_DAYS="${ORDERBOOKS_UPLOAD_RETENTION_DAYS:-7}" +TRANSFERS="${ORDERBOOKS_RCLONE_TRANSFERS:-4}" +CHECKERS="${ORDERBOOKS_RCLONE_CHECKERS:-8}" + +usage() { + cat <<'EOF' +Usage: scripts/upload_archive_rclone.sh [options] + +Uploads closed raw collector archive files and manifests with rclone. +Default mode is dry-run. Real upload requires --execute and a destination. + +Options: + --dry-run Plan and run rclone copy with --dry-run (default). + --execute Run real rclone copy and rclone check. + --cleanup-after-verify Delete uploaded local files older than retention only after verification. + --data-dir DIR Base data directory. Default: /var/lib/orderbooks. + --raw-dir DIR Raw collector output directory. Default: DATA_DIR/raw_orderbooks. + --source-manifest-dir DIR Source collector manifest directory. Default: DATA_DIR/manifests. + --manifest-dir DIR Upload manifest output directory. Default: DATA_DIR/manifests. + --manifest-path PATH Exact upload manifest path. + --dest REMOTE:PATH rclone destination. Or set ORDERBOOKS_RCLONE_DEST. + --min-age-seconds N Skip files modified within N seconds. Default: 600. + --retention-days N Keep at least N days locally. Default: 7. + --rclone-bin PATH rclone binary path. Default: rclone. + --help Show this help. 
+EOF +} + +while [[ $# -gt 0 ]]; do + case "$1" in + --dry-run) + MODE="dry-run" + shift + ;; + --execute) + MODE="execute" + shift + ;; + --cleanup-after-verify) + CLEANUP_AFTER_VERIFY=1 + shift + ;; + --data-dir) + DATA_DIR="$2" + shift 2 + ;; + --raw-dir) + RAW_DIR="$2" + shift 2 + ;; + --source-manifest-dir) + SOURCE_MANIFEST_DIR="$2" + shift 2 + ;; + --manifest-dir) + MANIFEST_DIR="$2" + shift 2 + ;; + --manifest-path) + MANIFEST_PATH="$2" + shift 2 + ;; + --dest) + DEST="$2" + shift 2 + ;; + --min-age-seconds) + MIN_AGE_SECONDS="$2" + shift 2 + ;; + --retention-days) + RETENTION_DAYS="$2" + shift 2 + ;; + --rclone-bin) + RCLONE_BIN="$2" + shift 2 + ;; + --help) + usage + exit 0 + ;; + *) + echo "Unknown argument: $1" >&2 + usage >&2 + exit 2 + ;; + esac +done + +if [[ -z "${RAW_DIR}" ]]; then + RAW_DIR="${DATA_DIR%/}/raw_orderbooks" +fi +if [[ -z "${SOURCE_MANIFEST_DIR}" ]]; then + SOURCE_MANIFEST_DIR="${DATA_DIR%/}/manifests" +fi +if [[ -z "${MANIFEST_DIR}" ]]; then + MANIFEST_DIR="${DATA_DIR%/}/manifests" +fi + +STARTED_AT="$(date -u +%Y-%m-%dT%H:%M:%SZ)" +RUN_ID="$(date -u +%Y%m%dT%H%M%SZ)" +if [[ -z "${MANIFEST_PATH}" ]]; then + MANIFEST_PATH="${MANIFEST_DIR%/}/upload_archive_${RUN_ID}.json" +fi + +TMPDIR="$(mktemp -d)" +trap 'rm -rf "${TMPDIR}"' EXIT + +PLAN_PATH="${TMPDIR}/plan.json" +RCLONE_COPY_LOG="${TMPDIR}/rclone_copy.log" +RCLONE_CHECK_LOG="${TMPDIR}/rclone_check.log" +CLEANUP_PATH="${TMPDIR}/cleanup.json" +STAGING_DIR="${TMPDIR}/stage" + +mkdir -p "$(dirname "${MANIFEST_PATH}")" "${STAGING_DIR}" + +python3 - "$DATA_DIR" "$RAW_DIR" "$SOURCE_MANIFEST_DIR" "$MANIFEST_PATH" "$MIN_AGE_SECONDS" "$STAGING_DIR" "$PLAN_PATH" <<'PY' +import datetime as dt +import hashlib +import json +import os +import shutil +import sys +from pathlib import Path + +data_dir = Path(sys.argv[1]) +raw_dir = Path(sys.argv[2]) +source_manifest_dir = Path(sys.argv[3]) +manifest_path = Path(sys.argv[4]).resolve() +min_age_seconds = int(sys.argv[5]) +staging_dir = 
Path(sys.argv[6]) +plan_path = Path(sys.argv[7]) +now = dt.datetime.now(dt.UTC) + +def iso_z_from_ts(ts: float) -> str: + return dt.datetime.fromtimestamp(ts, dt.UTC).replace(microsecond=0).isoformat().replace("+00:00", "Z") + +def sha256_file(path: Path) -> str: + digest = hashlib.sha256() + with path.open("rb") as handle: + for chunk in iter(lambda: handle.read(1024 * 1024), b""): + digest.update(chunk) + return digest.hexdigest() + +def rel_for(path: Path) -> str: + resolved = path.resolve() + try: + return resolved.relative_to(data_dir.resolve()).as_posix() + except ValueError: + return resolved.name + +def iter_files(root: Path): + if not root.exists(): + return + for path in sorted(root.rglob("*")): + if path.is_file(): + yield path + +selected = [] +skipped = [] +warnings = [] +seen = set() + +for root, kind in [(raw_dir, "raw"), (source_manifest_dir, "manifest")]: + if not root.exists(): + warnings.append(f"{kind} source directory does not exist: {root}") + continue + for path in iter_files(root): + resolved = path.resolve() + if resolved in seen: + continue + seen.add(resolved) + rel = rel_for(path) + stat = path.stat() + age_seconds = max(0, int(now.timestamp() - stat.st_mtime)) + base = { + "local_path": str(path), + "relative_path": rel, + "kind": kind, + "bytes": stat.st_size, + "mtime_utc": iso_z_from_ts(stat.st_mtime), + "age_seconds": age_seconds, + } + if resolved == manifest_path: + skipped.append({**base, "reason": "current_upload_manifest"}) + continue + if age_seconds < min_age_seconds: + skipped.append({**base, "reason": "modified_within_min_age_seconds"}) + continue + checksum = sha256_file(path) + staged_path = staging_dir / rel + staged_path.parent.mkdir(parents=True, exist_ok=True) + shutil.copy2(path, staged_path) + selected.append({**base, "sha256": checksum, "staged_path": str(staged_path)}) + +plan = { + "selected_files": selected, + "skipped_files": skipped, + "warnings": warnings, +} +plan_path.write_text(json.dumps(plan, indent=2, 
sort_keys=True) + "\n", encoding="utf-8") +PY + +RCLONE_AVAILABLE=0 +RCLONE_VERSION="" +if command -v "${RCLONE_BIN}" >/dev/null 2>&1; then + RCLONE_AVAILABLE=1 + RCLONE_VERSION="$("${RCLONE_BIN}" version 2>/dev/null | head -n 1 || true)" +fi + +DEST_CONFIGURED=0 +if [[ -n "${DEST}" ]]; then + DEST_CONFIGURED=1 +fi + +COPY_EXIT_CODE="" +CHECK_EXIT_CODE="" +COPY_ATTEMPTED=0 +CHECK_ATTEMPTED=0 +OPERATION_STATUS="PLANNED" +GATE_STATUS="BLOCKED_REAL_UPLOAD" + +if [[ "${DEST_CONFIGURED}" -eq 0 ]]; then + OPERATION_STATUS="BLOCKED_DEST_MISSING" +elif [[ "${RCLONE_AVAILABLE}" -eq 0 ]]; then + OPERATION_STATUS="BLOCKED_RCLONE_UNAVAILABLE" +else + COPY_ATTEMPTED=1 + copy_args=(copy "${STAGING_DIR}/" "${DEST%/}/" --checksum --transfers "${TRANSFERS}" --checkers "${CHECKERS}") + if [[ "${MODE}" == "dry-run" ]]; then + copy_args+=(--dry-run) + fi + "${RCLONE_BIN}" "${copy_args[@]}" >"${RCLONE_COPY_LOG}" 2>&1 + COPY_EXIT_CODE=$? + if [[ "${COPY_EXIT_CODE}" -eq 0 && "${MODE}" == "dry-run" ]]; then + OPERATION_STATUS="DRY_RUN_PASS" + elif [[ "${COPY_EXIT_CODE}" -eq 0 ]]; then + CHECK_ATTEMPTED=1 + "${RCLONE_BIN}" check "${STAGING_DIR}/" "${DEST%/}/" --one-way --checksum >"${RCLONE_CHECK_LOG}" 2>&1 + CHECK_EXIT_CODE=$? 
+ if [[ "${CHECK_EXIT_CODE}" -eq 0 ]]; then + OPERATION_STATUS="UPLOAD_VERIFIED" + GATE_STATUS="PASS" + else + OPERATION_STATUS="VERIFY_FAILED" + GATE_STATUS="FAIL" + fi + else + OPERATION_STATUS="COPY_FAILED" + GATE_STATUS="FAIL" + fi +fi + +python3 - "$PLAN_PATH" "$CLEANUP_PATH" "$MODE" "$CLEANUP_AFTER_VERIFY" "$RETENTION_DAYS" "$OPERATION_STATUS" "$GATE_STATUS" <<'PY' +import datetime as dt +import json +import sys +from pathlib import Path + +plan_path = Path(sys.argv[1]) +cleanup_path = Path(sys.argv[2]) +mode = sys.argv[3] +cleanup_after_verify = sys.argv[4] == "1" +retention_days = int(sys.argv[5]) +operation_status = sys.argv[6] +gate_status = sys.argv[7] +plan = json.loads(plan_path.read_text()) +now = dt.datetime.now(dt.UTC) +cutoff = now - dt.timedelta(days=retention_days) +retained = [] +deleted = [] + +if mode == "execute" and cleanup_after_verify and operation_status == "UPLOAD_VERIFIED": + for item in plan["selected_files"]: + path = Path(item["local_path"]) + mtime = dt.datetime.fromtimestamp(path.stat().st_mtime, dt.UTC) if path.exists() else now + if mtime < cutoff and path.exists(): + path.unlink() + deleted.append({**item, "deleted_at_utc": now.replace(microsecond=0).isoformat().replace("+00:00", "Z")}) + else: + retained.append({**item, "reason": "within_retention_window" if mtime >= cutoff else "missing_before_cleanup"}) +else: + reason = "cleanup_not_requested" + if mode != "execute": + reason = "dry_run" + elif operation_status != "UPLOAD_VERIFIED": + reason = "not_verified" + for item in plan["selected_files"]: + retained.append({**item, "reason": reason}) + +cleanup_path.write_text( + json.dumps({"retained_local_files": retained, "deleted_local_files": deleted}, indent=2, sort_keys=True) + "\n", + encoding="utf-8", +) +PY + +ENDED_AT="$(date -u +%Y-%m-%dT%H:%M:%SZ)" + +export SCRIPT_NAME SCRIPT_VERSION STARTED_AT ENDED_AT +export MODE OPERATION_STATUS GATE_STATUS +export RCLONE_BIN RCLONE_AVAILABLE RCLONE_VERSION DEST +export 
COPY_ATTEMPTED CHECK_ATTEMPTED COPY_EXIT_CODE CHECK_EXIT_CODE +export DATA_DIR RAW_DIR SOURCE_MANIFEST_DIR MIN_AGE_SECONDS RETENTION_DAYS CLEANUP_AFTER_VERIFY + +python3 - "$PLAN_PATH" "$CLEANUP_PATH" "$MANIFEST_PATH" <<'PY' +import json +import os +import sys +from pathlib import Path + +plan = json.loads(Path(sys.argv[1]).read_text()) +cleanup = json.loads(Path(sys.argv[2]).read_text()) +manifest_path = Path(sys.argv[3]) + +mode = os.environ["MODE"] +operation_status = os.environ["OPERATION_STATUS"] +gate_status = os.environ["GATE_STATUS"] +copy_attempted = os.environ["COPY_ATTEMPTED"] == "1" +check_attempted = os.environ["CHECK_ATTEMPTED"] == "1" +copy_exit_code = os.environ["COPY_EXIT_CODE"] +check_exit_code = os.environ["CHECK_EXIT_CODE"] +dest = os.environ["DEST"] + +def public_item(item): + public = dict(item) + public.pop("staged_path", None) + return public + +selected = [public_item(item) for item in plan["selected_files"]] +skipped = [public_item(item) for item in plan["skipped_files"]] +retained_local = [public_item(item) for item in cleanup["retained_local_files"]] +deleted_local = [public_item(item) for item in cleanup["deleted_local_files"]] +attempted_files = selected if copy_attempted else [] +uploaded_files = selected if mode == "execute" and operation_status in {"UPLOAD_VERIFIED", "VERIFY_FAILED"} else [] +verified_files = selected if mode == "execute" and operation_status == "UPLOAD_VERIFIED" else [] +dry_run_files = selected if mode == "dry-run" and operation_status == "DRY_RUN_PASS" else [] + +manifest = { + "schema_name": "upload_archive_manifest", + "schema_version": 1, + "checkpoint_id": 7, + "checkpoint_name": "Google Drive Offload", + "uploader": { + "name": os.environ["SCRIPT_NAME"], + "version": os.environ["SCRIPT_VERSION"], + }, + "started_at_utc": os.environ["STARTED_AT"], + "ended_at_utc": os.environ["ENDED_AT"], + "command_mode": mode, + "operation_status": operation_status, + "gate_status": gate_status, + "rclone": { + "binary": 
os.environ["RCLONE_BIN"], + "available": os.environ["RCLONE_AVAILABLE"] == "1", + "version": os.environ["RCLONE_VERSION"], + "destination_configured": bool(dest), + "destination": dest if dest else None, + "copy_attempted": copy_attempted, + "copy_exit_code": int(copy_exit_code) if copy_exit_code else None, + "check_attempted": check_attempted, + "check_exit_code": int(check_exit_code) if check_exit_code else None, + }, + "config": { + "data_dir": os.environ["DATA_DIR"], + "raw_dir": os.environ["RAW_DIR"], + "source_manifest_dir": os.environ["SOURCE_MANIFEST_DIR"], + "manifest_path": str(manifest_path), + "min_age_seconds": int(os.environ["MIN_AGE_SECONDS"]), + "retention_days": int(os.environ["RETENTION_DAYS"]), + "cleanup_after_verify": os.environ["CLEANUP_AFTER_VERIFY"] == "1", + }, + "planned_files": selected, + "attempted_files": attempted_files, + "dry_run_files": dry_run_files, + "uploaded_files": uploaded_files, + "verified_files": verified_files, + "skipped_open_or_recent_files": [ + item for item in skipped if item.get("reason") == "modified_within_min_age_seconds" + ], + "skipped_files": skipped, + "retained_local_files": retained_local, + "deleted_local_files": deleted_local, + "counts": { + "planned": len(selected), + "attempted": len(attempted_files), + "dry_run": len(dry_run_files), + "uploaded": len(uploaded_files), + "verified": len(verified_files), + "skipped": len(skipped), + "retained_local": len(retained_local), + "deleted_local": len(deleted_local), + }, + "warnings": plan["warnings"], + "known_gaps": [ + "A dry-run does not prove remote write access.", + "Real upload requires a configured rclone remote outside the repository.", + "Local files are retained unless --cleanup-after-verify is used after successful verification.", + ], +} + +if operation_status == "BLOCKED_RCLONE_UNAVAILABLE": + manifest["warnings"].append("rclone binary was not available; copy and verification were not attempted.") +if operation_status == "BLOCKED_DEST_MISSING": + 
manifest["warnings"].append("No rclone destination was configured; set --dest or ORDERBOOKS_RCLONE_DEST.") +if mode == "dry-run": + manifest["warnings"].append("Dry-run mode does not perform a real upload; checkpoint real-upload gate remains blocked.") + +manifest_path.parent.mkdir(parents=True, exist_ok=True) +manifest_path.write_text(json.dumps(manifest, indent=2, sort_keys=True) + "\n", encoding="utf-8") + +print( + json.dumps( + { + "gate_status": gate_status, + "operation_status": operation_status, + "manifest_path": str(manifest_path), + "planned_files": len(selected), + "attempted_files": len(attempted_files), + "uploaded_files": len(uploaded_files), + "verified_files": len(verified_files), + "skipped_files": len(plan["skipped_files"]), + }, + indent=2, + sort_keys=True, + ) +) +PY + +case "${OPERATION_STATUS}" in + UPLOAD_VERIFIED|DRY_RUN_PASS) + exit 0 + ;; + BLOCKED_DEST_MISSING) + echo "No rclone destination configured. Set --dest or ORDERBOOKS_RCLONE_DEST." >&2 + exit 2 + ;; + BLOCKED_RCLONE_UNAVAILABLE) + echo "rclone is not available. Install rclone before running dry-run or execute mode." >&2 + exit 3 + ;; + *) + echo "Upload operation failed with status: ${OPERATION_STATUS}" >&2 + exit 1 + ;; +esac diff --git a/scripts/vps_preflight_check.sh b/scripts/vps_preflight_check.sh new file mode 100755 index 0000000..741b102 --- /dev/null +++ b/scripts/vps_preflight_check.sh @@ -0,0 +1,285 @@ +#!/usr/bin/env bash +set -uo pipefail + +APP_DIR="$(pwd)" +PYTHON_BIN="${ORDERBOOKS_PYTHON:-python3}" +RCLONE_BIN="${ORDERBOOKS_RCLONE_BIN:-rclone}" +RCLONE_REMOTE="${ORDERBOOKS_RCLONE_DEST:-}" +DATA_DIR="" +MANIFEST_DIR="" +LOG_DIR="" +MIN_FREE_GIB="${ORDERBOOKS_PREFLIGHT_MIN_FREE_GIB:-5}" +REMOTE_TIMEOUT_SECONDS="${ORDERBOOKS_PREFLIGHT_REMOTE_TIMEOUT_SECONDS:-30}" + +FAILURES=0 +WARNINGS=0 + +usage() { + cat <<'EOF' +Usage: scripts/vps_preflight_check.sh [options] + +Read-only VPS cutover preflight for the Polymarket order-book collector. 
+ +Default behavior checks the repository, local tooling, unit syntax, disk space, +and rclone availability. It does not print rclone config and does not require +secrets. + +Options: + --app-dir DIR Repository checkout path. Default: current directory. + --python-bin PATH Python interpreter. Default: ORDERBOOKS_PYTHON or python3. + --rclone-bin PATH rclone binary. Default: ORDERBOOKS_RCLONE_BIN or rclone. + --rclone-remote REMOTE Optional remote/path to check read-only, e.g. gdrive:orderbooks/polymarket. + --data-dir DIR Optional target data directory to create/check writable. + --manifest-dir DIR Optional target manifest directory to create/check writable. + --log-dir DIR Optional target log directory to create/check writable. + --min-free-gib N Minimum free GiB for checked filesystems. Default: 5. + --remote-timeout-seconds N Timeout for rclone remote read check. Default: 30. + --help Show this help. + +Directory options intentionally create missing directories before checking +writability. Omit them for a repo-only read-only check. +EOF +} + +log_pass() { printf 'PASS %s\n' "$*"; } +log_info() { printf 'INFO %s\n' "$*"; } +log_warn() { WARNINGS=$((WARNINGS + 1)); printf 'WARN %s\n' "$*"; } +log_fail() { FAILURES=$((FAILURES + 1)); printf 'FAIL %s\n' "$*"; } +run_quiet() { "$@" >/dev/null 2>&1; } + +while [[ $# -gt 0 ]]; do + case "$1" in + --app-dir) APP_DIR="$2"; shift 2 ;; + --python-bin) PYTHON_BIN="$2"; shift 2 ;; + --rclone-bin) RCLONE_BIN="$2"; shift 2 ;; + --rclone-remote) RCLONE_REMOTE="$2"; shift 2 ;; + --data-dir) DATA_DIR="$2"; shift 2 ;; + --manifest-dir) MANIFEST_DIR="$2"; shift 2 ;; + --log-dir) LOG_DIR="$2"; shift 2 ;; + --min-free-gib) MIN_FREE_GIB="$2"; shift 2 ;; + --remote-timeout-seconds) REMOTE_TIMEOUT_SECONDS="$2"; shift 2 ;; + --help) usage; exit 0 ;; + *) log_fail "unknown argument: $1"; usage >&2; exit 2 ;; + esac +done + +APP_DIR="${APP_DIR%/}" +if [[ ! 
-d "${APP_DIR}" ]]; then + log_fail "app directory does not exist: ${APP_DIR}" + printf 'SUMMARY failures=%s warnings=%s\n' "${FAILURES}" "${WARNINGS}" + exit 1 +fi + +cd "${APP_DIR}" || { + log_fail "could not cd to app directory: ${APP_DIR}" + printf 'SUMMARY failures=%s warnings=%s\n' "${FAILURES}" "${WARNINGS}" + exit 1 +} + +check_python() { + if command -v "${PYTHON_BIN}" >/dev/null 2>&1; then + version="$("${PYTHON_BIN}" --version 2>&1 || true)" + log_pass "python available: ${PYTHON_BIN} (${version})" + else + log_fail "python not found: ${PYTHON_BIN}" + fi +} + +check_required_files() { + local missing=0 file + local required=( + "scripts/discover_polymarket_btc_markets.py" + "scripts/collect_polymarket_orderbooks.py" + "scripts/normalize_polymarket_orderbooks.py" + "scripts/run_polymarket_collector_cycle.sh" + "scripts/upload_archive_rclone.sh" + "scripts/vps_runtime_smoke_check.sh" + "config/polymarket_collector.vps.example.yaml" + "docs/VPS_DEPLOYMENT.md" + "docs/GOOGLE_DRIVE_OFFLOAD.md" + "systemd/polymarket-orderbook-collector.service" + "systemd/polymarket-orderbook-uploader.service" + "systemd/polymarket-orderbook-uploader.timer" + ) + for file in "${required[@]}"; do + if [[ -f "${file}" ]]; then + log_pass "required file exists: ${file}" + else + missing=1 + log_fail "required file missing: ${file}" + fi + done + return "${missing}" +} + +check_python_compile() { + if ! 
command -v "${PYTHON_BIN}" >/dev/null 2>&1; then + log_fail "cannot compile Python scripts because Python is missing" + return + fi + if run_quiet "${PYTHON_BIN}" - <<'PY' +from pathlib import Path + +paths = [ + Path("scripts/discover_polymarket_btc_markets.py"), + Path("scripts/collect_polymarket_orderbooks.py"), + Path("scripts/normalize_polymarket_orderbooks.py"), +] +for path in paths: + source = path.read_text(encoding="utf-8") + compile(source, str(path), "exec") +PY + then + log_pass "collector/discovery/normalization Python scripts compile without bytecode writes" + else + log_fail "Python no-bytecode compile check failed" + fi +} + +check_shell_syntax() { + local failed=0 script + for script in scripts/*.sh; do + [[ -f "${script}" ]] || continue + if bash -n "${script}" >/dev/null 2>&1; then + log_pass "bash syntax ok: ${script}" + else + failed=1 + log_fail "bash syntax failed: ${script}" + fi + done + return "${failed}" +} + +check_systemd_units() { + local units=( + "systemd/polymarket-orderbook-collector.service" + "systemd/polymarket-orderbook-uploader.service" + "systemd/polymarket-orderbook-uploader.timer" + ) + if command -v systemd-analyze >/dev/null 2>&1; then + if systemd-analyze verify "${units[@]}" >/dev/null 2>&1; then + log_pass "systemd units parse with systemd-analyze" + else + log_fail "systemd-analyze verify failed for one or more units" + fi + else + log_warn "systemd-analyze unavailable; skipped unit parse check" + fi +} + +remote_name_from_dest() { + local dest="$1" + case "${dest}" in + *:*) printf '%s:\n' "${dest%%:*}" ;; + *) printf '\n' ;; + esac +} + +run_with_timeout() { + if command -v timeout >/dev/null 2>&1; then + timeout "${REMOTE_TIMEOUT_SECONDS}" "$@" + else + "$@" + fi +} + +check_rclone() { + if [[ -x "${RCLONE_BIN}" ]] || command -v "${RCLONE_BIN}" >/dev/null 2>&1; then + version="$("${RCLONE_BIN}" version 2>/dev/null | head -n 1 || true)" + log_pass "rclone available: ${RCLONE_BIN} (${version})" + else + log_fail 
"rclone not found: ${RCLONE_BIN}" + return + fi + + if [[ -z "${RCLONE_REMOTE}" ]]; then + log_info "no rclone remote provided; skipped remote access check" + return + fi + + local remote_name + remote_name="$(remote_name_from_dest "${RCLONE_REMOTE}")" + if [[ -z "${remote_name}" ]]; then + log_fail "rclone remote must include a remote name ending in ':': ${RCLONE_REMOTE}" + return + fi + + if "${RCLONE_BIN}" listremotes 2>/dev/null | grep -Fxq "${remote_name}"; then + log_pass "rclone remote is configured: ${remote_name}" + else + log_fail "rclone remote is not configured or not visible to this user: ${remote_name}" + return + fi + + if run_with_timeout "${RCLONE_BIN}" lsf --max-depth 1 "${RCLONE_REMOTE}" >/dev/null 2>&1; then + log_pass "rclone remote read check succeeded without printing config: ${RCLONE_REMOTE}" + else + log_fail "rclone remote read check failed or timed out: ${RCLONE_REMOTE}" + fi +} + +check_target_dir() { + local label="$1" path="$2" + if [[ -z "${path}" ]]; then + log_info "no ${label} directory provided; skipped create/write check" + return + fi + if mkdir -p "${path}" >/dev/null 2>&1 && [[ -d "${path}" && -w "${path}" ]]; then + log_pass "${label} directory exists and is writable: ${path}" + else + log_fail "${label} directory cannot be created or is not writable: ${path}" + fi +} + +check_disk_free() { + local target="$1" label="$2" available_kib min_kib + if [[ ! 
-e "${target}" ]]; then + log_warn "disk target does not exist, skipping ${label}: ${target}" + return + fi + available_kib="$(df -Pk "${target}" | awk 'NR==2 {print $4}')" + min_kib=$((MIN_FREE_GIB * 1024 * 1024)) + if [[ -n "${available_kib}" && "${available_kib}" -ge "${min_kib}" ]]; then + log_pass "disk free ok for ${label}: available_kib=${available_kib} min_gib=${MIN_FREE_GIB}" + else + log_fail "disk free below threshold for ${label}: available_kib=${available_kib:-unknown} min_gib=${MIN_FREE_GIB}" + fi +} + +check_secret_requirements() { + local files=( + "config/polymarket_collector.vps.example.yaml" + "systemd/polymarket-orderbook-collector.service" + "systemd/polymarket-orderbook-uploader.service" + "systemd/polymarket-orderbook-uploader.timer" + "scripts/run_polymarket_collector_cycle.sh" + "scripts/upload_archive_rclone.sh" + ) + if grep -E -i '(api[_-]?key|private[_-]?key|mnemonic|wallet|password|client[_-]?secret|access[_-]?token|refresh[_-]?token)' "${files[@]}" >/dev/null 2>&1; then + log_fail "secret-like credential requirement found in runtime config, units, or scripts" + else + log_pass "no API keys, private keys, mnemonics, wallets, or passwords are required by runtime files" + fi + log_info "rclone credentials, if used, must remain machine-local outside the repository" +} + +check_python +check_required_files +check_python_compile +check_shell_syntax +check_systemd_units +check_rclone +check_target_dir "data" "${DATA_DIR}" +check_target_dir "manifest" "${MANIFEST_DIR}" +check_target_dir "log" "${LOG_DIR}" +check_disk_free "." 
#!/usr/bin/env bash
# VPS runtime smoke check for the orderbooks collector and uploader units.
#
# Resolves paths and service names from ORDERBOOKS_* environment variables and
# command-line options, then passes them to an embedded Python program that
# drives systemctl/journalctl and writes a JSON evidence file.
#
# Exit status: 2 for usage errors; otherwise the status of the Python program,
# which is 0 only when the recorded gate status is PASS.
#
# Note: errexit (-e) is not enabled; only nounset and pipefail are.
set -uo pipefail

APP_DIR="${ORDERBOOKS_APP_DIR:-/opt/orderbooks}"
DATA_DIR="${ORDERBOOKS_DATA_DIR:-/var/lib/orderbooks}"
# RAW_DIR, MANIFEST_DIR and EVIDENCE_PATH stay empty unless explicitly set by
# the environment or an option. Their defaults depend on DATA_DIR/MANIFEST_DIR
# and are therefore resolved only after option parsing, so that --data-dir and
# --manifest-dir take effect regardless of option order.
RAW_DIR="${ORDERBOOKS_OUTPUT_DIR:-}"
MANIFEST_DIR="${ORDERBOOKS_MANIFEST_DIR:-}"
COLLECTOR_SERVICE="${ORDERBOOKS_COLLECTOR_SERVICE:-polymarket-orderbook-collector.service}"
UPLOADER_SERVICE="${ORDERBOOKS_UPLOADER_SERVICE:-polymarket-orderbook-uploader.service}"
WAIT_SECONDS="${ORDERBOOKS_SMOKE_WAIT_SECONDS:-900}"
RUN_ID="$(date -u +%Y%m%dT%H%M%SZ)"
EVIDENCE_PATH="${ORDERBOOKS_SMOKE_EVIDENCE_PATH:-}"
PYTHON_BIN="${ORDERBOOKS_PYTHON:-python3}"

# Print the command-line help text to stdout.
usage() {
  cat <<'EOF'
Usage: scripts/vps_runtime_smoke_check.sh [options]

Run on the VPS after installing collector/uploader systemd units. The check
records durable JSON evidence, forces one collector service restart, verifies
old raw gzip files still parse and keep their checksum, waits for a later valid
collector cycle, then starts the uploader service and records upload evidence.

Options:
  --app-dir DIR             App checkout. Default: /opt/orderbooks.
  --data-dir DIR            Data root. Default: /var/lib/orderbooks.
  --raw-dir DIR             Raw output dir. Default: DATA_DIR/raw_orderbooks.
  --manifest-dir DIR        Manifest dir. Default: DATA_DIR/manifests.
  --collector-service NAME  systemd collector service name.
  --uploader-service NAME   systemd uploader service name.
  --wait-seconds N          Max wait for valid cycles. Default: 900.
  --evidence-path PATH      JSON evidence output path.
                            Default: MANIFEST_DIR/vps_runtime_smoke_<UTC>.json.
  --help                    Show this help.

This script does not delete raw files or manifests. Failures are written to the
evidence JSON and should be preserved for review.
EOF
}

# Abort with a usage error when the option in $1 has no value after it.
# Call as: require_value "$@"
require_value() {
  if [[ $# -lt 2 ]]; then
    echo "Missing value for $1" >&2
    usage >&2
    exit 2
  fi
}

while [[ $# -gt 0 ]]; do
  case "$1" in
    --app-dir) require_value "$@"; APP_DIR="$2"; shift 2 ;;
    --data-dir) require_value "$@"; DATA_DIR="$2"; shift 2 ;;
    --raw-dir) require_value "$@"; RAW_DIR="$2"; shift 2 ;;
    --manifest-dir) require_value "$@"; MANIFEST_DIR="$2"; shift 2 ;;
    --collector-service) require_value "$@"; COLLECTOR_SERVICE="$2"; shift 2 ;;
    --uploader-service) require_value "$@"; UPLOADER_SERVICE="$2"; shift 2 ;;
    --wait-seconds) require_value "$@"; WAIT_SECONDS="$2"; shift 2 ;;
    --evidence-path) require_value "$@"; EVIDENCE_PATH="$2"; shift 2 ;;
    --help) usage; exit 0 ;;
    *) echo "Unknown argument: $1" >&2; usage >&2; exit 2 ;;
  esac
done

# Resolve derived defaults now that DATA_DIR and MANIFEST_DIR are final.
# Precedence for each value: command-line option, then environment variable,
# then the path derived from DATA_DIR / MANIFEST_DIR.
RAW_DIR="${RAW_DIR:-${DATA_DIR}/raw_orderbooks}"
MANIFEST_DIR="${MANIFEST_DIR:-${DATA_DIR}/manifests}"
EVIDENCE_PATH="${EVIDENCE_PATH:-${MANIFEST_DIR}/vps_runtime_smoke_${RUN_ID}.json}"

# Reject a bad wait value here: the Python program converts it with int()
# before its try/finally, so a bad value would crash without writing evidence.
if ! [[ "${WAIT_SECONDS}" =~ ^[0-9]+$ ]]; then
  echo "--wait-seconds must be a non-negative integer: ${WAIT_SECONDS}" >&2
  exit 2
fi

mkdir -p "$(dirname "${EVIDENCE_PATH}")"

# The Python program is read from stdin ("-"); the quoted heredoc delimiter
# prevents any shell expansion inside it. It must stay the last command so its
# exit status becomes the script's exit status.
PYTHONDONTWRITEBYTECODE=1 "${PYTHON_BIN}" - "$APP_DIR" "$DATA_DIR" "$RAW_DIR" "$MANIFEST_DIR" "$COLLECTOR_SERVICE" "$UPLOADER_SERVICE" "$WAIT_SECONDS" "$EVIDENCE_PATH" <<'PY_SMOKE'
import datetime as dt
import gzip
import hashlib
import json
import subprocess
import sys
import time
from pathlib import Path

# datetime.UTC only exists on Python 3.11+. It is an alias of
# datetime.timezone.utc, which is also available on older interpreters.
UTC = dt.timezone.utc

app_dir = Path(sys.argv[1])
data_dir = Path(sys.argv[2])
raw_dir = Path(sys.argv[3])
manifest_dir = Path(sys.argv[4])
collector_service = sys.argv[5]
uploader_service = sys.argv[6]
wait_seconds = int(sys.argv[7])
evidence_path = Path(sys.argv[8])
started = dt.datetime.now(UTC).replace(microsecond=0)
checks = []    # one record per external command, appended by run()
failures = []  # human-readable failure reasons, written to the evidence file


def iso_now():
    """Return the current UTC time, second precision, as ISO-8601 with a 'Z' suffix."""
    return dt.datetime.now(UTC).replace(microsecond=0).isoformat().replace('+00:00', 'Z')


def run(command):
    """Run `command` (an argv list), record the outcome in `checks`, and return the record.

    The record holds the command, its exit code, the last 4000 characters of
    stdout and stderr, and the UTC time at which the result was recorded.
    A non-zero exit code is not raised; callers inspect 'exit_code'.
    """
    proc = subprocess.run(command, text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    item = {
        'command': command,
        'exit_code': proc.returncode,
        'stdout_tail': proc.stdout[-4000:],
        'stderr_tail': proc.stderr[-4000:],
        'ran_at_utc': iso_now(),
    }
    checks.append(item)
    return item


def sha256(path):
    """Return the hex SHA-256 digest of the file at `path`, read in 1 MiB chunks."""
    digest = hashlib.sha256()
    with path.open('rb') as handle:
        for chunk in iter(lambda: handle.read(1024 * 1024), b''):
            digest.update(chunk)
    return digest.hexdigest()


def parse_raw(path):
    """Parse a gzip-compressed JSON-lines file.

    Blank lines are skipped. Returns (row_count, sorted keys of the first row).
    Raises whatever gzip/json raise on a corrupt file or an invalid line.
    """
    rows = 0
    first_keys = []
    with gzip.open(path, 'rt', encoding='utf-8') as handle:
        for line in handle:
            if not line.strip():
                continue
            obj = json.loads(line)
            if rows == 0:
                first_keys = sorted(obj.keys())
            rows += 1
    return rows, first_keys


def collector_manifests():
    """Return collector manifest paths in manifest_dir, oldest first by mtime."""
    if not manifest_dir.exists():
        return []
    return sorted(manifest_dir.glob('polymarket_orderbook_collector_*.json'), key=lambda path: path.stat().st_mtime)


def validate_collector(path):
    """Load one collector manifest and re-verify every output file it lists.

    Each listed raw file is re-parsed and re-hashed and compared against the
    row count and sha256 recorded in the manifest. The result is 'valid' only
    when the manifest reports gate_status PASS, rows_written > 0, no failures,
    at least one output file, and every output file parses to > 0 rows, matches
    the manifest's rows and sha256, lives under raw_dir, and has no
    'live_sample' path component.
    """
    manifest = json.loads(path.read_text(encoding='utf-8'))
    output_files = []
    for item in manifest.get('output_files', []):
        raw_path = Path(item['path'])
        rows, first_keys = parse_raw(raw_path)
        actual_sha = sha256(raw_path)
        output_files.append({
            'path': str(raw_path),
            'bytes': raw_path.stat().st_size,
            'manifest_rows': item.get('rows'),
            'rows_parsed': rows,
            'row_count_matches_manifest': rows == item.get('rows'),
            'manifest_sha256': item.get('sha256'),
            'actual_sha256': actual_sha,
            'sha256_matches_manifest': actual_sha == item.get('sha256'),
            'first_row_keys': first_keys,
            'under_raw_dir': raw_path.resolve().is_relative_to(raw_dir.resolve()),
            'uses_live_sample_path': 'live_sample' in raw_path.parts,
        })
    valid = (
        manifest.get('gate_status') == 'PASS'
        and manifest.get('rows_written', 0) > 0
        and manifest.get('failure_count') == 0
        and not manifest.get('failures')
        and bool(output_files)
        and all(item['rows_parsed'] > 0 and item['row_count_matches_manifest'] and item['sha256_matches_manifest'] and item['under_raw_dir'] and not item['uses_live_sample_path'] for item in output_files)
    )
    return {
        'path': str(path),
        'manifest': manifest,
        'output_files': output_files,
        'valid': valid,
    }


def latest_valid_after(after_mtime=0):
    """Poll for the newest valid collector manifest with mtime > after_mtime.

    Manifests are checked newest first. The directory is re-scanned every 10
    seconds until wait_seconds have elapsed. Raises TimeoutError carrying the
    last validation error, if any, when nothing valid appears in time.
    """
    deadline = time.time() + wait_seconds
    last_error = None
    while time.time() <= deadline:
        for path in reversed(collector_manifests()):
            if path.stat().st_mtime <= after_mtime:
                continue
            try:
                result = validate_collector(path)
            except Exception as exc:
                # An unreadable manifest or raw file is not fatal here; keep
                # looking at older candidates and retry on the next poll.
                last_error = str(exc)
                continue
            if result['valid']:
                return result
            last_error = f"latest candidate invalid: {path}"
        time.sleep(10)
    raise TimeoutError(last_error or f'no valid collector manifest after mtime {after_mtime}')


def latest_upload_after(after_mtime=0):
    """Return the newest upload manifest with mtime >= after_mtime, plus a validity verdict.

    'valid' requires operation_status UPLOAD_VERIFIED, gate_status PASS, rclone
    copy and check exit codes of 0, and at least one verified file.
    Raises FileNotFoundError when no upload manifest is new enough.
    """
    candidates = sorted(manifest_dir.glob('upload_archive_*.json'), key=lambda path: path.stat().st_mtime)
    candidates = [path for path in candidates if path.stat().st_mtime >= after_mtime]
    if not candidates:
        raise FileNotFoundError('no upload_archive_*.json manifest found after uploader run')
    path = candidates[-1]
    manifest = json.loads(path.read_text(encoding='utf-8'))
    # Prefer the explicit counter; fall back to the length of the file list.
    verified_count = manifest.get('counts', {}).get('verified', len(manifest.get('verified_files', [])))
    return {
        'path': str(path),
        'manifest': manifest,
        'verified_count': verified_count,
        'valid': manifest.get('operation_status') == 'UPLOAD_VERIFIED' and manifest.get('gate_status') == 'PASS' and manifest.get('rclone', {}).get('copy_exit_code') == 0 and manifest.get('rclone', {}).get('check_exit_code') == 0 and verified_count > 0,
    }


# Evidence document. gate_status stays 'ERROR' unless the flow below reaches
# its final PASS/FAIL decision. 'checks' and 'failures' are the live lists, so
# entries appended later are included when the summary is serialized.
summary = {
    'schema_name': 'vps_runtime_smoke_result',
    'schema_version': 1,
    'started_at_utc': started.isoformat().replace('+00:00', 'Z'),
    'ended_at_utc': None,
    'gate_status': 'ERROR',
    'production_ready': False,
    'app_dir': str(app_dir),
    'data_dir': str(data_dir),
    'raw_dir': str(raw_dir),
    'manifest_dir': str(manifest_dir),
    'collector_service': collector_service,
    'uploader_service': uploader_service,
    'wait_seconds': wait_seconds,
    'checks': checks,
    'failures': failures,
}

try:
    # 1. The collector must already be running under systemd.
    active = run(['systemctl', 'is-active', collector_service])
    if active['exit_code'] != 0:
        failures.append('collector service is not active under systemd')
        raise RuntimeError('collector service not active')

    # 2. Baseline: newest valid manifest and the fingerprint of its first raw file.
    before = latest_valid_after(0)
    before_mtime = Path(before['path']).stat().st_mtime
    old_raw = before['output_files'][0]
    old_raw_sha = old_raw['actual_sha256']
    old_raw_path = Path(old_raw['path'])

    # 3. Force one restart and confirm the unit is active again.
    restart = run(['systemctl', 'restart', collector_service])
    if restart['exit_code'] != 0:
        failures.append('collector service restart command failed')
        raise RuntimeError('restart failed')
    active_after = run(['systemctl', 'is-active', collector_service])
    if active_after['exit_code'] != 0:
        failures.append('collector service is not active after restart')
        raise RuntimeError('collector inactive after restart')

    # 4. Wait for a valid manifest newer than the baseline, then check that the
    #    baseline raw file is byte-identical and still parses to the same rows.
    after = latest_valid_after(before_mtime)
    old_rows_after, _ = parse_raw(old_raw_path)
    old_file_unchanged = sha256(old_raw_path) == old_raw_sha and old_rows_after == old_raw['rows_parsed']
    if not old_file_unchanged:
        failures.append('raw file from before restart changed or stopped parsing')

    # 5. Run the uploader once and look for a manifest written by that run.
    upload_start_mtime = time.time()
    upload_run = run(['systemctl', 'start', uploader_service])
    if upload_run['exit_code'] != 0:
        failures.append('uploader service start failed')
    try:
        # 2 s of slack on the threshold, presumably to tolerate coarse
        # filesystem mtime granularity (TODO confirm).
        upload = latest_upload_after(upload_start_mtime - 2)
        if not upload.get('valid'):
            failures.append('uploader did not produce a verified upload manifest with at least one verified file')
    except Exception as exc:
        upload = {'path': None, 'valid': False, 'error': str(exc)}
        failures.append(str(exc))

    # 6. Capture recent journal output of both units into `checks`.
    collector_logs = run(['journalctl', '-u', collector_service, '-n', '80', '--no-pager'])
    uploader_logs = run(['journalctl', '-u', uploader_service, '-n', '80', '--no-pager'])

    summary.update({
        'before_restart_collector': before,
        'after_restart_collector': after,
        'old_raw_file_unchanged_after_restart': old_file_unchanged,
        'upload_result': upload,
        'collector_log_check_exit_code': collector_logs['exit_code'],
        'uploader_log_check_exit_code': uploader_logs['exit_code'],
    })
    if after['valid'] and old_file_unchanged and upload.get('valid') and not failures:
        summary['gate_status'] = 'PASS'
    else:
        summary['gate_status'] = 'FAIL'
except Exception as exc:
    # Any abort leaves gate_status at 'ERROR' and records the reason.
    failures.append(str(exc))
    summary['exception'] = repr(exc)
finally:
    # Evidence is written on every path, including failures.
    summary['ended_at_utc'] = iso_now()
    evidence_path.parent.mkdir(parents=True, exist_ok=True)
    evidence_path.write_text(json.dumps(summary, indent=2, sort_keys=True) + '\n', encoding='utf-8')

print(f"SMOKE_EVIDENCE={evidence_path}")
print(f"SMOKE_GATE={summary['gate_status']}")
if summary['gate_status'] != 'PASS':
    sys.exit(1)
PY_SMOKE
# ---- systemd/polymarket-orderbook-uploader.service ----
# One-shot rclone archive upload. The unit has no [Install] section; it is
# triggered by polymarket-orderbook-uploader.timer (see Unit= in the timer) or
# by an explicit "systemctl start".
[Unit]
Description=Orderbooks archive upload via rclone
Documentation=file:/opt/orderbooks/docs/GOOGLE_DRIVE_OFFLOAD.md
Wants=network-online.target
After=network-online.target

[Service]
Type=oneshot
ExecStart=/bin/bash /opt/orderbooks/scripts/upload_archive_rclone.sh --execute
WorkingDirectory=/opt/orderbooks

# Unprivileged service account.
User=orderbooks
Group=orderbooks

# Defaults for the upload script; the optional environment file may override
# them (the leading "-" makes a missing file a non-error).
Environment=ORDERBOOKS_UPLOAD_DATA_DIR=/var/lib/orderbooks
Environment=ORDERBOOKS_UPLOAD_MANIFEST_DIR=/var/lib/orderbooks/manifests
Environment=ORDERBOOKS_UPLOAD_RAW_DIR=/var/lib/orderbooks/raw_orderbooks
Environment=ORDERBOOKS_UPLOAD_MIN_AGE_SECONDS=600
Environment=ORDERBOOKS_UPLOAD_RETENTION_DAYS=7
Environment=ORDERBOOKS_RCLONE_BIN=/usr/bin/rclone
EnvironmentFile=-/etc/orderbooks/orderbook-uploader.env

# Sandboxing: the file system is read-only except for the data root.
ProtectSystem=strict
ProtectHome=true
PrivateTmp=true
NoNewPrivileges=true
StateDirectory=orderbooks
ReadWritePaths=/var/lib/orderbooks

# Logging goes to the journal under a dedicated identifier.
SyslogIdentifier=polymarket-orderbook-uploader
StandardOutput=journal
StandardError=journal

# ---- systemd/polymarket-orderbook-uploader.timer ----
# Hourly trigger for the uploader service, spread by a random delay of up to
# ten minutes; Persistent=true lets a run missed while the timer was inactive
# be caught up on activation.
[Unit]
Description=Run orderbooks archive upload periodically
Documentation=file:/opt/orderbooks/docs/GOOGLE_DRIVE_OFFLOAD.md

[Timer]
Unit=polymarket-orderbook-uploader.service
OnCalendar=hourly
Persistent=true
RandomizedDelaySec=10min

[Install]
WantedBy=timers.target