Prepare Kubernetes orderbooks deployment

2026-04-18 11:23:28 +02:00 · 2026-04-18 11:23:28 +02:00 · 284e465588
commit 284e465588
42 changed files with 8640 additions and 0 deletions
--- a/.dockerignore
+++ b/.dockerignore
@ -0,0 +1,26 @@
 # Keep VCS data, caches, local runtime output and secret material out of the
 # image build context.
 # Patterns here are matched relative to the context root, so each file pattern
 # is also listed with a leading **/ to cover files in nested directories
 # (the same way rclone.conf is already handled).
 .git/
 .venv/
 __pycache__/
 **/__pycache__/
 *.pyc
 **/*.pyc
 *.pyo
 **/*.pyo
 .pytest_cache/
 .mypy_cache/
 .ruff_cache/
 artifacts/
 data/
 reports/
 orchestration/
 .env
 **/.env
 *.env
 **/*.env
 rclone.conf
 **/rclone.conf
 *.pem
 **/*.pem
 *.key
 **/*.key
 *.p12
 **/*.p12
 *.pfx
 **/*.pfx
 id_rsa*
 **/id_rsa*
 id_ed25519*
 **/id_ed25519*
 *mnemonic*
 **/*mnemonic*
 *wallet*
 **/*wallet*
 *credential*
 **/*credential*
 *secret*
 **/*secret*
--- a/.forgejo/workflows/deploy.yml
+++ b/.forgejo/workflows/deploy.yml
@ -0,0 +1,162 @@
 # Deploy workflow: runs on every push to main and on manual dispatch.
 name: deploy
 on:
  workflow_dispatch:
  push:
    branches: [main]
 jobs:
  deploy:
    # Job-wide settings. PROJECT_* values come from repository variables and
    # fall back to the orderbooks defaults; REGISTRY_HOST has no fallback.
    env:
      REGISTRY_HOST: "${{ vars.REGISTRY_HOST }}"
      IMAGE_TAG: "${{ github.sha }}"
      REPO_CLONE_URL: "${{ github.server_url }}/${{ github.repository }}.git"
      PROJECT_NAME: "${{ vars.PROJECT_NAME || 'orderbooks' }}"
      PROJECT_NAMESPACE: "${{ vars.PROJECT_NAMESPACE || 'orderbooks' }}"
      PROJECT_DEPLOYMENTS: "${{ vars.PROJECT_DEPLOYMENTS || 'orderbooks-collector' }}"
      PROJECT_REGISTRY_SECRET_NAME: "${{ vars.PROJECT_REGISTRY_SECRET_NAME || 'orderbooks-registry-creds' }}"
    runs-on: linux-amd64
    steps:
      - name: Install tooling
        run: |
          # Install git, kubectl and python3 only when the runner image lacks them.
          set -eu
          have_all_tools() {
            command -v git >/dev/null 2>&1 \
              && command -v kubectl >/dev/null 2>&1 \
              && command -v python3 >/dev/null 2>&1
          }
          if have_all_tools; then
            exit 0
          fi
          if command -v apk >/dev/null 2>&1; then
            apk add --no-cache git kubectl python3
            exit 0
          fi
          if command -v apt-get >/dev/null 2>&1; then
            apt-get update
            apt-get install -y git curl ca-certificates python3
            kubectl_version="$(curl -fsSL https://dl.k8s.io/release/stable.txt)"
            kubectl_url="https://dl.k8s.io/release/${kubectl_version}/bin/linux/amd64/kubectl"
            curl -fsSLo /tmp/kubectl "$kubectl_url"
            curl -fsSLo /tmp/kubectl.sha256 "${kubectl_url}.sha256"
            # Refuse to install a binary whose digest differs from the published one.
            echo "$(cat /tmp/kubectl.sha256)  /tmp/kubectl" | sha256sum -c -
            install -m 0755 /tmp/kubectl /usr/local/bin/kubectl
            rm -f /tmp/kubectl /tmp/kubectl.sha256
            exit 0
          fi
          echo "missing git/kubectl/python3 and no supported package manager found" >&2
          exit 1
      - name: Prepare workspace
        run: |
          # Create a unique scratch directory and export its path to later steps.
          base_dir="${RUNNER_TEMP:-/tmp}"
          base_dir="${base_dir%/}"
          scratch_dir="$(mktemp -d "$base_dir/orderbooks-deploy-XXXXXX")"
          echo "WORKSPACE_DIR=$scratch_dir" >> "$GITHUB_ENV"
          echo "runner workspace: $scratch_dir"
      - name: Load kubeconfig
        env:
          # Passed through the environment so the secret is never spliced into
          # the script text itself.
          KUBECONFIG_B64: ${{ secrets.KUBECONFIG_B64 }}
        run: |
          if [ -z "${KUBECONFIG_B64:-}" ]; then
            echo "KUBECONFIG_B64 secret is required" >&2
            exit 1
          fi
          # Cluster credentials must not be readable by other users on the runner.
          umask 077
          mkdir -p "$HOME/.kube"
          printf '%s' "$KUBECONFIG_B64" | base64 -d > "$HOME/.kube/config"
          chmod 600 "$HOME/.kube/config"
          # Fails the step early when the credentials cannot reach the cluster.
          kubectl get ns
      - name: Checkout repo
        env:
          REPO_TOKEN: ${{ github.token }}
        run: |
          # Shallow-clone the repository into the workspace and pin it to the
          # commit that triggered this run.
          authed_git() {
            git -c credential.username=oauth2 -c http.extraHeader="Authorization: Bearer ${REPO_TOKEN}" "$@"
          }
          authed_git clone --depth=1 "${REPO_CLONE_URL}" "$WORKSPACE_DIR"
          cd "$WORKSPACE_DIR"
          head_sha="$(git rev-parse HEAD)"
          if [ "$head_sha" = "$GITHUB_SHA" ]; then
            git checkout --detach "$head_sha"
          else
            # The default branch has moved on; fetch the exact commit instead.
            authed_git fetch --depth=1 origin "${GITHUB_SHA}"
            git checkout --detach "${GITHUB_SHA}"
          fi
          git rev-parse HEAD
      - name: Resolve deployment settings
        run: |
          # Derive the image reference and build Job name, then export both to
          # later steps.
          [ -n "${REGISTRY_HOST:-}" ] || {
            echo "REGISTRY_HOST repo variable is required" >&2
            exit 1
          }
          short_sha="$(printf '%s' "$GITHUB_SHA" | cut -c1-12)"
          echo "IMAGE=$REGISTRY_HOST/$PROJECT_NAME:$IMAGE_TAG" >> "$GITHUB_ENV"
          echo "BUILD_JOB=image-build-$short_sha" >> "$GITHUB_ENV"
      - name: Ensure namespace exists
        # Applied before the build step, which creates its Job in this namespace.
        run: kubectl apply -f "$WORKSPACE_DIR/deploy/k8s/base/namespace.yaml"
      - name: Build and push image in-cluster
        env:
          REPO_TOKEN: ${{ github.token }}
        run: |
          # Builds the image with a one-off kaniko Job. The clone token reaches the
          # pod through a short-lived Secret instead of being written into the Job
          # spec, and that Secret is removed when this step exits.
          TOKEN_SECRET="${BUILD_JOB}-token"
          remove_token_secret() {
            kubectl -n "$PROJECT_NAMESPACE" delete secret "$TOKEN_SECRET" --ignore-not-found=true || true
          }
          trap remove_token_secret EXIT
          kubectl -n "$PROJECT_NAMESPACE" delete job "$BUILD_JOB" --ignore-not-found=true
          remove_token_secret
          kubectl -n "$PROJECT_NAMESPACE" create secret generic "$TOKEN_SECRET" --from-literal=token="$REPO_TOKEN"
          # In the manifest below, \${REPO_TOKEN} is escaped so the pod's shell
          # expands it from the Secret-backed env var; the other variables are
          # filled in here on the runner.
          cat <<EOF | kubectl apply -f -
          apiVersion: batch/v1
          kind: Job
          metadata:
            name: ${BUILD_JOB}
            namespace: ${PROJECT_NAMESPACE}
          spec:
            backoffLimit: 0
            ttlSecondsAfterFinished: 3600
            template:
              spec:
                restartPolicy: Never
                volumes:
                  - name: workspace
                    emptyDir: {}
                  - name: registry-creds
                    secret:
                      secretName: ${PROJECT_REGISTRY_SECRET_NAME}
                      items:
                        - key: .dockerconfigjson
                          path: config.json
                initContainers:
                  - name: checkout
                    image: alpine/git:2.47.2
                    env:
                      - name: REPO_TOKEN
                        valueFrom:
                          secretKeyRef:
                            name: ${TOKEN_SECRET}
                            key: token
                      - name: REPO_CLONE_URL
                        value: ${REPO_CLONE_URL}
                      - name: GITHUB_SHA
                        value: ${GITHUB_SHA}
                    command: ["/bin/sh", "-lc"]
                    args:
                      - >-
                        git -c credential.username=oauth2 -c http.extraHeader="Authorization: Bearer \${REPO_TOKEN}" clone --depth=1 "${REPO_CLONE_URL}" /workspace &&
                        cd /workspace &&
                        git -c credential.username=oauth2 -c http.extraHeader="Authorization: Bearer \${REPO_TOKEN}" fetch --depth=1 origin "${GITHUB_SHA}" &&
                        git checkout --detach "${GITHUB_SHA}"
                    volumeMounts:
                      - name: workspace
                        mountPath: /workspace
                containers:
                  - name: kaniko
                    image: gcr.io/kaniko-project/executor:v1.23.2-debug
                    args:
                      - --context=/workspace
                      - --dockerfile=/workspace/Dockerfile
                      - --destination=${IMAGE}
                      - --cache=false
                    volumeMounts:
                      - name: workspace
                        mountPath: /workspace
                      - name: registry-creds
                        mountPath: /kaniko/.docker
          EOF
          # Poll for either outcome: waiting only for Complete would sit out the
          # whole 20 minutes when the build fails (backoffLimit is 0).
          deadline=$(( $(date +%s) + 1200 ))
          build_result=timeout
          while [ "$(date +%s)" -lt "$deadline" ]; do
            succeeded="$(kubectl -n "$PROJECT_NAMESPACE" get job "$BUILD_JOB" -o jsonpath='{.status.succeeded}' || true)"
            failed="$(kubectl -n "$PROJECT_NAMESPACE" get job "$BUILD_JOB" -o jsonpath='{.status.failed}' || true)"
            if [ "${succeeded:-0}" -ge 1 ]; then
              build_result=succeeded
              break
            fi
            if [ "${failed:-0}" -ge 1 ]; then
              build_result=failed
              break
            fi
            sleep 5
          done
          # Print the logs whatever the outcome; they matter most on failure.
          kubectl -n "$PROJECT_NAMESPACE" logs "job/$BUILD_JOB" --all-containers=true || true
          if [ "$build_result" != "succeeded" ]; then
            echo "image build job $BUILD_JOB did not complete: $build_result" >&2
            exit 1
          fi
      - name: Apply release manifests and wait for rollout
        run: |
          # Render the base, swap the placeholder image for the freshly built
          # one, and apply the result.
          kubectl kustomize "$WORKSPACE_DIR/deploy/k8s/base" \
            | IMAGE="$IMAGE" python3 -c 'import os, sys; sys.stdout.write(sys.stdin.read().replace("registry.doran.133011.xyz/orderbooks:bootstrap", os.environ["IMAGE"]))' \
            | kubectl apply -f -
          # PROJECT_DEPLOYMENTS is a comma-separated list. The trailing newline
          # matters: without it `read` hits EOF on the last (or only) name,
          # returns non-zero, and that deployment is silently skipped.
          printf '%s\n' "$PROJECT_DEPLOYMENTS" | tr ',' '\n' \
            | while IFS= read -r deployment; do
                # Tolerate "a, b" style lists by dropping stray whitespace.
                deployment="$(printf '%s' "$deployment" | tr -d '[:space:]')"
                [ -n "$deployment" ] || continue
                kubectl -n "$PROJECT_NAMESPACE" set image "deployment/$deployment" "*=$IMAGE"
                kubectl -n "$PROJECT_NAMESPACE" rollout status "deployment/$deployment" --timeout=300s
              done
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,43 @@
 # Runtime output and collected evidence are kept on the machine, not in git
 artifacts/
 data/
 orchestration/
 reports/
 # Python bytecode, tool caches and packaging output
 *.py[cod]
 __pycache__/
 .mypy_cache/
 .pytest_cache/
 .ruff_cache/
 *.egg-info/
 build/
 dist/
 # Virtualenvs and machine-local environment files
 .venv/
 .env
 *.env
 !.dockerignore
 !.gitignore
 # Credentials: kubeconfigs, rclone config, keys and wallet material
 *.kubeconfig
 kubeconfig*
 rclone.conf
 **/rclone.conf
 *.key
 *.pem
 *.p12
 *.pfx
 id_ed25519*
 id_rsa*
 *credential*
 *mnemonic*
 *secret*
 *wallet*
 # Editor and OS clutter
 .DS_Store
 .idea/
 .vscode/
--- a/AGENTS.md
+++ b/AGENTS.md
@ -0,0 +1,91 @@
 # Agent Instructions
 Project: Cross-Market Live Orderbook Archive
 This repository exists to preserve live market microstructure data that is usually lost: order books, spreads, liquidity, depth, timestamps, request metadata, and enough raw context to later decide whether a trading idea was observable, fillable, and reproducible at the time.
 The first market is Polymarket. Future markets may include NEAR-related venues and other prediction or crypto markets, but do not build generic multi-market infrastructure before the second market exists.
 ## Active Collaboration Model
 This project uses a two-role workflow:
 - `orchestrator`: coordinates checkpoints with the user, keeps scope narrow, records decisions, reviews evidence, states gates, and decides the next smallest step.
 - `builder`: works in a separate session to implement the active checkpoint artifacts, run commands, collect evidence, and write manifests/reports.
 The current primary chat session is the `orchestrator`. The orchestrator should not silently become the builder unless the user explicitly asks. The builder should treat `AGENTS.md`, `ROADMAP.md`, `docs/METHODOLOGY.md`, and the active checkpoint report as the durable source of instructions.
 Hand-offs between orchestrator and builder must be written to disk under `orchestration/` or `reports/checkpoints/` when they contain decisions, scope changes, endpoint findings, or validation results. Chat-only instructions are not enough for project-critical state.
 ## Non-Negotiable Rules
 1. Preserve raw data first. Raw API and websocket payloads are the source of truth. Derived datasets are secondary and must reference raw files.
 2. No trading. Do not add order placement, signing, private-key handling, wallet logic, strategy execution, or bot behavior.
 3. No secrets in the repo. Never commit API keys, rclone credentials, wallet material, cookies, or private endpoints.
 4. Every checkpoint needs durable evidence on disk: code or docs, config or run instructions, manifest/report, and validation evidence.
 5. Do not claim success without commands, outputs, files, checksums, or real collected data to support the claim.
 6. Do not delete mistakes. If an artifact is wrong, misleading, partial, or deprecated, preserve it and label it with a reason and replacement.
 7. Keep the scope narrow. No dashboard, database, ML, strategy, backtest, or generic framework until the roadmap gate allows it.
 8. Public data only unless a later checkpoint explicitly documents why authenticated public-data access is required.
 9. "Production-ready" is forbidden until the collector has completed a documented 24h soak test with acceptable quality.
 ## Expected Workflow
 For each checkpoint:
 1. Define the smallest useful checkpoint.
 2. Build only what is needed for that checkpoint.
 3. Validate with real commands and, when applicable, real public data.
 4. Write a machine-readable manifest and a short markdown note.
 5. State PASS, FAIL, or BLOCKED.
 6. Identify the strongest fake-progress risk.
 7. Recommend the next smallest step.
 8. Stop only when a real user or orchestrator decision is needed.
 ## Repository Conventions
 - `scripts/`: executable probes, discovery scripts, collectors, normalizers, and upload helpers.
 - `config/`: example configuration only. Real secrets and machine-local config stay outside git.
 - `docs/`: durable methodology, data contracts, operational runbooks, and endpoint notes.
 - `orchestration/prompts/`: prompts and templates used by future agents.
 - `data/probes/`: bounded endpoint probe outputs and probe notes.
 - `data/discovery/`: market discovery outputs and manifests.
 - `data/live_sample/`: short sample collector runs.
 - `data/normalized_sample/`: derived sample outputs generated from raw samples.
 - `data/manifests/`: machine-readable manifests for probes, collectors, normalization, uploads, and checkpoints.
 - `reports/`: human-readable checkpoint, soak test, and incident reports.
 - `systemd/`: VPS runtime units when added.
 The initial Polymarket implementation should remain simple scripts until the collector works. Introduce `collectors/<market_name>/` only when adding a second market or when duplication proves painful.
 ## Artifact Status Labels
 Every durable artifact should be treated as one of:
 - `valid`: current and usable.
 - `partial`: useful but incomplete.
 - `deprecated`: superseded by a newer artifact.
 - `invalid`: known to be wrong or misleading.
 When marking an artifact `deprecated` or `invalid`, write a sibling markdown note or manifest entry with:
 - original artifact path
 - status
 - reason
 - replacement path, if any
 - labeled_at_utc
 - labeled_by
 Do not remove the original artifact unless the user explicitly asks and there is a written reason.
 ## Adding New Market Connectors Later
 Before adding a second market, Polymarket must have working discovery, raw order-book collection, Google Drive offload, and a 24h soak test.
 When the gate is met:
 1. Create `collectors/<market_name>/` for market-specific code.
 2. Keep shared code minimal and concrete.
 3. Reuse the same raw-first file layout and manifest format.
 4. Document endpoint quirks, timestamp semantics, rate limits, and schema differences in `docs/`.
 5. Avoid abstract base classes until at least two real collectors expose repeated code that is painful to maintain.
--- a/Dockerfile
+++ b/Dockerfile
@ -0,0 +1,28 @@
 # Runtime image for the Polymarket collector loop; rclone is installed alongside.
 FROM python:3.12-slim
 # Python runtime switches.
 ENV PYTHONDONTWRITEBYTECODE=1
 ENV PYTHONUNBUFFERED=1
 # Default locations and interpreter exported to the scripts.
 ENV ORDERBOOKS_APP_DIR=/app
 ENV ORDERBOOKS_DATA_DIR=/var/lib/orderbooks
 ENV ORDERBOOKS_PYTHON=python3
 # OS packages; the apt lists are dropped in the same layer.
 RUN apt-get update \
    && apt-get install -y --no-install-recommends bash ca-certificates rclone \
    && rm -rf /var/lib/apt/lists/*
 # Dedicated unprivileged account with a fixed uid/gid of 10001.
 RUN groupadd --system --gid 10001 orderbooks \
    && useradd --system --uid 10001 --gid 10001 --home-dir /var/lib/orderbooks --shell /usr/sbin/nologin orderbooks
 WORKDIR /app
 COPY AGENTS.md ROADMAP.md ./
 COPY config/ config/
 COPY docs/ docs/
 COPY scripts/ scripts/
 # Make the shell scripts executable and hand the app and data trees to the
 # service account.
 RUN chmod +x scripts/*.sh \
    && mkdir -p \
        /var/lib/orderbooks/discovery \
        /var/lib/orderbooks/raw_orderbooks \
        /var/lib/orderbooks/manifests \
    && chown -R orderbooks:orderbooks /var/lib/orderbooks /app
 USER 10001:10001
 CMD ["/bin/bash", "/app/scripts/run_polymarket_collector_loop.sh"]
--- a/ROADMAP.md
+++ b/ROADMAP.md
@ -0,0 +1,212 @@
 # Roadmap
 Project: Cross-Market Live Orderbook Archive
 Goal: build a reliable, minimal, always-on archive of live market microstructure data so future research agents can test whether strategies were actually observable, fillable, and reproducible in real time.
 The roadmap is checkpoint-driven. Each checkpoint must leave durable artifacts, validation evidence, and an explicit gate result.
 ## Current Status
 - Latest completed checkpoint: Checkpoint 7, Google Drive Offload
 - Latest gate: PASS
 - Next checkpoint: Checkpoint 8, 24h Soak Test Plan
 - Initial market: Polymarket
 - Future market work: gated until Polymarket is stable
 ## Checkpoint 1: Project Scaffold And Methodology
 Goal: create the minimum repository structure and rules that keep future agents on track.
 Artifacts:
 - `AGENTS.md`
 - `ROADMAP.md`
 - `docs/METHODOLOGY.md`
 - `docs/DATA_CONTRACT.md`
 - `docs/OPERATIONS.md`
 - `orchestration/prompts/`
 Requirements:
 - Define project goal.
 - Define anti-fake-progress rules.
 - Define raw-first storage policy.
 - Define checkpoint reporting format.
 - Define no-trading/no-private-key policy.
 - Define how to label deprecated or misleading artifacts instead of deleting them.
 - Define how new market connectors should be added later.
 Pass condition: the repo contains durable project rules and the next checkpoint is specific enough to execute.
 ## Checkpoint 2: Polymarket Public Data Source Probe
 Goal: determine exactly which public Polymarket endpoints can support live collection.
 Questions:
 - How to discover active Polymarket markets?
 - How to filter BTC up/down markets?
 - How to resolve conditionId and token IDs?
 - How to fetch current order book for one token?
 - Is there a batch order-book endpoint?
 - Is there a market websocket for order-book updates?
 - Is there a trade websocket or recent trades endpoint?
 - What rate limits are documented or observed?
 - What fields are returned?
 - What timestamps exist?
 Artifacts:
 - `scripts/probe_polymarket_public_sources.py`
 - `data/probes/polymarket_public_sources_probe_v1.json`
 - `data/probes/polymarket_public_sources_probe_v1.md`
 Pass condition: we know the exact endpoint set and can fetch at least one active market metadata record and one current order book.
 ## Checkpoint 3: Minimal BTC Market Discovery
 Goal: build a small script that finds active BTC up/down Polymarket markets and resolves both outcome token IDs.
 Artifacts:
 - `scripts/discover_polymarket_btc_markets.py`
 - `data/discovery/polymarket_btc_markets_latest.json`
 - `data/discovery/polymarket_btc_markets_manifest.json`
 - `data/discovery/polymarket_btc_markets.md`
 Requirements:
 - Public endpoints only.
 - No trading.
 - No API keys unless strictly needed for public data.
 - Never store secrets in the repo.
 - Preserve raw metadata responses.
 - Write normalized market records with slug, question, conditionId, token IDs, outcomes, times, status, source, and `fetched_at_utc`.
 Pass condition: the script reliably outputs currently active BTC markets with token IDs.
 ## Checkpoint 4: Minimal Orderbook Snapshot Collector
 Goal: collect raw order-book snapshots for active BTC markets at a fixed interval.
 Artifacts:
 - `scripts/collect_polymarket_orderbooks.py`
 - `config/polymarket_collector.example.yaml`
 - `data/live_sample/...`
 - `data/manifests/orderbook_collector_sample_manifest.json`
 - `docs/POLYMARKET_COLLECTOR.md`
 Requirements:
 - Collect active BTC markets only.
 - Fetch order books for both outcome tokens.
 - Store raw API responses as gzip JSONL.
 - Add local `collected_at_utc`, collector version, endpoint URL, and request params.
 - Rotate files by hour or run.
 - Include a manifest with timing, markets, request counts, status codes, rows, output files, and checksums.
 - Handle graceful shutdown and rate limits.
 - Do not add a database.
 Pass condition: a 5-10 minute sample run creates valid compressed raw snapshots and a manifest.
 ## Checkpoint 5: Normalized Snapshot Extract
 Goal: create a derived normalized dataset from raw snapshots while preserving raw files as source of truth.
 Artifacts:
 - `scripts/normalize_polymarket_orderbooks.py`
 - `data/normalized_sample/...`
 - `data/manifests/orderbook_normalization_sample_manifest.json`
 - `docs/ORDERBOOK_SCHEMA.md`
 Pass condition: a sample raw file can be normalized and basic sanity checks pass.
 ## Checkpoint 6: VPS Runtime Package
 Goal: make the collector deployable on a small VPS.
 Artifacts:
 - `systemd/polymarket-orderbook-collector.service`
 - `config/polymarket_collector.vps.example.yaml`
 - `scripts/run_polymarket_collector_cycle.sh`
 - `docs/VPS_DEPLOYMENT.md`
 Uploader service and timer units are deferred to Checkpoint 7 with Google Drive
 offload. Creating empty uploader units in Checkpoint 6 would be fake progress.
 Pass condition: a user can follow docs on a VPS and run the collector.
 ## Checkpoint 7: Google Drive Offload
 Goal: add periodic upload to Google Drive using `rclone`.
 Artifacts:
 - `scripts/upload_archive_rclone.sh`
 - `config/rclone.example.md`
 - `docs/GOOGLE_DRIVE_OFFLOAD.md`
 - sample upload manifest format
 Pass condition: a dry-run and a real small test upload succeed and are documented.
 ## Checkpoint 8: 24h Soak Test Plan
 Goal: run the collector for a real 24h period and validate reliability.
 Artifacts:
 - `reports/soak_test_YYYY-MM-DD.md`
 - `data/manifests/...`
 Metrics:
 - uptime
 - markets tracked
 - total snapshots
 - missed interval estimate
 - API errors
 - rate limits
 - file sizes
 - compression ratio
 - Google Drive upload status
 - restart behavior
 - disk usage
 - data quality checks
 Pass condition: a 24h run completes with acceptable data quality and documented issues.
 ## Checkpoint 9: Add Second Market Only After Polymarket Is Stable
 Goal: prepare for NEAR or another market only after Polymarket collector reliability is proven.
 Do not start this checkpoint until:
 - Polymarket discovery works.
 - Polymarket order-book collection works.
 - Google Drive offload works.
 - The 24h soak test is complete.
 Architecture principles:
 - Use `collectors/<market_name>/` only when adding the second market.
 - Keep shared code minimal.
 - Avoid abstract base classes until duplication is painful.
 - Keep raw-first, normalized-second, manifest-always file format consistent across markets.
 ## Anti-Fake-Progress Gates
 - No dashboard before 24h data reliability.
 - No database before the file archive becomes painful.
 - No strategy or backtest code in this project.
 - No live trading.
 - No generic multi-market abstraction before the second market exists.
 - No claiming "production-ready" before a 24h soak test.
 - No deleting bad artifacts; label them deprecated or invalid and write why.
 ## Next Smallest Step
 Checkpoint 8 is next. It should run the collector for a real 24h period and record uptime, missed intervals, API errors, rate limits, upload status, restart behavior, disk usage, and data quality checks in a dated soak test report.
--- a/config/polymarket_collector.example.yaml
+++ b/config/polymarket_collector.example.yaml
@ -0,0 +1,20 @@
 # Sample settings for the bounded Checkpoint 4 Polymarket order-book run.
 # Nothing in this file is secret; the collector reads only public endpoints.
 # Input and output locations.
 discovery_path: data/discovery/polymarket_btc_markets_latest.json
 output_dir: data/live_sample
 manifest_path: data/manifests/orderbook_collector_sample_manifest.json
 # Order-book endpoint and request settings.
 clob_books_url: https://clob.polymarket.com/books
 request_timeout_seconds: 15
 max_retries: 2
 backoff_seconds: 2
 # The default sample is deliberately small to avoid unnecessary endpoint load.
 market_limit: 2
 interval_seconds: 30
 duration_seconds: 300
 # Markets too close to their end time are not tracked. The default is the
 # 5-minute sample duration plus a 2-minute buffer.
 market_end_safety_seconds: 420
--- a/config/polymarket_collector.vps.example.yaml
+++ b/config/polymarket_collector.vps.example.yaml
@ -0,0 +1,17 @@
 # Checkpoint 6 example config for running the raw Polymarket order-book
 # collector on a VPS. Copy it to /etc/orderbooks/polymarket_collector.vps.yaml
 # and adjust the paths if the service keeps its data somewhere else.
 # Input and output locations under the service data directory.
 discovery_path: /var/lib/orderbooks/discovery/polymarket_btc_markets_latest.json
 output_dir: /var/lib/orderbooks/raw_orderbooks
 manifest_path: /var/lib/orderbooks/manifests/polymarket_orderbook_collector_latest.json
 # Order-book endpoint and request settings.
 clob_books_url: https://clob.polymarket.com/books
 request_timeout_seconds: 15
 max_retries: 2
 backoff_seconds: 2
 # Collection settings; values match the Checkpoint 4 sample config.
 market_limit: 2
 interval_seconds: 30
 duration_seconds: 300
 market_end_safety_seconds: 420
--- a/config/rclone.example.md
+++ b/config/rclone.example.md
@ -0,0 +1,76 @@
 # rclone Configuration Example
 Status: valid
 This file documents the expected `rclone` setup for Checkpoint 7. It is not an
 `rclone.conf` file and must not be copied into the repository with private auth
 material.
 ## Remote Name
 The examples use this remote path:
 ```text
 gdrive:orderbooks/polymarket
 ```
 You may choose another remote name or folder. The uploader reads the destination
 from:
 ```text
 ORDERBOOKS_RCLONE_DEST
 ```
 For the systemd service, set it in:
 ```text
 /etc/orderbooks/orderbook-uploader.env
 ```
 Example:
 ```text
 ORDERBOOKS_RCLONE_DEST=gdrive:orderbooks/polymarket
 ```
 Do not place private auth files, browser tokens, API keys, wallet material, or
 session material in this repository.
 ## Configure Google Drive Outside The Repo
 Install `rclone` on the VPS, then configure the remote as the service user or
 with a root-managed config path that the service can read:
 ```sh
 sudo apt-get install -y rclone
 sudo -u orderbooks rclone config
 sudo -u orderbooks rclone lsd gdrive:
 ```
 If the service user uses the default rclone config path, keep that file outside
 the repository under the service user's home/config directory.
 ## Uploader Environment File
 Create:
 ```text
 /etc/orderbooks/orderbook-uploader.env
 ```
 Minimal example:
 ```text
 ORDERBOOKS_RCLONE_DEST=gdrive:orderbooks/polymarket
 ```
 Optional overrides:
 ```text
 ORDERBOOKS_UPLOAD_DATA_DIR=/var/lib/orderbooks
 ORDERBOOKS_UPLOAD_MIN_AGE_SECONDS=600
 ORDERBOOKS_UPLOAD_RETENTION_DAYS=7
 ORDERBOOKS_RCLONE_BIN=/usr/bin/rclone
 ```
 The environment file belongs on the VPS. Do not commit a machine-local version.
--- a/deploy/k8s/base/configmap.yaml
+++ b/deploy/k8s/base/configmap.yaml
@ -0,0 +1,25 @@
 # Collector settings, mounted into the collector pod as
 # /etc/orderbooks/polymarket_collector.yaml.
 kind: ConfigMap
 apiVersion: v1
 metadata:
  namespace: orderbooks
  name: orderbooks-collector-config
  labels:
    app.kubernetes.io/component: collector
    app.kubernetes.io/managed-by: kustomize
    app.kubernetes.io/name: orderbooks
    app.kubernetes.io/part-of: orderbooks
 data:
  # Same keys and values as config/polymarket_collector.vps.example.yaml.
  polymarket_collector.yaml: |
    discovery_path: /var/lib/orderbooks/discovery/polymarket_btc_markets_latest.json
    output_dir: /var/lib/orderbooks/raw_orderbooks
    manifest_path: /var/lib/orderbooks/manifests/polymarket_orderbook_collector_latest.json
    market_limit: 2
    interval_seconds: 30
    duration_seconds: 300
    market_end_safety_seconds: 420
    clob_books_url: https://clob.polymarket.com/books
    request_timeout_seconds: 15
    max_retries: 2
    backoff_seconds: 2
--- a/deploy/k8s/base/cronjob-uploader.yaml
+++ b/deploy/k8s/base/cronjob-uploader.yaml
@ -0,0 +1,92 @@
 # Scheduled run of upload_archive_rclone.sh against the shared data volume.
 kind: CronJob
 apiVersion: batch/v1
 metadata:
  namespace: orderbooks
  name: orderbooks-uploader
  labels:
    app.kubernetes.io/component: uploader
    app.kubernetes.io/name: orderbooks
    app.kubernetes.io/part-of: orderbooks
 spec:
  # Every 15 minutes; overlapping runs are not allowed.
  schedule: "*/15 * * * *"
  concurrencyPolicy: Forbid
  failedJobsHistoryLimit: 3
  successfulJobsHistoryLimit: 3
  jobTemplate:
    spec:
      # One attempt per run; finished Jobs are kept for a day.
      backoffLimit: 0
      ttlSecondsAfterFinished: 86400
      template:
        metadata:
          labels:
            app.kubernetes.io/component: uploader
            app.kubernetes.io/name: orderbooks
            app.kubernetes.io/part-of: orderbooks
        spec:
          restartPolicy: Never
          # Same non-root uid/gid that the image creates for the service user.
          securityContext:
            runAsNonRoot: true
            runAsUser: 10001
            runAsGroup: 10001
            fsGroup: 10001
            fsGroupChangePolicy: OnRootMismatch
          imagePullSecrets:
            - name: orderbooks-registry-creds
          volumes:
            - name: orderbooks-data
              persistentVolumeClaim:
                claimName: orderbooks-data
            - name: rclone-config
              secret:
                secretName: orderbooks-rclone-config
                items:
                  - key: rclone.conf
                    path: rclone.conf
          containers:
            - name: uploader
              # Placeholder tag; the deploy workflow substitutes the built image.
              image: registry.doran.133011.xyz/orderbooks:bootstrap
              imagePullPolicy: IfNotPresent
              command: [/bin/bash, /app/scripts/upload_archive_rclone.sh, --execute]
              securityContext:
                allowPrivilegeEscalation: false
                capabilities:
                  drop: [ALL]
              resources:
                limits:
                  cpu: 500m
                  memory: 512Mi
                requests:
                  cpu: 50m
                  memory: 128Mi
              volumeMounts:
                - name: orderbooks-data
                  mountPath: /var/lib/orderbooks
                - name: rclone-config
                  mountPath: /etc/rclone/rclone.conf
                  subPath: rclone.conf
                  readOnly: true
              env:
                # Directories on the data volume.
                - name: ORDERBOOKS_DATA_DIR
                  value: /var/lib/orderbooks
                - name: ORDERBOOKS_UPLOAD_DATA_DIR
                  value: /var/lib/orderbooks
                - name: ORDERBOOKS_UPLOAD_RAW_DIR
                  value: /var/lib/orderbooks/raw_orderbooks
                - name: ORDERBOOKS_UPLOAD_SOURCE_MANIFEST_DIR
                  value: /var/lib/orderbooks/manifests
                - name: ORDERBOOKS_UPLOAD_MANIFEST_DIR
                  value: /var/lib/orderbooks/manifests
                # Upload settings; numeric values are quoted because env values
                # must be strings.
                - name: ORDERBOOKS_UPLOAD_MIN_AGE_SECONDS
                  value: "600"
                - name: ORDERBOOKS_UPLOAD_RETENTION_DAYS
                  value: "7"
                # rclone binary, destination and mounted config file.
                - name: ORDERBOOKS_RCLONE_BIN
                  value: /usr/bin/rclone
                - name: ORDERBOOKS_RCLONE_DEST
                  value: gdrive:orderbooks/polymarket
                - name: RCLONE_CONFIG
                  value: /etc/rclone/rclone.conf
--- a/deploy/k8s/base/deployment-collector.yaml
+++ b/deploy/k8s/base/deployment-collector.yaml
@ -0,0 +1,86 @@
 # Single-replica Deployment that runs the Polymarket collector loop script.
 kind: Deployment
 apiVersion: apps/v1
 metadata:
  namespace: orderbooks
  name: orderbooks-collector
  labels:
    app.kubernetes.io/component: collector
    app.kubernetes.io/name: orderbooks
    app.kubernetes.io/part-of: orderbooks
 spec:
  # One replica, replaced with the Recreate strategy.
  replicas: 1
  strategy:
    type: Recreate
  selector:
    matchLabels:
      app.kubernetes.io/component: collector
      app.kubernetes.io/name: orderbooks
  template:
    metadata:
      labels:
        app.kubernetes.io/component: collector
        app.kubernetes.io/name: orderbooks
        app.kubernetes.io/part-of: orderbooks
    spec:
      terminationGracePeriodSeconds: 120
      # Same non-root uid/gid that the image creates for the service user.
      securityContext:
        runAsNonRoot: true
        runAsUser: 10001
        runAsGroup: 10001
        fsGroup: 10001
        fsGroupChangePolicy: OnRootMismatch
      imagePullSecrets:
        - name: orderbooks-registry-creds
      volumes:
        - name: orderbooks-data
          persistentVolumeClaim:
            claimName: orderbooks-data
        - name: collector-config
          configMap:
            name: orderbooks-collector-config
            items:
              - key: polymarket_collector.yaml
                path: polymarket_collector.yaml
      containers:
        - name: collector
          # Placeholder tag; the deploy workflow substitutes the built image.
          image: registry.doran.133011.xyz/orderbooks:bootstrap
          imagePullPolicy: IfNotPresent
          command: [/bin/bash, /app/scripts/run_polymarket_collector_loop.sh]
          securityContext:
            allowPrivilegeEscalation: false
            capabilities:
              drop: [ALL]
          resources:
            limits:
              cpu: 500m
              memory: 512Mi
            requests:
              cpu: 50m
              memory: 128Mi
          volumeMounts:
            - name: orderbooks-data
              mountPath: /var/lib/orderbooks
            - name: collector-config
              mountPath: /etc/orderbooks/polymarket_collector.yaml
              subPath: polymarket_collector.yaml
              readOnly: true
          env:
            # Application directory and interpreter.
            - name: ORDERBOOKS_APP_DIR
              value: /app
            - name: ORDERBOOKS_PYTHON
              value: python3
            # Collector config file mounted from the ConfigMap.
            - name: ORDERBOOKS_COLLECTOR_CONFIG
              value: /etc/orderbooks/polymarket_collector.yaml
            # Directories on the data volume.
            - name: ORDERBOOKS_DATA_DIR
              value: /var/lib/orderbooks
            - name: ORDERBOOKS_DISCOVERY_DIR
              value: /var/lib/orderbooks/discovery
            - name: ORDERBOOKS_OUTPUT_DIR
              value: /var/lib/orderbooks/raw_orderbooks
            - name: ORDERBOOKS_MANIFEST_DIR
              value: /var/lib/orderbooks/manifests
            # Quoted because env values must be strings.
            - name: ORDERBOOKS_LOOP_SLEEP_SECONDS
              value: "15"
--- a/deploy/k8s/base/kustomization.yaml
+++ b/deploy/k8s/base/kustomization.yaml
@ -0,0 +1,9 @@
 # Kustomize base for the orderbooks collector and uploader.
 kind: Kustomization
 apiVersion: kustomize.config.k8s.io/v1beta1
 # Manifests that make up the base.
 resources:
  - namespace.yaml
  - configmap.yaml
  - pvc.yaml
  - deployment-collector.yaml
  - cronjob-uploader.yaml
 # Namespace set on the resources above.
 namespace: orderbooks
--- a/deploy/k8s/base/namespace.yaml
+++ b/deploy/k8s/base/namespace.yaml
@ -0,0 +1,7 @@
 # Namespace for the orderbooks resources; the deploy workflow applies this
 # file on its own before it starts the image build Job.
 kind: Namespace
 apiVersion: v1
 metadata:
  name: orderbooks
  labels:
    app.kubernetes.io/part-of: orderbooks
    app.kubernetes.io/name: orderbooks
--- a/deploy/k8s/base/pvc.yaml
+++ b/deploy/k8s/base/pvc.yaml
@ -0,0 +1,15 @@
 # Persistent volume claim for collector data. The collector Deployment mounts
 # it at /var/lib/orderbooks (raw files, manifests, and discovery output).
 apiVersion: v1
 kind: PersistentVolumeClaim
 metadata:
  name: orderbooks-data
  namespace: orderbooks
  labels:
    app.kubernetes.io/name: orderbooks
    app.kubernetes.io/part-of: orderbooks
 spec:
  # ReadWriteOnce permits read-write mounting by a single node, so every pod
  # that shares this claim has to be scheduled onto the same node.
  accessModes:
    - ReadWriteOnce
  # NOTE(review): presumably the k3s local-path provisioner (node-local
  # storage) — confirm the storage class exists on the target cluster.
  storageClassName: local-path
  resources:
    requests:
      # Requested capacity for the claim.
      storage: 10Gi
--- a/docs/DATA_CONTRACT.md
+++ b/docs/DATA_CONTRACT.md
@ -0,0 +1,168 @@
 # Data Contract
 The archive is raw-first. Raw market data must be preserved before normalization, aggregation, upload, or analysis.
 ## Storage Principles
 - Store the raw response payload exactly as received whenever practical.
 - Add collector metadata beside the raw payload, not inside it.
 - Use UTC timestamps in ISO 8601 format with a `Z` suffix.
 - Use gzip JSONL for high-frequency snapshot data.
 - Rotate live collection files by hour or run.
 - Include checksums in manifests for all closed files.
 - Keep normalized files derived and traceable back to raw files.
 - Never store secrets, cookies, private keys, wallet material, or authenticated session state.
 ## Directory Layout
 Initial expected layout:
 ```text
 data/
  probes/
  discovery/
  live_sample/
  normalized_sample/
  manifests/
 reports/
  checkpoints/
 ```
 Future sustained collection layout:
 ```text
 data/
  raw/
    polymarket/
      orderbooks/
        YYYY/
          MM/
            DD/
              HH/
                polymarket_orderbooks_YYYYMMDDTHHMMSSZ.jsonl.gz
  normalized/
    polymarket/
      orderbooks/
        YYYY/
          MM/
            DD/
              polymarket_orderbooks_normalized_YYYYMMDD.jsonl.gz
  manifests/
 ```
 Do not create a database until compressed file archives are proven painful.
 ## Raw Orderbook Snapshot Envelope
 Checkpoint 4 should store one JSON object per line using this envelope or a documented successor:
 ```json
 {
  "schema_name": "raw_orderbook_snapshot",
  "schema_version": 1,
  "collector": {
    "name": "polymarket_orderbook_collector",
    "version": "0.1.0"
  },
  "market": {
    "market_name": "polymarket",
    "market_slug": "example-slug",
    "condition_id": "0x...",
    "token_id": "123",
    "outcome": "Yes"
  },
  "collection": {
    "collected_at_utc": "2026-04-14T20:53:49Z",
    "sequence": 1
  },
  "request": {
    "method": "GET",
    "url": "https://example.invalid/orderbook",
    "params": {
      "token_id": "123"
    },
    "status_code": 200,
    "duration_ms": 123
  },
  "raw": {}
 }
 ```
 `raw` is the unmodified response payload. If the endpoint returns text or bytes, record the encoding and store a lossless representation.
 ## Discovery Record Fields
 Checkpoint 3 normalized market records should include:
 - `market_name`
 - `market_slug`
 - `title` or `question`
 - `condition_id`
 - `tokens`
 - `outcomes`
 - `start_time_utc`, if available
 - `end_time_utc`, if available
 - `active`
 - `closed`
 - `endpoint_source`
 - `fetched_at_utc`
 - `raw_ref`
 `tokens` should preserve the mapping between outcome labels and token IDs.
 ## Normalized Snapshot Fields
 Checkpoint 5 normalized records should include:
 - `market_name`
 - `market_slug`
 - `condition_id`
 - `token_id`
 - `outcome`
 - `collected_at_utc`
 - `best_bid`
 - `best_ask`
 - `spread`
 - `midpoint`
 - `bid_depth_total`
 - `ask_depth_total`
 - `bid_depth_within_1c`
 - `ask_depth_within_1c`
 - `bid_depth_within_2c`
 - `ask_depth_within_2c`
 - `bid_depth_within_5c`
 - `ask_depth_within_5c`
 - `raw_file`
 - `raw_line_number`, when feasible
 Normalized data is invalid if it cannot reference the raw source record.
 ## Manifest Requirements
 Collection and transformation manifests should include:
 - manifest schema name and version
 - checkpoint or process name
 - start and end timestamps
 - market names and market IDs tracked
 - input files
 - output files
 - request counts
 - success and failure counts
 - status-code counts
 - row counts
 - checksums for closed files
 - command used
 - config path or config digest
 - warnings and known gaps
 - gate status
 Checksums should use SHA-256 unless a later report explains why another hash is used.
 ## Timestamp Policy
 - `collected_at_utc`: UTC timestamp from the collector's own clock, taken as close as possible to receipt of data.
 - `fetched_at_utc`: timestamp for metadata or discovery fetches.
 - Endpoint-provided timestamps must be preserved under their original field names in `raw`.
 - If endpoint timestamp semantics are unclear, write the ambiguity into the probe report.
--- a/docs/GOOGLE_DRIVE_OFFLOAD.md
+++ b/docs/GOOGLE_DRIVE_OFFLOAD.md
@ -0,0 +1,294 @@
 # Google Drive Offload
 Status: valid
 This document covers Checkpoint 7: offloading closed raw collector files and
 manifests to Google Drive with `rclone`.
 This checkpoint does not prove production readiness or 24/7 reliability. A real
 small upload must be run with a configured remote, and the later 24h soak test
 must still pass.
 ## Scope
 Included:
 - `scripts/upload_archive_rclone.sh`
 - `systemd/polymarket-orderbook-uploader.service`
 - `systemd/polymarket-orderbook-uploader.timer`
 - dry-run mode by default
 - real upload only with `--execute`
 - rclone verification with `rclone check`
 - per-run upload manifests
 - optional local cleanup only after successful verification
 Excluded:
 - dashboards
 - databases
 - strategies or backtests
 - trading, signing, order placement, or wallet logic
 - hardcoded private auth material
 ## Install rclone
 On Ubuntu or Debian:
 ```sh
 sudo apt-get update
 sudo apt-get install -y rclone
 ```
 Confirm:
 ```sh
 rclone version
 ```
 ## Configure A Google Drive Remote
 Configure the remote outside this repository. For a service-user setup:
 ```sh
 sudo -u orderbooks rclone config
 sudo -u orderbooks rclone lsd gdrive:
 ```
 The example remote path is:
 ```text
 gdrive:orderbooks/polymarket
 ```
 Any valid `rclone` destination may be used. The uploader reads it from:
 ```text
 ORDERBOOKS_RCLONE_DEST
 ```
 For systemd, create:
 ```text
 /etc/orderbooks/orderbook-uploader.env
 ```
 Example:
 ```text
 ORDERBOOKS_RCLONE_DEST=gdrive:orderbooks/polymarket
 ```
 Do not commit the machine-local rclone config or any private auth material.
 ## What Gets Uploaded
 By default the script targets:
 | Source | Default path |
 | --- | --- |
 | raw collector files | `/var/lib/orderbooks/raw_orderbooks` |
 | collector manifests | `/var/lib/orderbooks/manifests` |
 It does not target normalized sample files by default.
 Files modified within the last 10 minutes are skipped so that files the
 collector may still be writing are not uploaded:
 ```text
 ORDERBOOKS_UPLOAD_MIN_AGE_SECONDS=600
 ```
 The script preserves each file's path relative to the data directory on the
 remote. For example:
 ```text
 /var/lib/orderbooks/raw_orderbooks/polymarket/orderbooks/<run_id>/file.jsonl.gz
 ```
 uploads to:
 ```text
 <remote>/raw_orderbooks/polymarket/orderbooks/<run_id>/file.jsonl.gz
 ```
 ## Dry Run
 Dry-run is the default. It plans files, stages a temporary copy, invokes
 `rclone copy --dry-run`, and writes an upload manifest.
 Example for a VPS:
 ```sh
 /opt/orderbooks/scripts/upload_archive_rclone.sh \
  --data-dir /var/lib/orderbooks \
  --dest "$ORDERBOOKS_RCLONE_DEST"
 ```
 Example against the repository sample data:
 ```sh
 scripts/upload_archive_rclone.sh \
  --data-dir data \
  --dest gdrive:orderbooks/polymarket/checkpoint7-test \
  --manifest-path data/manifests/upload_archive_real_test_dry_run_manifest.json \
  --min-age-seconds 0 \
  --rclone-bin /usr/bin/rclone
 ```
 Dry-run does not prove remote write access.
 ## Execute Upload
 Run a real upload only after the remote is configured and the dry-run plan looks
 right:
 ```sh
 /opt/orderbooks/scripts/upload_archive_rclone.sh \
  --execute \
  --data-dir /var/lib/orderbooks \
  --dest "$ORDERBOOKS_RCLONE_DEST"
 ```
 The script runs:
 ```text
 rclone copy <staged files> <remote> --checksum
 rclone check <staged files> <remote> --one-way --checksum
 ```
 The upload gate is `PASS` only when the copy succeeds and verification succeeds.
 ## Retention And Cleanup
 Local files are kept by default, even after upload verification.
 Cleanup requires an explicit flag:
 ```sh
 /opt/orderbooks/scripts/upload_archive_rclone.sh \
  --execute \
  --cleanup-after-verify \
  --retention-days 7 \
  --data-dir /var/lib/orderbooks \
  --dest "$ORDERBOOKS_RCLONE_DEST"
 ```
 Cleanup deletes only files that were selected for upload, uploaded, and verified,
 and that are older than the retention window. The default retention window is 7 days.
 ## Upload Manifest
 Each run writes a manifest such as:
 ```text
 /var/lib/orderbooks/manifests/upload_archive_YYYYMMDDTHHMMSSZ.json
 ```
 The manifest records:
 - planned files
 - attempted files
 - dry-run files
 - uploaded files
 - verified files
 - skipped open or recent files
 - retained local files
 - deleted local files
 - SHA-256 checksums
 - command mode
 - start/end time
 - rclone copy/check exit codes
 - gate status
 For this repository, the sample manifest path is:
 ```text
 data/manifests/upload_archive_sample_manifest.json
 ```
 The verified Checkpoint 7 real-test manifest is:
 ```text
 data/manifests/upload_archive_real_test_manifest.json
 ```
 ## systemd Timer
 Install the unit files:
 ```sh
 sudo install -o root -g root -m 0644 /opt/orderbooks/systemd/polymarket-orderbook-uploader.service /etc/systemd/system/polymarket-orderbook-uploader.service
 sudo install -o root -g root -m 0644 /opt/orderbooks/systemd/polymarket-orderbook-uploader.timer /etc/systemd/system/polymarket-orderbook-uploader.timer
 sudo systemctl daemon-reload
 ```
 Create the environment file:
 ```sh
 sudo install -o root -g orderbooks -m 0640 /dev/null /etc/orderbooks/orderbook-uploader.env
 sudo editor /etc/orderbooks/orderbook-uploader.env
 ```
 At minimum, set:
 ```text
 ORDERBOOKS_RCLONE_DEST=gdrive:orderbooks/polymarket
 ```
 Enable the timer:
 ```sh
 sudo systemctl enable --now polymarket-orderbook-uploader.timer
 ```
 Run one upload immediately:
 ```sh
 sudo systemctl start polymarket-orderbook-uploader.service
 ```
 ## Logs
 Use the systemd journal:
 ```sh
 sudo systemctl status polymarket-orderbook-uploader.service
 sudo journalctl -u polymarket-orderbook-uploader.service -f
 sudo systemctl list-timers polymarket-orderbook-uploader.timer
 ```
 ## Current Checkpoint 7 Result
 Initial local validation was blocked because `rclone` was unavailable. The
 manifest from that blocked run remains at:
 ```text
 data/manifests/upload_archive_sample_manifest.json
 ```
 After `rclone` was configured as `/usr/bin/rclone` with remote `gdrive:`, a dry
 run and one tiny real upload were run against:
 ```text
 gdrive:orderbooks/polymarket/checkpoint7-test
 ```
 The real upload manifest records `rclone copy` exit code 0 and `rclone check`
 exit code 0:
 ```text
 data/manifests/upload_archive_real_test_manifest.json
 ```
 Current gate:
 ```text
 PASS
 ```
 ## What Remains Unproven
 - Long-run upload reliability.
 - Interaction between hourly uploads and a 24h collector soak test.
 - Retention cleanup after verified upload.
 - Production readiness.
--- a/docs/KUBERNETES_DEPLOYMENT.md
+++ b/docs/KUBERNETES_DEPLOYMENT.md
@ -0,0 +1,148 @@
 # Kubernetes Deployment
 Status: draft runtime package for Checkpoint 8G
 This document describes the Kubernetes package for the Polymarket raw
 order-book collector. It follows the shared Hetzner k3s cluster model from
 `../nuri/unrip3`: application code, Dockerfile, manifests, and Forgejo workflow
 live in this repository; platform services, the shared registry, and the shared
 Forgejo runner remain platform-owned.
 This package does not claim production readiness. Production readiness still
 requires a real Kubernetes runtime smoke run with preserved evidence.
 ## Cluster Decisions
 - Namespace: `orderbooks`
 - Workstation kubeconfig for validation: `../nuri/unrip3/.state/hetzner/kubeconfig.yaml`
 - Shared registry and shared Forgejo runner
 - Existing rclone Secret: `orderbooks/orderbooks-rclone-config`
 - Secret key mounted by the uploader: `rclone.conf`
 Do not commit or print rclone config contents.
 ## Runtime Layout
 The collector and uploader share one PVC:
 ```text
 PVC: orderbooks-data
 mount: /var/lib/orderbooks
 raw files: /var/lib/orderbooks/raw_orderbooks
 manifests: /var/lib/orderbooks/manifests
 discovery: /var/lib/orderbooks/discovery
 ```
 The collector uses one Deployment with one replica. The container runs
 `/app/scripts/run_polymarket_collector_loop.sh`, which repeatedly executes the
 existing bounded collector cycle and records loop failure/interruption manifests
 instead of relying on Kubernetes crash loops for normal operation.
 The uploader uses one CronJob. It runs the existing rclone uploader in execute
 mode, mounts the same PVC, mounts `orderbooks-rclone-config` read-only at
 `/etc/rclone/rclone.conf`, sets `RCLONE_CONFIG` to that file, and uploads only
 closed/aged files.
 ## Bootstrap This App Repo
 Run the orderbooks-specific bootstrap from this repository:
 ```sh
 scripts/deploy/bootstrap_orderbooks_k8s.sh
 ```
 The bootstrap loads platform defaults and resolved secrets from the local
 platform state without printing secret values. It ensures namespace `orderbooks`,
 creates or updates `orderbooks-registry-creds`, verifies the existing
 `orderbooks-rclone-config` secret has key `rclone.conf`, creates or updates the
 Forgejo repo `philipp/orderbooks`, and upserts the required Actions secret and
 variables.
 After bootstrap, push a clean source tree to Forgejo `main`. Do not push local
 `data/`, `artifacts/`, `reports/`, `orchestration/`, kubeconfigs, rclone config,
 `.env`, private keys, or other local evidence/secrets.
 ## Image Build And Deploy
 The Forgejo workflow is `.forgejo/workflows/deploy.yml`. It follows the shared
 runner pattern:
 1. load `KUBECONFIG_B64` from Forgejo secrets;
 2. clone this repo inside the runner;
 3. create an in-cluster Kaniko Job;
 4. build and push `REGISTRY_HOST/orderbooks:<git-sha>`;
 5. apply `deploy/k8s/base` with the built image;
 6. wait for `deployment/orderbooks-collector` rollout.
 Required Forgejo repo secret:
 ```text
 KUBECONFIG_B64
 ```
 Required Forgejo repo variable:
 ```text
 REGISTRY_HOST
 ```
 Project defaults used by the workflow:
 ```text
 PROJECT_NAME=orderbooks
 PROJECT_NAMESPACE=orderbooks
 PROJECT_DEPLOYMENTS=orderbooks-collector
 PROJECT_REGISTRY_SECRET_NAME=orderbooks-registry-creds
 ```
 The registry pull/build secret `orderbooks-registry-creds` must exist in the
 `orderbooks` namespace before the workflow builds and deploys.
 ## Pre-Deploy Validation
 From this repository:
 ```sh
 bash -n scripts/run_polymarket_collector_loop.sh
 bash -n scripts/k8s_runtime_smoke_check.sh
 kubectl kustomize deploy/k8s/base
 KUBECONFIG=../nuri/unrip3/.state/hetzner/kubeconfig.yaml \
   kubectl apply -k deploy/k8s/base --dry-run=server
 KUBECONFIG=../nuri/unrip3/.state/hetzner/kubeconfig.yaml \
   kubectl -n orderbooks get secret orderbooks-rclone-config \
   -o go-template='{{if index .data "rclone.conf"}}rclone_secret_key_present{{else}}rclone_secret_key_missing{{end}}{{"\n"}}'
 ```
 The last command checks only whether the key exists. It must not print secret
 data.
 ## Runtime Smoke Gate
 After the image is built and the workload is actually deployed, run:
 ```sh
 KUBECONFIG=../nuri/unrip3/.state/hetzner/kubeconfig.yaml \
   scripts/k8s_runtime_smoke_check.sh \
   --namespace orderbooks \
   --deployment orderbooks-collector \
   --cronjob orderbooks-uploader \
   --raw-dir /var/lib/orderbooks/raw_orderbooks \
   --manifest-dir /var/lib/orderbooks/manifests \
   --wait-seconds 1800 \
   --upload-min-age-seconds 600
 ```
 The smoke gate uses `kubectl`, not systemd. It writes local JSON evidence under
 `data/manifests/k8s_runtime_smoke_<UTC_TIMESTAMP>.json` by default. It verifies:
 - collector pod is running;
 - latest collector manifest has `gate_status: PASS`, `rows_written > 0`, and
  `failure_count: 0`;
 - raw gzip JSONL parses and is under `/var/lib/orderbooks/raw_orderbooks`;
 - deleting the collector pod does not corrupt the old raw file checksum or row
  count;
 - a later post-restart collector cycle writes valid rows;
 - an uploader Job created from the CronJob completes;
 - the latest upload manifest records a verified rclone upload with at least one
  verified file.
 A failed smoke run still writes JSON evidence and exits nonzero. Preserve failed
 manifests, raw files, upload manifests, and pod logs for review.
 ## Not Included
 - No trading, signing, wallets, private keys, or API keys.
 - No dashboard, database, strategy, backtest, or second-market connector.
 - No websocket rewrite.
 - No rclone config contents in this repository.
--- a/docs/METHODOLOGY.md
+++ b/docs/METHODOLOGY.md
@ -0,0 +1,104 @@
 # Methodology
 This project uses checkpoint-driven compound engineering. The point is to preserve useful data and operational learning, not to accumulate scaffolding.
 ## Checkpoint Cycle
 Every checkpoint follows the same loop:
 1. Define the smallest useful checkpoint.
 2. Build only what is required for that checkpoint.
 3. Validate with real commands and real data when applicable.
 4. Write durable artifacts: code or docs, config or run instructions, manifest/report, and validation evidence.
 5. State the gate: `PASS`, `FAIL`, `BLOCKED`, or `PARTIAL` (see Gate States below).
 6. Identify the strongest fake-progress risk.
 7. Recommend the next smallest step.
 8. Stop and ask only when a real decision is needed.
 ## Gate States
 - `PASS`: the checkpoint pass condition is met and evidence is on disk.
 - `FAIL`: the checkpoint was attempted but did not meet its pass condition.
 - `BLOCKED`: work cannot continue without a decision, credential, service, or unavailable dependency.
 - `PARTIAL`: useful artifacts exist, but the checkpoint should not be treated as passed.
 ## Evidence Rules
 - Evidence must be reproducible from files and commands, not just chat.
 - If a command was used to validate behavior, record the command and summarize the result in a report or manifest.
 - If data was collected, preserve raw data and include checksums.
 - If synthetic or sample data is used, label it explicitly.
 - If a claim depends on a public endpoint, record the endpoint, request parameters, response fields, status codes, timestamps, and fetch time.
 - Do not claim reliability from a short sample run. Reliability requires the roadmap soak test.
 ## Machine-Readable Manifest Format
 Checkpoint manifests should be JSON and stored under `data/manifests/`. Use this shape unless a later checkpoint documents a better schema:
 ```json
 {
  "checkpoint_id": 1,
  "checkpoint_name": "Project Scaffold And Methodology",
  "status": "PASS",
  "started_at_utc": "2026-04-14T20:53:49Z",
  "ended_at_utc": "2026-04-14T20:53:49Z",
  "scope": "Durable project rules and roadmap only; no collector implementation.",
  "artifacts": [
    {
      "path": "AGENTS.md",
      "kind": "project_rules",
      "status": "valid"
    }
  ],
  "validation": {
    "commands": [
      {
        "command": "git status --short",
        "result": "completed"
      }
    ],
    "summary": "Required files exist and contain checkpoint rules."
  },
  "decisions": [],
  "assumptions": [],
  "fake_progress_risk": "Most progress is documentation until public Polymarket endpoint behavior is proven.",
  "next_step": "Run Checkpoint 2 public source probe."
 }
 ```
 ## Markdown Checkpoint Report Format
 Checkpoint reports should be stored under `reports/checkpoints/` and include:
 - active checkpoint
 - scope
 - files created or changed
 - validation commands and results
 - project rules or operational lessons added
 - pass/fail/gate
 - strongest fake-progress risk
 - next smallest step
 ## Deprecated Or Misleading Artifacts
 Do not delete mistakes. Preserve the original artifact and label it.
 Preferred labels:
 - Add a manifest entry with `status: "deprecated"` or `status: "invalid"`.
 - Add a sibling note named `<artifact>.deprecated.md` or `<artifact>.invalid.md` when a human explanation is useful.
 - Include why the artifact is wrong, when it was labeled, who labeled it, and what replaces it.
 If an artifact is dangerous because it contains secrets, stop and ask the user. Do not spread or copy the secret into reports.
 ## Anti-Fake-Progress Rules
 - No dashboard before 24h data reliability.
 - No database before plain compressed files become painful.
 - No strategy, backtest, optimizer, or trading bot code.
 - No private-key or signing code.
 - No generic multi-market abstraction before a second market exists.
 - No "production-ready" claim before a 24h soak test.
 - No endpoint assumptions without probe evidence.
 - No normalized dataset that cannot trace back to raw records.
--- a/docs/OPERATIONS.md
+++ b/docs/OPERATIONS.md
@ -0,0 +1,93 @@
 # Operations
 This document defines operational rules before the collector exists. It should be updated with exact commands as checkpoints add scripts, services, and upload jobs.
 ## Current Operational Status
 - Collector implementation: not started.
 - Supported market: none yet; Polymarket is the first planned market.
 - Deployment target: small VPS.
 - Offload target: Google Drive through `rclone`.
 - Reliability status: not production-ready until a documented 24h soak test passes.
 ## Safety Rules
 - No trading.
 - No order placement.
 - No wallet signing.
 - No private keys.
 - No secrets in git.
 - No dashboards, databases, ML, or strategy code before the roadmap gate allows them.
 ## Local Runtime Principles
 Future scripts should:
 - accept a configurable data directory
 - write logs to a predictable location
 - write raw gzip JSONL snapshots
 - rotate files by hour or run
 - close files cleanly on shutdown
 - write manifests after runs
 - avoid corrupting closed files on restart
 - handle public endpoint errors and rate limits conservatively
 ## VPS Deployment Principles
 Checkpoint 6 should document:
 - Python version and virtualenv setup
 - package installation
 - environment variables
 - systemd or Docker Compose runtime
 - service user and file permissions
 - data directory ownership
 - log locations
 - restart policy
 - disk usage checks
 - safe upgrade and rollback steps
 ## Google Drive Offload Principles
 Checkpoint 7 should use `rclone` and must:
 - avoid hardcoded credentials
 - upload only closed or rotated files
 - support dry-run mode
 - verify upload success
 - preserve local files until upload is verified
 - maintain checksums
 - keep the last N days locally
 - write an upload manifest
 ## Incident And Bad-Data Handling
 If data looks wrong:
 1. Preserve the raw files.
 2. Stop relying on the affected derived files.
 3. Label the artifact `invalid` or `deprecated`.
 4. Write a short note explaining the issue and replacement, if any.
 5. Keep the learning in docs or reports.
 Examples of bad-data conditions:
 - endpoint returned a schema different from expected
 - token/outcome mapping was wrong
 - timestamps were misunderstood
 - rate limits caused large gaps
 - gzip file was not closed cleanly
 - upload succeeded but checksum did not match
 ## Minimum Reliability Claim
 A short sample run can prove that code writes files. It cannot prove 24/7 reliability.
 The project may only claim production readiness after:
 - discovery works
 - raw order-book collection works
 - offload works
 - 24h soak test completes
 - data quality and gap metrics are documented
--- a/docs/ORDERBOOK_SCHEMA.md
+++ b/docs/ORDERBOOK_SCHEMA.md
@ -0,0 +1,102 @@
 # Orderbook Snapshot Schema
 Status: valid
 This document covers the Checkpoint 5 normalized order-book sample. The raw
 gzip JSONL files remain the source of truth. Normalized rows are derived records
 for quick inspection and later quality checks.
 ## Normalized Snapshot
 Schema name: `normalized_orderbook_snapshot`
 Schema version: `1`
 File format: gzip JSONL, one JSON object per line.
 Sample location:
 ```text
 data/normalized_sample/polymarket/orderbooks/<run_id>/polymarket_orderbooks_normalized_<run_id>.jsonl.gz
 ```
 Every normalized row must reference exactly one raw gzip JSONL source row:
 - `raw_file`: repository-relative path to the raw gzip JSONL file.
 - `raw_line_number`: 1-based line number inside that raw gzip JSONL file.
 Derived data is invalid if either lineage field is missing or points to a
 missing raw file.
 ## Field Contract
 | Field | Type | Meaning |
 | --- | --- | --- |
 | `schema_name` | string | Always `normalized_orderbook_snapshot`. |
 | `schema_version` | number | Schema version, currently `1`. |
 | `market_name` | string | Market source name from the raw envelope. |
 | `market_slug` | string | Polymarket market slug from the raw envelope. |
 | `condition_id` | string | Polymarket condition ID from the raw envelope. |
 | `token_id` | string | Polymarket CLOB token ID from the raw envelope. |
 | `outcome` | string | Outcome label associated with `token_id`. |
 | `collected_at_utc` | string | Collector timestamp from the raw envelope. |
 | `best_bid` | string or null | Maximum bid price, or null when no bids exist. |
 | `best_ask` | string or null | Minimum ask price, or null when no asks exist. |
 | `spread` | string or null | `best_ask - best_bid` when both sides exist. |
 | `midpoint` | string or null | `(best_bid + best_ask) / 2` when both sides exist. |
 | `bid_depth_total` | string | Sum of all bid sizes. |
 | `ask_depth_total` | string | Sum of all ask sizes. |
 | `bid_depth_within_1c` | string | Sum of bid sizes priced at least `best_bid - 0.01`. |
 | `ask_depth_within_1c` | string | Sum of ask sizes priced at most `best_ask + 0.01`. |
 | `bid_depth_within_2c` | string | Sum of bid sizes priced at least `best_bid - 0.02`. |
 | `ask_depth_within_2c` | string | Sum of ask sizes priced at most `best_ask + 0.02`. |
 | `bid_depth_within_5c` | string | Sum of bid sizes priced at least `best_bid - 0.05`. |
 | `ask_depth_within_5c` | string | Sum of ask sizes priced at most `best_ask + 0.05`. |
 | `raw_file` | string | Repository-relative raw gzip JSONL path. |
 | `raw_line_number` | number | 1-based source line number in `raw_file`. |
 ## Numeric Encoding
 Prices and sizes are parsed with Python `Decimal`. Derived numeric values are
 emitted as exact decimal strings rather than JSON numbers. This keeps precision
 visible and avoids binary floating-point rounding.
 Missing price-derived values are emitted as `null`. Depth totals and depth bands
 are emitted as decimal strings and use `"0"` when the relevant side is empty.
 ## Calculation Rules
 - `best_bid`: maximum bid price.
 - `best_ask`: minimum ask price.
 - `spread`: `best_ask - best_bid` when both sides exist.
 - `midpoint`: `(best_bid + best_ask) / 2` when both sides exist.
 - `bid_depth_total`: sum of all bid sizes.
 - `ask_depth_total`: sum of all ask sizes.
 - `bid_depth_within_1c`: sum bid sizes with price greater than or equal to
  `best_bid - 0.01`.
 - `ask_depth_within_1c`: sum ask sizes with price less than or equal to
  `best_ask + 0.01`.
 - The same band rule is used for `0.02` and `0.05`.
 ## Sanity Rules
 A normalized file should pass these checks:
 - Output row count equals raw input row count unless skipped rows are recorded.
 - Every row has `raw_file` and `raw_line_number`.
 - Every referenced raw file exists.
 - `spread` is non-negative whenever both sides exist.
 - `midpoint` is between `best_bid` and `best_ask` whenever both sides exist.
 - Depth totals and band depths are non-negative.
 - At least one `Up` row and one `Down` row exist in the sample.
 - The gzip JSONL file decompresses and every line parses as JSON.
 - The manifest checksum matches the normalized output file.
 ## Current Known Gaps
 - This schema covers a derived sample extract only.
 - It does not define sustained daily normalized partitions.
 - It does not include upload, daemon runtime, dashboards, databases, strategy
  code, backtests, trading behavior, or wallet behavior.
 - Long-run schema stability still depends on future collection and soak-test
  evidence.
--- a/docs/POLYMARKET_COLLECTOR.md
+++ b/docs/POLYMARKET_COLLECTOR.md
@ -0,0 +1,149 @@
 # Polymarket Collector
 Artifact status: `valid`
 ## Scope
 This document covers the Checkpoint 4 bounded raw order-book sample collector.
 It does not describe a production service. It does not include normalization, upload, systemd, dashboards, databases, strategies, trading, wallet logic, private keys, API keys, or private endpoints.
 ## Inputs
 The collector reads active BTC markets from:
 ```text
 data/discovery/polymarket_btc_markets_latest.json
 ```
 Checkpoint 3 writes normalized market records with `condition_id` and `tokens` preserving the `Up` and `Down` outcome-token mapping. The collector uses only those records and does not perform market discovery itself.
 If the discovery file is stale or contains no usable active markets, run:
 ```sh
 python3 scripts/discover_polymarket_btc_markets.py
 ```
 ## Endpoint
 The sample uses the public CLOB batch order-book endpoint:
 ```text
 POST https://clob.polymarket.com/books
 ```
 Request body shape:
 ```json
 [
  {"token_id": "<up_token_id>"},
  {"token_id": "<down_token_id>"}
 ]
 ```
 No authentication is used.
 ## Running A Bounded Sample
 Default sample command:
 ```sh
 python3 scripts/collect_polymarket_orderbooks.py
 ```
 The default config is:
 ```text
 config/polymarket_collector.example.yaml
 ```
 The example config is deliberately small:
 - `market_limit: 2`
 - `interval_seconds: 30`
 - `duration_seconds: 300`
 - `market_end_safety_seconds: 420`
 This produces a 5-minute sample for at most 2 markets, fetching both `Up` and `Down` outcome tokens by batch request.
 ## Outputs
 Raw gzip JSONL snapshots are written under:
 ```text
 data/live_sample/polymarket/orderbooks/<run_id>/
 ```
 The sample manifest is written to:
 ```text
 data/manifests/orderbook_collector_sample_manifest.json
 ```
 Files rotate by run for this checkpoint. Hourly rotation is intentionally left for a later sustained runtime checkpoint.
 ## Raw JSONL Envelope
 Each gzip JSONL line is a raw-first envelope:
 ```json
 {
  "schema_name": "raw_orderbook_snapshot",
  "schema_version": 1,
  "collector": {
    "name": "polymarket_orderbook_collector",
    "version": "0.1.0"
  },
  "market": {
    "market_name": "polymarket",
    "market_slug": "example",
    "condition_id": "0x...",
    "token_id": "123",
    "outcome": "Up",
    "market_end_time_utc": "2026-04-14T22:00:00Z"
  },
  "collection": {
    "collected_at_utc": "2026-04-14T21:00:00Z",
    "sequence": 1,
    "response_index": 0
  },
  "request": {
    "method": "POST",
    "url": "https://clob.polymarket.com/books",
    "params": null,
    "json_body": [{"token_id": "123"}],
    "status_code": 200,
    "duration_ms": 123,
    "attempts": []
  },
  "raw": {}
 }
 ```
 The `raw` object is the unmodified order-book object returned by CLOB for that token.
 ## Rate-Limit Handling
 The sample is conservative:
 - Uses a small market cap by default.
 - Uses a fixed interval between batch requests.
 - Applies request timeout.
 - Retries `429` and `5xx` responses with exponential backoff.
 - Does not use concurrent requests.
 ## Shutdown
 `SIGINT` and `SIGTERM` set a stop flag. The current request, if any, finishes or times out, the gzip file closes, and the manifest is written with a shutdown warning.
 ## Known Gaps
 - This is a short run-rotated sample, not a daemon.
 - It does not prove 24/7 reliability.
 - It does not implement hourly rotation.
 - It does not refresh discovery during a run.
 - It does not normalize snapshots.
 - It does not upload files.
 - It does not use websockets.
 The project must not claim production readiness until the later 24h soak test passes with documented quality metrics.
--- a/docs/PRODUCTION_DEFINITION_OF_DONE.md
+++ b/docs/PRODUCTION_DEFINITION_OF_DONE.md
@ -0,0 +1,54 @@
 # Production Definition Of Done
 Status: ACTIVE
 Defined at UTC: 2026-04-17T09:12:02Z
 This project is done for the first production milestone only when it is reliably
 collecting Polymarket BTC order-book data on a small VPS with evidence on disk.
 Packaging, docs, local samples, and local soak tests are useful evidence, but
 not the finish line.
 ## Done Means
 1. The collector runs on the VPS under systemd using `/opt/orderbooks` for code
   and `/var/lib/orderbooks` for data.
 2. Raw gzip JSONL order-book snapshots are written for active BTC up/down
   markets, with manifests beside them.
 3. The service survives a forced restart: after restart, a later collection
   cycle writes valid raw rows without corrupting prior files.
 4. Temporary network/API failure is handled as an operational failure, not data
   loss: failures are visible in logs/manifests, and the next successful cycle
   resumes writing new files.
 5. Google Drive upload runs from the VPS through `rclone`, verifies success, and
   leaves local files in place until upload is confirmed.
 6. A final production report and machine-readable manifest record exact commands,
   timestamps, files, checksums, restart result, upload result, and remaining
   risks.
 ## Not Required For This Milestone
 - No second market.
 - No dashboard.
 - No database.
 - No strategy or backtest code.
 - No websocket rewrite unless polling proves insufficient.
 - No generic multi-market abstraction.
 ## Maximum Remaining Builder Turns
 The remaining work is capped at three builder turns:
 1. Accept deploy bundle and prepare the minimal VPS reliability gate.
 2. Execute or guide the VPS cutover and collect runtime evidence.
 3. Fix only blocking production issues found by the VPS gate, then write the
   final pass/fail report.
 If actual VPS access is unavailable, the gate must be `BLOCKED_NEEDS_VPS_ACCESS`,
 not production ready.
 ## Current Evidence
 - Deploy bundle gate: `DEPLOY_BUNDLE_READY`.
 - Local 24h soak final manifest exists but remains `NEEDS_REVIEW`.
 - Production readiness remains false until VPS runtime evidence exists.
--- a/docs/VPS_CUTOVER_RUNBOOK.md
+++ b/docs/VPS_CUTOVER_RUNBOOK.md
@ -0,0 +1,341 @@
 # VPS Cutover Runbook
 Status: valid
 Checkpoint 8 status is `WAIVED_BY_USER`, not `PASS`. This runbook prepares a
 VPS cutover for the existing Polymarket raw order-book collector only. It does
 not claim production readiness, second-market support, dashboards, databases,
 strategies, or trading.
 ## Scope
 Included:
 - VPS prerequisite checks.
 - Repository copy/update steps.
 - Public Polymarket collector service install.
 - Google Drive offload timer install with rclone.
 - Liveness, cycle health, and upload verification commands.
 - Rollback and stop commands.
 Excluded:
 - Private API access.
 - Wallets, keys, mnemonics, signing, order placement, or trading.
 - Database, dashboard, strategy, or second-market work.
 ## Recommended VPS Layout
 Use the existing package paths unless the VPS has a reason to differ:
 ```text
 repository: /opt/orderbooks
 python virtualenv: /opt/orderbooks/.venv
 config: /etc/orderbooks/polymarket_collector.vps.yaml
 collector env: /etc/orderbooks/polymarket-orderbook-collector.env
 uploader env: /etc/orderbooks/orderbook-uploader.env
 data root: /var/lib/orderbooks
 raw files: /var/lib/orderbooks/raw_orderbooks
 manifests: /var/lib/orderbooks/manifests
 discovery: /var/lib/orderbooks/discovery
 ```
 The `orderbooks` system user should own `/var/lib/orderbooks`. The repository
 under `/opt/orderbooks` can be root-owned and world-readable.
 ## VPS Prerequisites
 On Ubuntu or Debian:
 ```sh
 sudo apt-get update
 sudo apt-get install -y git python3 python3-venv rclone
 sudo useradd --system --home /var/lib/orderbooks --shell /usr/sbin/nologin orderbooks || true
 sudo mkdir -p /opt /etc/orderbooks /var/lib/orderbooks/discovery /var/lib/orderbooks/raw_orderbooks /var/lib/orderbooks/manifests /var/log/orderbooks
 sudo chown -R orderbooks:orderbooks /var/lib/orderbooks /var/log/orderbooks
 ```
 No API keys, private keys, mnemonics, wallets, or trading credentials are
 required by this project. rclone credentials are the only machine-local
 credential material expected for Google Drive offload, and they must stay
 outside the repository.
 ## Copy Or Update The Repository
 First install:
 ```sh
 cd /opt
 sudo git clone <repo-url> orderbooks
 ```
 Update an existing checkout:
 ```sh
 cd /opt/orderbooks
 sudo git fetch --all --prune
 sudo git pull --ff-only
 ```
 Prepare repository permissions and the Python virtualenv:
 ```sh
 cd /opt/orderbooks
 sudo chmod +x scripts/run_polymarket_collector_cycle.sh scripts/upload_archive_rclone.sh scripts/vps_preflight_check.sh scripts/vps_runtime_smoke_check.sh
 sudo python3 -m venv .venv
 sudo .venv/bin/python -m pip install --upgrade pip
 sudo chown -R root:root /opt/orderbooks
 sudo chmod -R a+rX /opt/orderbooks
 ```
 The current collector scripts use only the Python standard library, so no
 additional packages need to be installed into the virtualenv.
 ## Configure Public Collector Runtime
 Install the example config, then review it:
 ```sh
 sudo install -o root -g root -m 0644 /opt/orderbooks/config/polymarket_collector.vps.example.yaml /etc/orderbooks/polymarket_collector.vps.yaml
 sudo editor /etc/orderbooks/polymarket_collector.vps.yaml
 ```
 Optional collector env overrides:
 ```sh
 sudo install -o root -g orderbooks -m 0640 /dev/null /etc/orderbooks/polymarket-orderbook-collector.env
 sudo editor /etc/orderbooks/polymarket-orderbook-collector.env
 ```
 Example values:
 ```text
 ORDERBOOKS_DATA_DIR=/var/lib/orderbooks
 ORDERBOOKS_OUTPUT_DIR=/var/lib/orderbooks/raw_orderbooks
 ORDERBOOKS_DISCOVERY_MAX_PAGES=3
 ```
 ## Configure Rclone
 Configure rclone as the `orderbooks` user. Do not print or commit
 `rclone.conf`.
 ```sh
 sudo -u orderbooks rclone config
 sudo -u orderbooks rclone listremotes
 sudo -u orderbooks rclone lsf gdrive: --max-depth 1
 ```
 Create the uploader env file:
 ```sh
 sudo install -o root -g orderbooks -m 0640 /dev/null /etc/orderbooks/orderbook-uploader.env
 sudo editor /etc/orderbooks/orderbook-uploader.env
 ```
 Example:
 ```text
 ORDERBOOKS_RCLONE_DEST=gdrive:orderbooks/polymarket
 ORDERBOOKS_RCLONE_BIN=/usr/bin/rclone
 ORDERBOOKS_UPLOAD_MIN_AGE_SECONDS=600
 ```
 The uploader verifies uploads with `rclone check`. Dry runs do not prove remote
 write access.
 ## Run VPS Preflight
 Run the preflight before installing or starting services:
 ```sh
 cd /opt/orderbooks
 sudo -u orderbooks /opt/orderbooks/scripts/vps_preflight_check.sh \
  --app-dir /opt/orderbooks \
  --python-bin /opt/orderbooks/.venv/bin/python \
  --rclone-bin /usr/bin/rclone \
  --rclone-remote gdrive:orderbooks/polymarket \
  --data-dir /var/lib/orderbooks \
  --manifest-dir /var/lib/orderbooks/manifests \
  --log-dir /var/log/orderbooks \
  --min-free-gib 5
 ```
 The preflight does not print rclone configuration. It checks repository files,
 Python compilation, shell syntax, systemd unit parsing when available, rclone
 availability, optional remote readability, target directory writability, disk
 space, and the absence of required project secrets.
 ## Install Systemd Units
 Install collector and uploader units:
 ```sh
 sudo install -o root -g root -m 0644 /opt/orderbooks/systemd/polymarket-orderbook-collector.service /etc/systemd/system/polymarket-orderbook-collector.service
 sudo install -o root -g root -m 0644 /opt/orderbooks/systemd/polymarket-orderbook-uploader.service /etc/systemd/system/polymarket-orderbook-uploader.service
 sudo install -o root -g root -m 0644 /opt/orderbooks/systemd/polymarket-orderbook-uploader.timer /etc/systemd/system/polymarket-orderbook-uploader.timer
 sudo systemctl daemon-reload
 sudo systemd-analyze verify /etc/systemd/system/polymarket-orderbook-collector.service /etc/systemd/system/polymarket-orderbook-uploader.service /etc/systemd/system/polymarket-orderbook-uploader.timer
 ```
 Enable and start:
 ```sh
 sudo systemctl enable --now polymarket-orderbook-collector.service
 sudo systemctl enable --now polymarket-orderbook-uploader.timer
 ```
 Run one uploader cycle immediately after the collector has produced closed raw
 files:
 ```sh
 sudo systemctl start polymarket-orderbook-uploader.service
 ```
 Run the minimal runtime reliability smoke gate after both units are installed,
 rclone is configured, and at least one closed raw file is older than the
 uploader minimum age (default: 600 seconds):
 ```sh
 sudo /opt/orderbooks/scripts/vps_runtime_smoke_check.sh \
  --app-dir /opt/orderbooks \
  --data-dir /var/lib/orderbooks \
  --raw-dir /var/lib/orderbooks/raw_orderbooks \
  --manifest-dir /var/lib/orderbooks/manifests \
  --collector-service polymarket-orderbook-collector.service \
  --uploader-service polymarket-orderbook-uploader.service \
  --wait-seconds 900
 ```
 This command is the minimal production reliability gate. It records a JSON
 evidence manifest under `/var/lib/orderbooks/manifests/`, verifies a valid
 collector cycle, forces one collector service restart, verifies the prior raw
 gzip file still parses with the same checksum, waits for a later valid cycle,
 starts the uploader, and records upload success or failure evidence. Preserve
 failed smoke manifests and journal logs for review.
 ## Check Liveness
 Collector service:
 ```sh
 sudo systemctl status polymarket-orderbook-collector.service
 sudo journalctl -u polymarket-orderbook-collector.service --since "30 minutes ago"
 ```
 Uploader timer and service:
 ```sh
 sudo systemctl list-timers polymarket-orderbook-uploader.timer
 sudo systemctl status polymarket-orderbook-uploader.service
 sudo journalctl -u polymarket-orderbook-uploader.service --since "2 hours ago"
 ```
 Recent artifacts:
 ```sh
 find /var/lib/orderbooks/raw_orderbooks -type f -name '*.jsonl.gz' -printf '%TY-%Tm-%TdT%TH:%TM:%TS %s %p\n' | sort | tail
 find /var/lib/orderbooks/manifests -type f -name '*.json' -printf '%TY-%Tm-%TdT%TH:%TM:%TS %s %p\n' | sort | tail
 ```
 ## Check Latest Cycle Health
 Inspect the newest collector manifest:
 ```sh
 latest_collector="$(find /var/lib/orderbooks/manifests -type f -name 'polymarket_orderbook_collector_*.json' | sort | tail -n 1)"
 python3 -m json.tool "$latest_collector" | sed -n '1,180p'
 ```
 Minimum healthy signs:
 ```text
 gate_status: PASS
 rows_written: greater than 0
 failure_count: 0
 failures: []
 ```
 Verify the latest raw gzip parses and row count matches its manifest:
 ```sh
 python3 - "$latest_collector" <<'PY'
 import gzip
 import json
 import sys
 from pathlib import Path
 manifest = json.loads(Path(sys.argv[1]).read_text())
 for item in manifest.get("output_files", []):
    path = Path(item["path"])
    rows = 0
    with gzip.open(path, "rt", encoding="utf-8") as handle:
        for line in handle:
            if line.strip():
                json.loads(line)
                rows += 1
    print({"path": str(path), "rows": rows, "manifest_rows": item.get("rows"), "matches": rows == item.get("rows")})
 PY
 ```
 ## Verify Uploads
 Inspect the newest upload manifest:
 ```sh
 latest_upload="$(find /var/lib/orderbooks/manifests -type f -name 'upload_archive_*.json' | sort | tail -n 1)"
 python3 -m json.tool "$latest_upload" | sed -n '1,220p'
 ```
 Minimum healthy signs:
 ```text
 operation_status: UPLOAD_VERIFIED
 gate_status: PASS
 rclone.copy_exit_code: 0
 rclone.check_exit_code: 0
 counts.uploaded equals counts.verified
 ```
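 Manual remote spot-check without printing config. `ORDERBOOKS_RCLONE_DEST` is
 defined in `/etc/orderbooks/orderbook-uploader.env`, not in an interactive
 shell, so export it first or substitute the remote path directly (for example
 `gdrive:orderbooks/polymarket`):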
 ```sh
 sudo -u orderbooks rclone lsf "$ORDERBOOKS_RCLONE_DEST" --max-depth 2 | head
 ```
 ## Rollback Or Stop
 Stop uploader timer first:
 ```sh
 sudo systemctl disable --now polymarket-orderbook-uploader.timer
 sudo systemctl stop polymarket-orderbook-uploader.service
 ```
 Stop collector:
 ```sh
 sudo systemctl stop polymarket-orderbook-collector.service
 ```
 Disable collector if needed:
 ```sh
 sudo systemctl disable polymarket-orderbook-collector.service
 ```
 Preserve `/var/lib/orderbooks` and `/var/lib/orderbooks/manifests` for evidence.
 If an artifact is wrong, label it as invalid or deprecated in a sibling note
 rather than deleting it.
 ## Still Not Production Proven
 Because the local 24h soak wait was waived by the user, the following remain
 unproven:
 - A completed 24h collector run with reviewed final metrics.
 - 24h interaction between collector rotation and uploader timer.
 - VPS-specific long-run disk, network, rclone, and systemd behavior.
 - Retention cleanup behavior under verified upload load.
 Treat this as cutover preparation. The VPS is not deployed until the commands
 are run on the VPS and evidence is written.
--- a/docs/VPS_DEPLOYMENT.md
+++ b/docs/VPS_DEPLOYMENT.md
@ -0,0 +1,298 @@
 # VPS Deployment
 Status: valid
 This document covers the Checkpoint 6 systemd runtime package for the raw
 Polymarket order-book collector.
 It does not claim production readiness or 24/7 reliability. That remains gated
 on the later 24h soak test.
 ## Scope
 Included:
 - systemd service for the raw collector cycle
 - Python virtualenv setup
 - service user and directory permissions
 - configurable data directory
 - discovery refresh before each collector cycle
 - journal-based logs
 - safe restart model for finite collector runs
 Excluded:
 - Google Drive offload
 - `rclone`
 - uploader scripts, services, or timers
 - normalization changes
 - dashboards
 - databases
 - strategies or backtests
 - trading, order placement, signing, or wallet logic
 Uploader service and timer units are intentionally deferred to Checkpoint 7.
 ## Runtime Model
 The systemd service runs:
 ```text
 /opt/orderbooks/scripts/run_polymarket_collector_cycle.sh
 ```
 Each cycle:
 1. Refreshes BTC market discovery into the configured data directory.
 2. Runs `scripts/collect_polymarket_orderbooks.py` once.
 3. Writes run-rotated raw gzip JSONL files.
 4. Writes a per-cycle collector manifest.
 5. Exits after the configured finite duration.
 The unit uses `Restart=always`, so systemd starts the next cycle after the prior
 cycle exits or fails.
 The example config uses a 300 second collection cycle. This is deliberately
 short because current BTC up/down markets are short-lived and the collector
 refreshes discovery only before a cycle starts. Do not increase the cycle beyond
 the practical market horizon unless the collector later learns to refresh market
 selection during a run.
 ## Paths
 Default VPS paths:
 | Purpose | Path |
 | --- | --- |
 | Application checkout | `/opt/orderbooks` |
 | Python virtualenv | `/opt/orderbooks/.venv` |
 | Service config | `/etc/orderbooks/polymarket_collector.vps.yaml` |
 | Optional env override file | `/etc/orderbooks/polymarket-orderbook-collector.env` |
 | Data directory | `/var/lib/orderbooks` |
 | Discovery artifacts | `/var/lib/orderbooks/discovery` |
 | Raw order-book output base | `/var/lib/orderbooks/raw_orderbooks` |
 | Per-cycle manifests | `/var/lib/orderbooks/manifests` |
 Adjust these paths if the repository is installed somewhere other than
 `/opt/orderbooks`.
 ## Environment Variables
 The service defines safe defaults and can load overrides from:
 ```text
 /etc/orderbooks/polymarket-orderbook-collector.env
 ```
 Supported variables:
 | Variable | Default | Meaning |
 | --- | --- | --- |
 | `ORDERBOOKS_APP_DIR` | `/opt/orderbooks` | Repository checkout path. |
 | `ORDERBOOKS_DATA_DIR` | `/var/lib/orderbooks` | Base directory for data files. |
 | `ORDERBOOKS_PYTHON` | `/opt/orderbooks/.venv/bin/python` | Python interpreter. |
 | `ORDERBOOKS_COLLECTOR_CONFIG` | `/etc/orderbooks/polymarket_collector.vps.yaml` | Collector config path. |
 | `ORDERBOOKS_DISCOVERY_DIR` | `$ORDERBOOKS_DATA_DIR/discovery` | Discovery artifact directory. |
 | `ORDERBOOKS_OUTPUT_DIR` | `$ORDERBOOKS_DATA_DIR/raw_orderbooks` | Collector output base directory. |
 | `ORDERBOOKS_MANIFEST_DIR` | `$ORDERBOOKS_DATA_DIR/manifests` | Per-cycle manifest directory. |
 | `ORDERBOOKS_DISCOVERY_LIMIT` | `100` | Maximum Gamma events requested per discovery page. |
 | `ORDERBOOKS_DISCOVERY_MAX_PAGES` | `3` | Discovery page cap per cycle. |
 | `ORDERBOOKS_DISCOVERY_TIMEOUT` | `15` | Discovery request timeout in seconds. |
 Example override file:
 ```text
 ORDERBOOKS_DATA_DIR=/var/lib/orderbooks
 ORDERBOOKS_DISCOVERY_MAX_PAGES=3
 ```
 No API keys are required for this checkpoint.
 ## Install On Ubuntu Or Debian
 Run package and account setup as root or with `sudo`:
 ```sh
 sudo apt-get update
 sudo apt-get install -y git python3 python3-venv
 sudo useradd --system --home /var/lib/orderbooks --shell /usr/sbin/nologin orderbooks
 sudo mkdir -p /opt /etc/orderbooks /var/lib/orderbooks/discovery /var/lib/orderbooks/raw_orderbooks /var/lib/orderbooks/manifests
 ```
 Install or update the repository under `/opt/orderbooks`. One option is:
 ```sh
 cd /opt
 sudo git clone <repo-url> orderbooks
 ```
 If the checkout already exists:
 ```sh
 cd /opt/orderbooks
 sudo git pull --ff-only
 ```
 Prepare permissions:
 ```sh
 sudo chown -R root:root /opt/orderbooks
 sudo chmod -R a+rX /opt/orderbooks
 sudo chmod +x /opt/orderbooks/scripts/run_polymarket_collector_cycle.sh
 sudo chown -R orderbooks:orderbooks /var/lib/orderbooks
 ```
 Create the virtualenv:
 ```sh
 cd /opt/orderbooks
 sudo python3 -m venv .venv
 sudo .venv/bin/python -m pip install --upgrade pip
 sudo chown -R root:root .venv
 sudo chmod -R a+rX .venv
 ```
 The current Checkpoint 6 scripts use only the Python standard library.
 Install the VPS config and service unit:
 ```sh
 sudo install -o root -g root -m 0644 /opt/orderbooks/config/polymarket_collector.vps.example.yaml /etc/orderbooks/polymarket_collector.vps.yaml
 sudo install -o root -g root -m 0644 /opt/orderbooks/systemd/polymarket-orderbook-collector.service /etc/systemd/system/polymarket-orderbook-collector.service
 ```
 Review `/etc/orderbooks/polymarket_collector.vps.yaml` before starting the
 service. The example writes under `/var/lib/orderbooks`.
 Enable and start:
 ```sh
 sudo systemctl daemon-reload
 sudo systemctl enable --now polymarket-orderbook-collector.service
 ```
 ## Logs And Status
 Use the systemd journal:
 ```sh
 sudo systemctl status polymarket-orderbook-collector.service
 sudo journalctl -u polymarket-orderbook-collector.service -f
 ```
 Recent logs without following:
 ```sh
 sudo journalctl -u polymarket-orderbook-collector.service --since "1 hour ago"
 ```
 ## Output Files
 Raw gzip JSONL files are written under:
 ```text
 /var/lib/orderbooks/raw_orderbooks/polymarket/orderbooks/<run_id>/
 ```
 Per-cycle manifests are written under:
 ```text
 /var/lib/orderbooks/manifests/polymarket_orderbook_collector_<cycle_id>.json
 ```
 Discovery artifacts are refreshed under:
 ```text
 /var/lib/orderbooks/discovery/
 ```
 ## Restart And Stop Behavior
 The unit uses:
 ```text
 Restart=always
 RestartSec=30s
 TimeoutStopSec=90s
 KillSignal=SIGTERM
 KillMode=control-group
 ```
 The collector handles `SIGTERM` by finishing or timing out the current request,
 closing the gzip output, and writing the manifest. Every cycle writes to a new
 run directory, so closed files are not reopened by the next cycle.
 Stop the service with:
 ```sh
 sudo systemctl stop polymarket-orderbook-collector.service
 ```
 Start it again with:
 ```sh
 sudo systemctl start polymarket-orderbook-collector.service
 ```
 ## Local Validation Without Starting The Service
 These checks do not require root:
 ```sh
 python3 -m py_compile scripts/discover_polymarket_btc_markets.py scripts/collect_polymarket_orderbooks.py
 bash -n scripts/run_polymarket_collector_cycle.sh
 python3 - <<'PY'
 from pathlib import Path
 from scripts.collect_polymarket_orderbooks import load_flat_yaml
 cfg = load_flat_yaml(Path('config/polymarket_collector.vps.example.yaml'))
 required = {
    'discovery_path',
    'output_dir',
    'manifest_path',
    'market_limit',
    'interval_seconds',
    'duration_seconds',
 }
 missing = sorted(required - set(cfg))
 assert not missing, missing
 assert cfg['duration_seconds'] > 0
 print('config parse ok')
 PY
 ```
 If systemd tools are available locally:
 ```sh
 systemd-analyze verify systemd/polymarket-orderbook-collector.service
 ```
 The local machine may not have `/opt/orderbooks` or the `orderbooks` service
 user. Treat missing VPS path or user messages as deployment-environment warnings,
 not collector syntax failures.
 ## Safe Upgrade
 Stop the service, update files, rerun validation, then start the service:
 ```sh
 sudo systemctl stop polymarket-orderbook-collector.service
 cd /opt/orderbooks
 sudo git pull --ff-only
 sudo .venv/bin/python -m py_compile scripts/discover_polymarket_btc_markets.py scripts/collect_polymarket_orderbooks.py
 sudo systemctl daemon-reload
 sudo systemctl start polymarket-orderbook-collector.service
 ```
 Do not remove existing data files during an upgrade. If a bad artifact is found,
 preserve it and label it invalid or deprecated with a replacement path when one
 exists.
 ## Current Limits
 - This package runs the existing raw collector; it does not add a daemon inside
  Python.
 - The systemd loop is a restart model around finite collector cycles.
 - It does not upload files.
 - It does not prove long-run reliability.
 - Production readiness remains blocked until discovery, raw collection, offload,
  and a documented 24h soak test all pass.
--- a/scripts/build_vps_deploy_bundle.sh
+++ b/scripts/build_vps_deploy_bundle.sh
@ -0,0 +1,366 @@
 #!/usr/bin/env bash
 # Fail fast: abort on command errors, unset variables, and failed pipeline stages.
 set -euo pipefail
 # Defaults come from the environment; the --app-dir, --output-dir, and
 # --timestamp options parsed further down override them.
 APP_DIR="${ORDERBOOKS_APP_DIR:-$(pwd)}"
 OUTPUT_DIR="${ORDERBOOKS_VPS_BUNDLE_OUTPUT_DIR:-artifacts/vps}"
 TIMESTAMP="${ORDERBOOKS_VPS_BUNDLE_TIMESTAMP:-$(date -u +%Y%m%dT%H%M%SZ)}"
 # Both artifacts share one timestamped basename; "%/" drops a single trailing
 # slash from the output directory before the file name is appended.
 BUNDLE_BASENAME="orderbooks_vps_deploy_${TIMESTAMP}"
 TARBALL="${OUTPUT_DIR%/}/${BUNDLE_BASENAME}.tar.gz"
 MANIFEST="${OUTPUT_DIR%/}/${BUNDLE_BASENAME}_manifest.json"
 # Print the command-line help text to stdout. Callers redirect it to stderr
 # when reporting an argument error.
 usage() {
  cat <<'EOF'
 Usage: scripts/build_vps_deploy_bundle.sh [options]
 Build a deployable VPS bundle from the current working tree. The bundle is
 intended to be copied to a VPS and unpacked under /opt/orderbooks.
 Options:
  --app-dir DIR       Source working tree. Default: ORDERBOOKS_APP_DIR or current directory.
  --output-dir DIR    Bundle output directory. Default: artifacts/vps.
  --timestamp TS      Override UTC timestamp used in artifact names.
  --help              Show this help.
 The bundle uses a narrow allowlist and excludes live data, caches, git metadata,
 virtualenvs, rclone config, private keys, wallets, mnemonics, and generated
 artifacts. It does not print secrets and does not write Python bytecode.
 EOF
 }
 # Exit with usage (status 2) when an option that takes a value is the final
 # argument. Without this guard, `set -u` aborts on "$2" with an opaque
 # "unbound variable" error and no hint about which option was incomplete.
 require_option_value() {
  if [[ $# -lt 2 ]]; then
    echo "Missing value for option: $1" >&2
    usage >&2
    exit 2
  fi
 }
 # Parse command-line options. TARBALL and MANIFEST are re-derived whenever one
 # of their inputs changes, so the options may be given in any order.
 while [[ $# -gt 0 ]]; do
  case "$1" in
    --app-dir)
      require_option_value "$@"
      APP_DIR="$2"
      shift 2
      ;;
    --output-dir)
      require_option_value "$@"
      OUTPUT_DIR="$2"
      TARBALL="${OUTPUT_DIR%/}/${BUNDLE_BASENAME}.tar.gz"
      MANIFEST="${OUTPUT_DIR%/}/${BUNDLE_BASENAME}_manifest.json"
      shift 2
      ;;
    --timestamp)
      require_option_value "$@"
      TIMESTAMP="$2"
      BUNDLE_BASENAME="orderbooks_vps_deploy_${TIMESTAMP}"
      TARBALL="${OUTPUT_DIR%/}/${BUNDLE_BASENAME}.tar.gz"
      MANIFEST="${OUTPUT_DIR%/}/${BUNDLE_BASENAME}_manifest.json"
      shift 2
      ;;
    --help)
      usage
      exit 0
      ;;
    *)
      echo "Unknown argument: $1" >&2
      usage >&2
      exit 2
      ;;
  esac
 done
 # Normalise the source directory and refuse to continue if it is missing.
 APP_DIR="${APP_DIR%/}"
 if [[ ! -d "${APP_DIR}" ]]; then
  echo "Source app directory does not exist: ${APP_DIR}" >&2
  exit 1
 fi
 # Enter the source tree BEFORE creating the output directory. Every later use
 # of TARBALL and MANIFEST happens with the app directory as the working
 # directory, so a relative --output-dir must be created relative to it as well.
 # Creating it first (from the caller's directory) left the real target missing
 # whenever --app-dir pointed somewhere other than the current directory.
 cd "${APP_DIR}"
 mkdir -p "${OUTPUT_DIR}"
 # Bundles are evidence artifacts: never overwrite an existing one.
 if [[ -e "${TARBALL}" || -e "${MANIFEST}" ]]; then
  echo "Refusing to overwrite existing bundle artifact: ${TARBALL} or ${MANIFEST}" >&2
  exit 1
 fi
 # Temporary NUL-delimited file list handed from the selection step to tar;
 # removed on any exit path.
 FILELIST="$(mktemp)"
 trap 'rm -f "${FILELIST}"' EXIT
 PYTHONDONTWRITEBYTECODE=1 python3 - "${FILELIST}" "${MANIFEST}" "${TARBALL}" "${TIMESTAMP}" <<'PY_BUNDLE_SELECT'
 import datetime as dt
 import fnmatch
 import hashlib
 import json
 import os
 import sys
 from pathlib import Path
 # Positional arguments supplied by the wrapper script, in order:
 # file-list path, manifest path, tarball path, timestamp label.
 filelist_path = Path(sys.argv[1])
 manifest_path = Path(sys.argv[2])
 tarball_path = Path(sys.argv[3])
 timestamp = sys.argv[4]
 # The wrapper has already changed into the source tree, so cwd is the bundle root.
 root = Path.cwd()
 # Allowlist, part 1: individual files taken from the root.
 allowed_files = [
    Path("AGENTS.md"),
    Path("ROADMAP.md"),
 ]
 # Allowlist, part 2: directories walked recursively.
 allowed_dirs = [
    Path("config"),
    Path("docs"),
    Path("scripts"),
    Path("systemd"),
    Path("reports/checkpoints"),
 ]
 # Allowlist, part 3: glob patterns evaluated against the root.
 allowed_globs = [
    "data/manifests/checkpoint_*.json",
 ]
 # Recorded verbatim in the manifest as documentation of intent. The filtering
 # itself is performed by is_forbidden() using the forbidden_* collections below.
 excluded_patterns = [
    ".git/",
    ".venv/",
    "artifacts/",
    "data/soak_test/",
    "data/live_sample/",
    "data/normalized_sample/",
    "**/__pycache__/",
    "**/*.pyc",
    "**/*.pyo",
    "**/.pytest_cache/",
    "**/.mypy_cache/",
    "**/.ruff_cache/",
    "**/rclone.conf",
    "**/.env",
    "**/*.pem",
    "**/*.key",
    "**/*.p12",
    "**/*.pfx",
    "**/id_rsa*",
    "**/id_ed25519*",
    "**/*mnemonic*",
    "**/*wallet*",
    "**/*credential*",
    "**/*secret*",
 ]
 # Files that must be present in the selection; the script exits if any is missing.
 required_files = [
    "AGENTS.md",
    "ROADMAP.md",
    "config/polymarket_collector.vps.example.yaml",
    "config/rclone.example.md",
    "docs/VPS_CUTOVER_RUNBOOK.md",
    "docs/VPS_DEPLOYMENT.md",
    "docs/GOOGLE_DRIVE_OFFLOAD.md",
    "scripts/build_vps_deploy_bundle.sh",
    "scripts/vps_preflight_check.sh",
    "scripts/vps_runtime_smoke_check.sh",
    "scripts/run_polymarket_collector_cycle.sh",
    "scripts/upload_archive_rclone.sh",
    "scripts/discover_polymarket_btc_markets.py",
    "scripts/collect_polymarket_orderbooks.py",
    "scripts/normalize_polymarket_orderbooks.py",
    "systemd/polymarket-orderbook-collector.service",
    "systemd/polymarket-orderbook-uploader.service",
    "systemd/polymarket-orderbook-uploader.timer",
 ]
 # Substrings tested against the path wrapped in slashes ("/<rel>" or "/<rel>/").
 forbidden_path_fragments = [
    "/.git/",
    "/.venv/",
    "/__pycache__/",
    "/data/soak_test/",
    "/data/live_sample/",
    "/data/normalized_sample/",
    "/artifacts/",
 ]
 # Exact file names (compared lower-cased) that are never bundled.
 forbidden_names = {
    "rclone.conf",
    ".env",
    "id_rsa",
    "id_ed25519",
 }
 # File suffixes (compared lower-cased) that are never bundled.
 forbidden_suffixes = {
    ".pyc",
    ".pyo",
    ".pem",
    ".key",
    ".p12",
    ".pfx",
 }
 # Substrings that mark a file name as likely secret material.
 secretish_name_tokens = [
    "mnemonic",
    "wallet",
    "credential",
    "secret",
 ]
 def as_posix(path: Path) -> str:
    """Render ``path`` as a string that uses forward slashes as separators."""
    rendered = path.as_posix()
    return rendered
 def is_forbidden(path: Path) -> tuple[bool, str | None]:
    rel = as_posix(path)
    wrapped = f"/{rel}/" if path.is_dir() else f"/{rel}"
    if path.is_absolute() or ".." in path.parts:
        return True, "absolute_or_parent_path"
    for fragment in forbidden_path_fragments:
        if fragment in wrapped:
            return True, f"forbidden_fragment:{fragment}"
    if any(part in {".git", ".venv", "__pycache__", ".pytest_cache", ".mypy_cache", ".ruff_cache"} for part in path.parts):
        return True, "forbidden_cache_or_metadata_dir"
    lower_name = path.name.lower()
    if lower_name in forbidden_names:
        return True, f"forbidden_name:{path.name}"
    if path.suffix.lower() in forbidden_suffixes:
        return True, f"forbidden_suffix:{path.suffix}"
    if any(token in lower_name for token in secretish_name_tokens):
        return True, f"secretish_name:{path.name}"
    if rel.startswith(("data/soak_test/", "data/live_sample/", "data/normalized_sample/", "artifacts/")):
        return True, "forbidden_prefix"
    return False, None
 def iter_allowed_files():
    """Yield every allowlisted regular file once, as a root-relative path.

    Sources are visited in a fixed order: the explicit ``allowed_files``, then
    each ``allowed_dirs`` tree (sorted), then each ``allowed_globs`` pattern
    (sorted). Entries that do not exist are skipped silently.
    """
    seen = set()
    for path in allowed_files:
        if path.is_file() and path not in seen:
            seen.add(path)
            yield path
    for directory in allowed_dirs:
        if not directory.exists():
            continue
        for path in sorted(directory.rglob("*")):
            if path.is_file() and path not in seen:
                seen.add(path)
                yield path
    for pattern in allowed_globs:
        for match in sorted(root.glob(pattern)):
            # `root` is absolute, so `root.glob()` returns absolute paths.
            # Those would be rejected downstream as "absolute_or_parent_path",
            # would never de-duplicate against the relative paths above, and
            # would never equal the relative names the archive expects.
            # Re-anchor each match on the root before yielding it.
            path = match.relative_to(root)
            if match.is_file() and path not in seen:
                seen.add(path)
                yield path
 def sha256_file(path: Path) -> str:
    """Return the hexadecimal SHA-256 digest of a file, read in 1 MiB pieces."""
    hasher = hashlib.sha256()
    piece_size = 1024 * 1024
    with path.open("rb") as stream:
        while True:
            piece = stream.read(piece_size)
            if not piece:
                break
            hasher.update(piece)
    return hasher.hexdigest()
 # Split the allowlisted files into those that will be bundled (with size and
 # checksum) and those rejected by the forbidden-path rules (with the reason).
 included = []
 excluded = []
 for path in iter_allowed_files():
    forbidden, reason = is_forbidden(path)
    if forbidden:
        excluded.append({"path": as_posix(path), "reason": reason})
        continue
    stat = path.stat()
    included.append({
        "path": as_posix(path),
        "bytes": stat.st_size,
        "sha256": sha256_file(path),
    })
 included_paths = sorted(item["path"] for item in included)
 # Stop before anything is written if the selection is incomplete or empty.
 missing_required = sorted(path for path in required_files if path not in included_paths)
 if missing_required:
    raise SystemExit(f"missing required bundle files: {missing_required}")
 if not included:
    raise SystemExit("bundle file list is empty")
 # NUL-delimited list consumed by `tar --null --files-from`.
 filelist_path.write_bytes(b"".join(path.encode("utf-8") + b"\0" for path in included_paths))
 # `dt.timezone.utc` is used instead of the `dt.UTC` alias because the alias
 # only exists on Python 3.11+; the two are the same object where both exist.
 created_at = dt.datetime.now(dt.timezone.utc).replace(microsecond=0).isoformat().replace("+00:00", "Z")
 # Preliminary manifest: tarball details and the final gate status are added by
 # the validation step that runs after the archive has been created.
 manifest = {
    "schema_name": "vps_deploy_bundle_manifest",
    "schema_version": 1,
    "created_at_utc": created_at,
    "timestamp": timestamp,
    "tarball_path": as_posix(tarball_path),
    "manifest_path": as_posix(manifest_path),
    "source_root": str(root),
    "bundle_intent": "Copy to a VPS and unpack under /opt/orderbooks; VPS execution remains pending.",
    "production_ready": False,
    "vps_deployed": False,
    "included_roots": [str(path) for path in allowed_files + allowed_dirs] + allowed_globs,
    "excluded_patterns": excluded_patterns,
    "required_files": required_files,
    "included_file_count": len(included),
    "included_files": included,
    "excluded_selected_files": excluded,
    "missing_required_files": missing_required,
    "validation": {
        "required_files_present_before_tar": not missing_required,
        "forbidden_paths_absent_before_tar": True,
        "tarball_validation_completed": False,
    },
 }
 manifest_path.write_text(json.dumps(manifest, indent=2, sort_keys=True) + "\n", encoding="utf-8")
 PY_BUNDLE_SELECT
 # Archive exactly the NUL-delimited paths written to FILELIST by the selection
 # step, recording owner and group as numeric 0 for every member.
 tar --create --gzip --file "${TARBALL}" --null --files-from "${FILELIST}" --owner=0 --group=0 --numeric-owner
 PYTHONDONTWRITEBYTECODE=1 python3 - "${TARBALL}" "${MANIFEST}" <<'PY_BUNDLE_VALIDATE'
 import hashlib
 import json
 import sys
 import tarfile
 from pathlib import Path
 # Positional arguments supplied by the wrapper script: tarball path, then manifest path.
 tarball_path = Path(sys.argv[1])
 manifest_path = Path(sys.argv[2])
 # Reload the manifest written before archiving; its required-file list is what
 # the archive contents are compared against.
 manifest = json.loads(manifest_path.read_text(encoding="utf-8"))
 required_files = set(manifest["required_files"])
 def sha256_file(path: Path) -> str:
    """Compute the hexadecimal SHA-256 of ``path``, streaming it 1 MiB at a time."""
    accumulator = hashlib.sha256()
    with path.open("rb") as source:
        while chunk := source.read(1024 * 1024):
            accumulator.update(chunk)
    return accumulator.hexdigest()
 def forbidden_reason(name: str) -> str | None:
    parts = name.split("/")
    lower_name = parts[-1].lower()
    if name.startswith("/") or any(part == ".." for part in parts):
        return "absolute_or_parent_path"
    if parts[0] in {".git", ".venv", "artifacts"}:
        return f"forbidden_top_level:{parts[0]}"
    if len(parts) >= 2 and parts[0] == "data" and parts[1] in {"soak_test", "live_sample", "normalized_sample"}:
        return f"forbidden_data_dir:data/{parts[1]}"
    if any(part in {".git", ".venv", "__pycache__", ".pytest_cache", ".mypy_cache", ".ruff_cache"} for part in parts):
        return "forbidden_cache_or_metadata_dir"
    if lower_name in {"rclone.conf", ".env", "id_rsa", "id_ed25519"}:
        return f"forbidden_name:{lower_name}"
    if any(lower_name.endswith(suffix) for suffix in (".pyc", ".pyo", ".pem", ".key", ".p12", ".pfx")):
        return "forbidden_suffix"
    if any(token in lower_name for token in ("mnemonic", "wallet", "credential", "secret")):
        return "secretish_name"
    return None
 # Inspect the finished archive: only regular-file members are considered, and
 # their names are sorted so the manifest output is stable.
 with tarfile.open(tarball_path, "r:gz") as archive:
    names = sorted(entry.name for entry in archive.getmembers() if entry.isfile())
 forbidden = []
 for name in names:
    reason = forbidden_reason(name)
    if reason:
        forbidden.append({"path": name, "reason": reason})
 missing_required = sorted(required_files.difference(names))
 # The validation record has the same shape on success and on failure; on
 # success both lists are empty and both flags are True.
 manifest["validation"].update({
    "tarball_validation_completed": True,
    "forbidden_paths_absent_in_tarball": not forbidden,
    "required_files_present_in_tarball": not missing_required,
    "forbidden_paths_in_tarball": forbidden,
    "missing_required_files_in_tarball": missing_required,
 })
 if forbidden or missing_required:
    # Persist the failure evidence before aborting; no gate status is recorded.
    manifest_path.write_text(json.dumps(manifest, indent=2, sort_keys=True) + "\n", encoding="utf-8")
    raise SystemExit(f"bundle validation failed forbidden={forbidden} missing_required={missing_required}")
 # Success: record the archive's size, checksum, and contents, then mark the gate as passed.
 manifest.update({
    "tarball_bytes": tarball_path.stat().st_size,
    "tarball_sha256": sha256_file(tarball_path),
    "tarball_content_count": len(names),
    "tarball_contents": names,
    "gate_status": "PASS",
 })
 manifest_path.write_text(json.dumps(manifest, indent=2, sort_keys=True) + "\n", encoding="utf-8")
 PY_BUNDLE_VALIDATE
 # Emit KEY=VALUE summary lines on stdout. The checksum, size, and file count
 # are read back from the manifest finalised by the validation step.
 printf 'BUNDLE_TARBALL=%s\n' "${TARBALL}"
 printf 'BUNDLE_MANIFEST=%s\n' "${MANIFEST}"
 python3 - <<'PY_PRINT' "${MANIFEST}"
 import json
 import sys
 from pathlib import Path
 m = json.loads(Path(sys.argv[1]).read_text(encoding="utf-8"))
 print(f"BUNDLE_SHA256={m['tarball_sha256']}")
 print(f"BUNDLE_BYTES={m['tarball_bytes']}")
 print(f"BUNDLE_FILE_COUNT={m['tarball_content_count']}")
 PY_PRINT
--- a/scripts/collect_polymarket_orderbooks.py
+++ b/scripts/collect_polymarket_orderbooks.py
@ -0,0 +1,668 @@
 #!/usr/bin/env python3
 """Minimal raw Polymarket order-book snapshot sample collector.
 Checkpoint 4 scope: finite sample run only. This script reads the BTC discovery
 artifact, fetches public CLOB batch order books for a small market set, writes
 raw gzip JSONL envelopes, and closes with a manifest. It is not a daemon and it
 does not trade.
 """
 from __future__ import annotations
 import argparse
 import datetime as dt
 import gzip
 import hashlib
 import json
 import signal
 import sys
 import time
 import urllib.error
 import urllib.request
 from pathlib import Path
 from typing import Any
 # Identity of this collector; stamped into every snapshot envelope and into the run manifest.
 COLLECTOR_NAME = "polymarket_orderbook_collector"
 COLLECTOR_VERSION = "0.1.0"
 # Name and version of the per-row envelope schema written to the JSONL output.
 SCHEMA_NAME = "raw_orderbook_snapshot"
 SCHEMA_VERSION = 1
 # Default batch order-book endpoint; can be overridden by the config file or --clob-books-url.
 CLOB_BOOKS_URL = "https://clob.polymarket.com/books"
 # Default input/output locations, used when neither the CLI nor the config file supplies a value.
 DEFAULT_CONFIG_PATH = Path("config/polymarket_collector.example.yaml")
 DEFAULT_DISCOVERY_PATH = Path("data/discovery/polymarket_btc_markets_latest.json")
 DEFAULT_OUTPUT_DIR = Path("data/live_sample")
 DEFAULT_MANIFEST_PATH = Path("data/manifests/orderbook_collector_sample_manifest.json")
 # Lower-cased response header names that filter_headers() keeps; every other header is dropped.
 SAFE_RESPONSE_HEADERS = {
    "cache-control",
    "cf-cache-status",
    "cf-ray",
    "content-length",
    "content-type",
    "date",
    "retry-after",
    "server",
    "x-ratelimit-limit",
    "x-ratelimit-remaining",
    "x-ratelimit-reset",
    "ratelimit-limit",
    "ratelimit-remaining",
    "ratelimit-reset",
 }
 # Shutdown state: written by handle_stop() and polled by the request and collection loops.
 STOP_REQUESTED = False
 STOP_SIGNAL: str | None = None
 def handle_stop(signum: int, _frame: Any) -> None:
    """Signal handler: flag a graceful stop and remember which signal asked for it."""
    global STOP_SIGNAL, STOP_REQUESTED
    # Raise the flag first so it is set even if the signal number cannot be named.
    STOP_REQUESTED = True
    received = signal.Signals(signum)
    STOP_SIGNAL = received.name
 def utc_now() -> dt.datetime:
    return dt.datetime.now(dt.UTC)
 def iso_z(value: dt.datetime | None = None) -> str:
    value = value or utc_now()
    return value.astimezone(dt.UTC).replace(microsecond=0).isoformat().replace("+00:00", "Z")
 def compact_timestamp(value: dt.datetime | None = None) -> str:
    value = value or utc_now()
    return value.astimezone(dt.UTC).strftime("%Y%m%dT%H%M%SZ")
 def parse_iso(value: Any) -> dt.datetime | None:
    if not isinstance(value, str) or not value.strip():
        return None
    text = value.strip()
    if text.endswith("Z"):
        text = text[:-1] + "+00:00"
    try:
        parsed = dt.datetime.fromisoformat(text)
    except ValueError:
        return None
    if parsed.tzinfo is None:
        parsed = parsed.replace(tzinfo=dt.UTC)
    return parsed.astimezone(dt.UTC)
 def sha256_file(path: Path) -> str:
    """Return the hex SHA-256 digest of a file, reading it in 1 MiB chunks."""
    hasher = hashlib.sha256()
    chunk_size = 1024 * 1024
    with path.open("rb") as stream:
        while block := stream.read(chunk_size):
            hasher.update(block)
    return hasher.hexdigest()
 def parse_scalar(value: str) -> Any:
    """Convert one flat-YAML scalar string into a Python value.
    Resolution order after stripping whitespace:
    empty text -> "", matching surrounding quotes -> inner text (no further
    typing), true/false -> bool, null/none -> None, then int, then float,
    otherwise the stripped text unchanged.
    """
    value = value.strip()
    if not value:
        return ""
    # Require both an opening and a closing quote: a lone quote character is an
    # unterminated string, not an empty quoted one, so it must not collapse to "".
    if len(value) >= 2 and value[0] in {"'", '"'} and value[-1] == value[0]:
        return value[1:-1]
    lower = value.lower()
    if lower in {"true", "false"}:
        return lower == "true"
    if lower in {"null", "none"}:
        return None
    try:
        return int(value)
    except ValueError:
        pass
    try:
        return float(value)
    except ValueError:
        return value
 def _strip_inline_comment(raw_line: str) -> str:
    """Return raw_line with any trailing YAML comment removed.
    A "#" starts a comment only at the start of the line or after whitespace,
    and never inside a quoted value that begins directly after the key's colon.
    """
    open_quote: str | None = None
    for index, char in enumerate(raw_line):
        if open_quote is not None:
            if char == open_quote:
                open_quote = None
            continue
        # Only a quote that opens the value (right after "key:") starts a quoted
        # span; an apostrophe in the middle of a plain value is ordinary text.
        if char in {"'", '"'} and raw_line[:index].rstrip().endswith(":"):
            open_quote = char
        elif char == "#" and (index == 0 or raw_line[index - 1].isspace()):
            return raw_line[:index]
    return raw_line
 def load_flat_yaml(path: Path) -> dict[str, Any]:
    """Parse the flat YAML subset used by the example config.
    Each non-blank, non-comment line must be ``key: value``; values are typed
    by parse_scalar(). A missing file yields an empty dict. When a key repeats,
    the last occurrence wins.
    Raises:
        ValueError: a line has no ":" separator or has an empty key.
    """
    config: dict[str, Any] = {}
    if not path.exists():
        return config
    for line_number, raw_line in enumerate(path.read_text(encoding="utf-8").splitlines(), 1):
        # Quote-aware stripping keeps values such as "https://host/path#frag" intact.
        line = _strip_inline_comment(raw_line).strip()
        if not line:
            continue
        if ":" not in line:
            raise ValueError(f"Unsupported config line {line_number}: {raw_line}")
        key, value = line.split(":", 1)
        key = key.strip()
        if not key:
            raise ValueError(f"Missing config key on line {line_number}")
        config[key] = parse_scalar(value)
    return config
 def config_digest(path: Path | None) -> str | None:
    """Return the SHA-256 of the config file, or None when no such file exists."""
    if path is not None and path.exists():
        return sha256_file(path)
    return None
 def filter_headers(headers: Any) -> dict[str, str]:
    """Keep only response headers whose lower-cased name is in SAFE_RESPONSE_HEADERS.
    Header names are returned with the casing they arrived in.
    """
    return {
        name: content
        for name, content in dict(headers).items()
        if name.lower() in SAFE_RESPONSE_HEADERS
    }
 def http_post_json(
    *,
    url: str,
    json_body: Any,
    timeout_seconds: float,
    max_retries: int,
    backoff_seconds: float,
 ) -> dict[str, Any]:
    """POST a JSON body and return a record of the request, response and attempts.
    Up to ``max_retries + 1`` attempts are made. Only HTTP 429 and 5xx responses
    are retried; transport errors (no status code) and other statuses end the
    loop immediately. The delay between attempts is exponential
    (``backoff_seconds * 2**attempt``) and is raised to a numeric Retry-After
    value when the server sends one. The wait is abandoned as soon as a stop
    signal has been recorded in STOP_REQUESTED.
    Returns:
        A dict with ``request``, ``response`` (state of the final attempt),
        ``attempts`` (one entry per attempt), ``duration_ms`` (sum over
        attempts) and ``ok`` (final status is 2xx and the body, if any, parsed
        as JSON).
    """
    body_bytes = json.dumps(json_body, separators=(",", ":")).encode("utf-8")
    attempts: list[dict[str, Any]] = []
    final_json: Any | None = None
    final_text_preview: str | None = None
    final_json_error: str | None = None
    final_status_code: int | None = None
    final_headers: dict[str, str] = {}
    for attempt_index in range(max_retries + 1):
        started_at = iso_z()
        started_monotonic = time.monotonic()
        status_code: int | None = None
        response_headers: dict[str, str] = {}
        response_text = ""
        error: str | None = None
        try:
            request = urllib.request.Request(
                url,
                data=body_bytes,
                headers={
                    "Accept": "application/json",
                    "Content-Type": "application/json",
                    "User-Agent": "orderbooks-checkpoint-4-sample/0.1.0",
                },
                method="POST",
            )
            with urllib.request.urlopen(request, timeout=timeout_seconds) as response:
                status_code = response.status
                response_headers = filter_headers(response.headers)
                response_text = response.read().decode("utf-8", errors="replace")
        except urllib.error.HTTPError as exc:
            # Non-2xx responses still carry a status, headers and body worth recording.
            status_code = exc.code
            response_headers = filter_headers(exc.headers)
            response_text = exc.read().decode("utf-8", errors="replace")
            error = f"HTTPError: {exc}"
        except Exception as exc:  # noqa: BLE001 - preserve request failure evidence
            error = f"{type(exc).__name__}: {exc}"
        duration_ms = round((time.monotonic() - started_monotonic) * 1000, 3)
        parsed_json = None
        json_error = None
        if response_text:
            try:
                parsed_json = json.loads(response_text)
            except json.JSONDecodeError as exc:
                json_error = str(exc)
        attempts.append(
            {
                "attempt": attempt_index + 1,
                "started_at_utc": started_at,
                "ended_at_utc": iso_z(),
                "duration_ms": duration_ms,
                "status_code": status_code,
                "headers": response_headers,
                "error": error,
                "json_error": json_error,
            }
        )
        final_json = parsed_json
        final_json_error = json_error
        # Keep a bounded preview of the body only when it could not be parsed.
        final_text_preview = response_text[:1000] if parsed_json is None else None
        final_status_code = status_code
        final_headers = response_headers
        retryable = status_code == 429 or (status_code is not None and 500 <= status_code <= 599)
        if error is None and status_code is not None and 200 <= status_code < 300:
            break
        if not retryable or attempt_index >= max_retries or STOP_REQUESTED:
            break
        # filter_headers() preserves the server's casing, so match the name case-insensitively.
        retry_after = next(
            (content for name, content in response_headers.items() if name.lower() == "retry-after"),
            None,
        )
        sleep_seconds = backoff_seconds * (2**attempt_index)
        if retry_after:
            try:
                retry_after_seconds = float(retry_after)
            except ValueError:
                # The HTTP-date form of Retry-After is not interpreted; keep the backoff delay.
                retry_after_seconds = None
            # The "< inf" comparison also rejects NaN, so only finite delays are honoured.
            if retry_after_seconds is not None and retry_after_seconds < float("inf"):
                sleep_seconds = max(sleep_seconds, retry_after_seconds)
        # Wait in slices of at most one second so a stop signal is noticed promptly;
        # time.sleep() resumes after a signal handler returns instead of ending early.
        sleep_deadline = time.monotonic() + sleep_seconds
        while not STOP_REQUESTED:
            remaining = sleep_deadline - time.monotonic()
            if remaining <= 0:
                break
            time.sleep(min(remaining, 1.0))
        if STOP_REQUESTED:
            break
    return {
        "request": {
            "method": "POST",
            "url": url,
            "json_body": json_body,
        },
        "response": {
            "status_code": final_status_code,
            "headers": final_headers,
            "json": final_json,
            "json_error": final_json_error,
            "text_preview": final_text_preview,
        },
        "attempts": attempts,
        "duration_ms": round(sum(attempt["duration_ms"] for attempt in attempts), 3),
        "ok": final_status_code is not None and 200 <= final_status_code < 300 and final_json_error is None,
    }
 def load_discovery(path: Path) -> dict[str, Any]:
    """Read the discovery artifact at ``path`` and return its decoded JSON content."""
    with path.open("r", encoding="utf-8") as stream:
        return json.load(stream)
 def market_is_usable(market: dict[str, Any], now: dt.datetime, safety_seconds: int) -> tuple[bool, list[str]]:
    """Decide whether a discovered market can be sampled, listing every reason it cannot.
    A market is usable when it is active, not closed, accepting orders, has its
    order book enabled, ends more than ``safety_seconds`` after ``now``, and its
    first two tokens are objects with outcomes "Up" then "Down" and non-empty
    token ids.
    Returns:
        ``(usable, reasons)`` where ``usable`` is True exactly when ``reasons``
        is empty.
    """
    reasons: list[str] = []
    # Flags must be the exact booleans; missing or non-boolean values are rejected.
    if market.get("active") is not True:
        reasons.append("not_active")
    if market.get("closed") is not False:
        reasons.append("closed")
    if market.get("accepting_orders") is not True:
        reasons.append("not_accepting_orders")
    if market.get("enable_order_book") is not True:
        reasons.append("order_book_not_enabled")
    end_time = parse_iso(market.get("end_time_utc"))
    if end_time is None:
        reasons.append("missing_end_time")
    elif end_time <= now + dt.timedelta(seconds=safety_seconds):
        reasons.append("too_close_to_end_or_expired")
    tokens = market.get("tokens")
    if not isinstance(tokens, list) or len(tokens) < 2:
        reasons.append("missing_two_tokens")
    else:
        # Validate exactly the two entries that flatten_tokens() will read
        # (tokens[:2]). Filtering non-dict entries before slicing would let a
        # malformed leading entry through and crash the later .get() calls.
        first_two = tokens[:2]
        if not all(isinstance(token, dict) for token in first_two):
            reasons.append("bad_up_down_token_mapping")
        else:
            outcomes = [token.get("outcome") for token in first_two]
            token_ids = [token.get("token_id") for token in first_two]
            if outcomes != ["Up", "Down"] or not all(token_ids):
                reasons.append("bad_up_down_token_mapping")
    return not reasons, reasons
 def select_markets(
    discovery: dict[str, Any],
    *,
    market_limit: int,
    market_end_safety_seconds: int,
 ) -> tuple[list[dict[str, Any]], dict[str, int]]:
    """Pick the first ``market_limit`` usable markets from the discovery payload.
    Returns the chosen markets in discovery order together with a
    reason -> count tally (sorted by reason) for the markets rejected before
    the limit was reached.
    """
    reference_time = utc_now()
    accepted: list[dict[str, Any]] = []
    tally: dict[str, int] = {}
    def bump(reason: str) -> None:
        tally[reason] = tally.get(reason, 0) + 1
    for candidate in discovery.get("normalized_markets") or []:
        if not isinstance(candidate, dict):
            bump("not_object")
            continue
        usable, reasons = market_is_usable(candidate, reference_time, market_end_safety_seconds)
        if usable:
            accepted.append(candidate)
            # Stop scanning once enough markets are selected; later entries are not tallied.
            if len(accepted) >= market_limit:
                break
        else:
            for reason in reasons:
                bump(reason)
    return accepted, dict(sorted(tally.items()))
 def flatten_tokens(markets: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Expand each market into one record per token, using its first two tokens only.
    Every record carries the parent market's identifying fields and the token
    id converted to a string.
    """
    return [
        {
            "market_name": parent.get("market_name"),
            "market_slug": parent.get("market_slug"),
            "condition_id": parent.get("condition_id"),
            "token_id": str(entry.get("token_id")),
            "outcome": entry.get("outcome"),
            "market_end_time_utc": parent.get("end_time_utc"),
        }
        for parent in markets
        for entry in parent.get("tokens", [])[:2]
    ]
 def build_snapshot_envelope(
    *,
    raw_book: dict[str, Any],
    token_meta: dict[str, Any],
    collected_at_utc: str,
    sequence: int,
    request_record: dict[str, Any],
    response_index: int,
 ) -> dict[str, Any]:
    """Wrap one raw order book in the envelope written as a JSONL row.
    The envelope records schema and collector identity, the market/token
    metadata, when and in which position the book was collected, the request
    that produced it, and the untouched book under ``raw``.
    """
    sent = request_record["request"]
    market_fields = (
        "market_name",
        "market_slug",
        "condition_id",
        "token_id",
        "outcome",
        "market_end_time_utc",
    )
    collector_section = {"name": COLLECTOR_NAME, "version": COLLECTOR_VERSION}
    market_section = {field: token_meta.get(field) for field in market_fields}
    collection_section = {
        "collected_at_utc": collected_at_utc,
        "sequence": sequence,
        "response_index": response_index,
    }
    request_section = {
        "method": sent["method"],
        "url": sent["url"],
        "params": None,
        "json_body": sent["json_body"],
        "status_code": request_record["response"]["status_code"],
        "duration_ms": request_record["duration_ms"],
        "attempts": request_record["attempts"],
    }
    return {
        "schema_name": SCHEMA_NAME,
        "schema_version": SCHEMA_VERSION,
        "collector": collector_section,
        "market": market_section,
        "collection": collection_section,
        "request": request_section,
        "raw": raw_book,
    }
 def summarize_output_file(path: Path, rows_written: int) -> dict[str, Any]:
    """Describe the output file for the manifest: path, status, size, rows and hash.
    Status is "valid" only for an existing, non-empty file; anything else is
    reported as "missing".
    """
    present = path.exists()
    size = path.stat().st_size if present else 0
    status = "valid" if present and size > 0 else "missing"
    digest = sha256_file(path) if present else None
    return {
        "path": path.as_posix(),
        "status": status,
        "bytes": size,
        "rows": rows_written,
        "sha256": digest,
    }
 def write_manifest(path: Path, manifest: dict[str, Any]) -> None:
    """Write the manifest as indented, key-sorted JSON, creating parent directories."""
    path.parent.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(manifest, indent=2, sort_keys=True)
    path.write_text(f"{serialized}\n", encoding="utf-8")
 def config_value(config: dict[str, Any], args: argparse.Namespace, key: str, default: Any) -> Any:
    """Resolve one setting: CLI value if given, else the config file's, else ``default``."""
    override = getattr(args, key)
    return config.get(key, default) if override is None else override
 def build_runtime_config(args: argparse.Namespace) -> dict[str, Any]:
    """Merge CLI arguments, the flat YAML config and built-in defaults.
    Precedence per setting is CLI value, then config file value, then default
    (see config_value()). The result also records the config path, its SHA-256
    and the parsed file content for the manifest.
    Raises:
        ValueError: ``market_limit`` < 1, ``interval_seconds`` <= 0,
            ``duration_seconds`` <= 0, ``request_timeout_seconds`` <= 0,
            ``max_retries`` < 0 or ``backoff_seconds`` < 0.
    """
    config_path = args.config
    file_config = load_flat_yaml(config_path) if config_path else {}
    runtime = {
        "discovery_path": Path(config_value(file_config, args, "discovery_path", DEFAULT_DISCOVERY_PATH)),
        "output_dir": Path(config_value(file_config, args, "output_dir", DEFAULT_OUTPUT_DIR)),
        "manifest_path": Path(config_value(file_config, args, "manifest_path", DEFAULT_MANIFEST_PATH)),
        "market_limit": int(config_value(file_config, args, "market_limit", 2)),
        "interval_seconds": float(config_value(file_config, args, "interval_seconds", 30.0)),
        "duration_seconds": float(config_value(file_config, args, "duration_seconds", 300.0)),
        "request_timeout_seconds": float(config_value(file_config, args, "request_timeout_seconds", 15.0)),
        "max_retries": int(config_value(file_config, args, "max_retries", 2)),
        "backoff_seconds": float(config_value(file_config, args, "backoff_seconds", 2.0)),
        "market_end_safety_seconds": int(config_value(file_config, args, "market_end_safety_seconds", 420)),
        "clob_books_url": str(config_value(file_config, args, "clob_books_url", CLOB_BOOKS_URL)),
        "config_path": config_path,
        "config_sha256": config_digest(config_path),
        "config_snapshot": file_config,
    }
    if runtime["market_limit"] < 1:
        raise ValueError("market_limit must be >= 1")
    if runtime["interval_seconds"] <= 0:
        raise ValueError("interval_seconds must be > 0")
    if runtime["duration_seconds"] <= 0:
        raise ValueError("duration_seconds must be > 0")
    # A non-positive timeout makes every request fail before a response can arrive.
    if runtime["request_timeout_seconds"] <= 0:
        raise ValueError("request_timeout_seconds must be > 0")
    # http_post_json() iterates range(max_retries + 1): a negative value would
    # silently send no request at all.
    if runtime["max_retries"] < 0:
        raise ValueError("max_retries must be >= 0")
    # A negative backoff would reach time.sleep() and raise in the middle of a run.
    if runtime["backoff_seconds"] < 0:
        raise ValueError("backoff_seconds must be >= 0")
    return runtime
 def run_collection(runtime: dict[str, Any], command: str) -> tuple[dict[str, Any], Path]:
    """Run one bounded sampling session and build its manifest.
    Installs SIGINT/SIGTERM handlers, selects markets from the discovery file,
    then polls the batch books endpoint every ``interval_seconds`` until
    ``duration_seconds`` elapse or a stop signal arrives. Each returned book
    for a tracked token is written as one gzip JSONL row.
    Gate status in the manifest:
        BLOCKED - no usable tokens were found;
        FAIL    - tokens were tracked but at least one has no rows
                  (including the case where no request was made);
        PASS    - every tracked token has at least one row.
    Args:
        runtime: settings produced by build_runtime_config().
        command: the invoking command line, recorded verbatim in the manifest.
    Returns:
        ``(manifest, output_file)``; the manifest is not written to disk here.
    """
    signal.signal(signal.SIGINT, handle_stop)
    signal.signal(signal.SIGTERM, handle_stop)
    started = utc_now()
    started_at_utc = iso_z(started)
    discovery_path: Path = runtime["discovery_path"]
    discovery = load_discovery(discovery_path)
    selected_markets, rejection_counts = select_markets(
        discovery,
        market_limit=runtime["market_limit"],
        market_end_safety_seconds=runtime["market_end_safety_seconds"],
    )
    warnings: list[str] = []
    failures: list[dict[str, Any]] = []
    if not selected_markets:
        warnings.append("No usable active BTC markets found in discovery input.")
    tokens = flatten_tokens(selected_markets)
    # One directory and one output file per run, keyed by the start timestamp.
    run_id = compact_timestamp(started)
    output_dir = runtime["output_dir"] / "polymarket" / "orderbooks" / run_id
    output_dir.mkdir(parents=True, exist_ok=True)
    output_file = output_dir / f"polymarket_orderbooks_{run_id}.jsonl.gz"
    request_count = 0
    success_count = 0
    failure_count = 0
    status_code_counts: dict[str, int] = {}
    rows_written = 0
    sequence = 0
    token_row_counts = {token["token_id"]: 0 for token in tokens}
    deadline = time.monotonic() + runtime["duration_seconds"]
    token_by_id = {token["token_id"]: token for token in tokens}
    # The same batch body (all tracked token ids) is sent on every poll.
    request_body = [{"token_id": token["token_id"]} for token in tokens]
    with gzip.open(output_file, "wt", encoding="utf-8") as handle:
        while tokens and not STOP_REQUESTED and time.monotonic() < deadline:
            loop_started = time.monotonic()
            collected_at_utc = iso_z()
            request_count += 1
            request_record = http_post_json(
                url=runtime["clob_books_url"],
                json_body=request_body,
                timeout_seconds=runtime["request_timeout_seconds"],
                max_retries=runtime["max_retries"],
                backoff_seconds=runtime["backoff_seconds"],
            )
            status_code = request_record["response"]["status_code"]
            status_key = str(status_code)
            status_code_counts[status_key] = status_code_counts.get(status_key, 0) + 1
            if request_record["ok"] and isinstance(request_record["response"]["json"], list):
                success_count += 1
                for response_index, raw_book in enumerate(request_record["response"]["json"]):
                    if not isinstance(raw_book, dict):
                        failure_count += 1
                        failures.append(
                            {
                                "collected_at_utc": collected_at_utc,
                                "reason": "book_response_item_not_object",
                                "response_index": response_index,
                            }
                        )
                        continue
                    # Books are matched back to tracked tokens by their asset_id.
                    asset_id = str(raw_book.get("asset_id") or "")
                    token_meta = token_by_id.get(asset_id)
                    if token_meta is None:
                        failure_count += 1
                        failures.append(
                            {
                                "collected_at_utc": collected_at_utc,
                                "reason": "unknown_asset_id_in_book_response",
                                "asset_id": asset_id,
                            }
                        )
                        continue
                    sequence += 1
                    envelope = build_snapshot_envelope(
                        raw_book=raw_book,
                        token_meta=token_meta,
                        collected_at_utc=collected_at_utc,
                        sequence=sequence,
                        request_record=request_record,
                        response_index=response_index,
                    )
                    handle.write(json.dumps(envelope, separators=(",", ":"), sort_keys=True) + "\n")
                    rows_written += 1
                    token_row_counts[asset_id] = token_row_counts.get(asset_id, 0) + 1
                # Flush after every successful poll so rows are handed to the gzip stream promptly.
                handle.flush()
            else:
                failure_count += 1
                failures.append(
                    {
                        "collected_at_utc": collected_at_utc,
                        "reason": "request_failed_or_non_json_list",
                        "status_code": status_code,
                        "attempts": request_record["attempts"],
                        "json_error": request_record["response"]["json_error"],
                        "text_preview": request_record["response"]["text_preview"],
                    }
                )
            # Sleep out the rest of the interval in slices of at most one second
            # so a stop signal or the run deadline is noticed promptly.
            remaining_interval = runtime["interval_seconds"] - (time.monotonic() - loop_started)
            while remaining_interval > 0 and not STOP_REQUESTED and time.monotonic() < deadline:
                sleep_for = min(remaining_interval, deadline - time.monotonic(), 1.0)
                if sleep_for <= 0:
                    break
                time.sleep(sleep_for)
                remaining_interval = runtime["interval_seconds"] - (time.monotonic() - loop_started)
    ended = utc_now()
    ended_at_utc = iso_z(ended)
    duration_seconds_actual = round((ended - started).total_seconds(), 3)
    if STOP_REQUESTED:
        warnings.append(f"Graceful shutdown requested by {STOP_SIGNAL}.")
    if runtime["duration_seconds"] < 300:
        warnings.append("Configured run duration was shorter than the roadmap 5-minute sample target.")
    output_summary = summarize_output_file(output_file, rows_written)
    gate_status = "PASS" if rows_written > 0 and all(count > 0 for count in token_row_counts.values()) else "FAIL"
    if not tokens:
        gate_status = "BLOCKED"
    if request_count == 0:
        gate_status = "FAIL" if tokens else "BLOCKED"
    manifest = {
        "schema_name": "orderbook_collector_sample_manifest",
        "schema_version": 1,
        "checkpoint_id": 4,
        "checkpoint_name": "Minimal Orderbook Snapshot Collector",
        "gate_status": gate_status,
        "collector": {
            "name": COLLECTOR_NAME,
            "version": COLLECTOR_VERSION,
        },
        "started_at_utc": started_at_utc,
        "ended_at_utc": ended_at_utc,
        "run_duration_seconds": duration_seconds_actual,
        "configured_duration_seconds": runtime["duration_seconds"],
        "interval_seconds": runtime["interval_seconds"],
        "command": command,
        "config": {
            "path": runtime["config_path"].as_posix() if runtime["config_path"] else None,
            "sha256": runtime["config_sha256"],
            "snapshot": runtime["config_snapshot"],
            "effective": {
                "discovery_path": discovery_path.as_posix(),
                "output_dir": runtime["output_dir"].as_posix(),
                "manifest_path": runtime["manifest_path"].as_posix(),
                "market_limit": runtime["market_limit"],
                "interval_seconds": runtime["interval_seconds"],
                "duration_seconds": runtime["duration_seconds"],
                "request_timeout_seconds": runtime["request_timeout_seconds"],
                "max_retries": runtime["max_retries"],
                "backoff_seconds": runtime["backoff_seconds"],
                "market_end_safety_seconds": runtime["market_end_safety_seconds"],
                "clob_books_url": runtime["clob_books_url"],
            },
        },
        "discovery": {
            "path": discovery_path.as_posix(),
            "fetched_at_utc": discovery.get("fetched_at_utc"),
            "source_summary": discovery.get("summary"),
            "rejection_counts_before_selection": rejection_counts,
        },
        "markets_tracked": [
            {
                "market_name": market.get("market_name"),
                "market_slug": market.get("market_slug"),
                "condition_id": market.get("condition_id"),
                "end_time_utc": market.get("end_time_utc"),
            }
            for market in selected_markets
        ],
        "tokens_tracked": tokens,
        "request_count": request_count,
        "success_count": success_count,
        "failure_count": failure_count,
        "status_code_counts": dict(sorted(status_code_counts.items())),
        "rows_written": rows_written,
        "token_row_counts": token_row_counts,
        "output_files": [output_summary],
        "failures": failures,
        "warnings": warnings,
        "known_gaps": [
            "This is a short run-rotated sample, not a daemon.",
            "Hourly rotation is documented but not implemented in this checkpoint.",
            "No websocket capture, normalization, upload, systemd unit, dashboard, database, or trading behavior is included.",
            "A 5-minute sample proves file-writing behavior only; it does not prove 24/7 reliability.",
        ],
        "fake_progress_risk": "A small successful sample can still hide long-run gaps, stale discovery, endpoint schema drift, and missed intervals. Reliability remains gated on the future 24h soak test.",
        "next_step": "Checkpoint 5 should normalize this raw sample while preserving raw file references, or rerun a fresh short sample if the orchestrator wants more raw evidence first.",
    }
    return manifest, output_file
 def parse_args() -> argparse.Namespace:
    """Parse the collector's command line.
    ``--config`` defaults to the example config path; every other option
    defaults to None so that config_value() can tell "not given" apart from an
    explicit override.
    """
    parser = argparse.ArgumentParser(
        description="Collect a bounded raw gzip JSONL sample of Polymarket BTC order books."
    )
    parser.add_argument("--config", type=Path, default=DEFAULT_CONFIG_PATH)
    # (flag, converter) pairs for the optional overrides, in help-output order.
    override_flags = (
        ("--discovery-path", Path),
        ("--output-dir", Path),
        ("--manifest-path", Path),
        ("--market-limit", int),
        ("--interval-seconds", float),
        ("--duration-seconds", float),
        ("--request-timeout-seconds", float),
        ("--max-retries", int),
        ("--backoff-seconds", float),
        ("--market-end-safety-seconds", int),
        ("--clob-books-url", str),
    )
    for flag, converter in override_flags:
        parser.add_argument(flag, type=converter, default=None)
    return parser.parse_args()
 def main() -> int:
    """Run one sample collection, persist its manifest and print a JSON summary.
    Returns 0 when the manifest's gate status is "PASS", otherwise 1.
    """
    args = parse_args()
    invocation = [Path(sys.argv[0]).as_posix()] + sys.argv[1:]
    runtime = build_runtime_config(args)
    manifest, output_file = run_collection(runtime, " ".join(invocation))
    manifest_path = runtime["manifest_path"]
    write_manifest(manifest_path, manifest)
    # Fields copied straight from the manifest into the printed summary.
    copied_fields = (
        "gate_status",
        "markets_tracked",
        "request_count",
        "success_count",
        "failure_count",
        "rows_written",
        "warnings",
    )
    summary = {field: manifest[field] for field in copied_fields}
    summary["manifest_path"] = manifest_path.as_posix()
    summary["output_file"] = output_file.as_posix()
    summary["tokens_tracked"] = len(manifest["tokens_tracked"])
    print(json.dumps(summary, indent=2, sort_keys=True))
    if manifest["gate_status"] == "PASS":
        return 0
    return 1
 # Run the collector when executed as a script; main()'s return value becomes the exit status.
 if __name__ == "__main__":
    raise SystemExit(main())
--- a/scripts/deploy/bootstrap_orderbooks_k8s.sh
+++ b/scripts/deploy/bootstrap_orderbooks_k8s.sh
@ -0,0 +1,146 @@
 #!/usr/bin/env bash
 # Abort on any failing command, on use of an unset variable, and on a failure in any pipeline stage.
 set -euo pipefail
 # Repository root: two directories above the directory containing this script.
 ROOT_DIR="$(cd "$(dirname "$0")/../.." && pwd)"
 # Platform repo checkout plus the env files and kubeconfigs inside it; each can be overridden from the environment.
 PLATFORM_REPO_DIR="${PLATFORM_REPO_DIR:-/home/philipp/dev/ae/nuri/unrip3}"
 PLATFORM_ENV_FILE="${PLATFORM_ENV_FILE:-$PLATFORM_REPO_DIR/scripts/hetzner/bootstrap-secrets.env}"
 PLATFORM_RESOLVED_ENV_FILE="${PLATFORM_RESOLVED_ENV_FILE:-$PLATFORM_REPO_DIR/.state/hetzner/bootstrap-secrets.resolved.env}"
 KUBECONFIG_PATH="${KUBECONFIG_PATH:-$PLATFORM_REPO_DIR/.state/hetzner/kubeconfig.yaml}"
 CI_KUBECONFIG_PATH="${CI_KUBECONFIG_PATH:-$PLATFORM_REPO_DIR/.state/hetzner/kubeconfig.incluster.yaml}"
 # Initial app identity defaults. NOTE: these nine variables are assigned again from the
 # ORDERBOOKS_* variables after the platform env files are loaded, so values set here
 # (or inherited from the environment under these names) do not survive to the deploy steps.
 PROJECT_NAME="${PROJECT_NAME:-orderbooks}"
 PROJECT_NAMESPACE="${PROJECT_NAMESPACE:-orderbooks}"
 PROJECT_DEPLOYMENTS="${PROJECT_DEPLOYMENTS:-orderbooks-collector}"
 PROJECT_REGISTRY_SECRET_NAME="${PROJECT_REGISTRY_SECRET_NAME:-orderbooks-registry-creds}"
 RCLONE_SECRET_NAME="${RCLONE_SECRET_NAME:-orderbooks-rclone-config}"
 RCLONE_SECRET_KEY="${RCLONE_SECRET_KEY:-rclone.conf}"
 FORGEJO_REPO_OWNER="${FORGEJO_REPO_OWNER:-philipp}"
 FORGEJO_REPO_NAME="${FORGEJO_REPO_NAME:-orderbooks}"
 FORGEJO_REPO_PRIVATE="${FORGEJO_REPO_PRIVATE:-0}"
 # require CMD: exit the script with status 1 and a message on stderr unless CMD is on PATH.
 require() {
  if ! command -v "$1" >/dev/null 2>&1; then
    echo "missing required command: $1" >&2
    exit 1
  fi
 }
 # load_env_defaults FILE: export KEY=VALUE pairs from FILE as defaults.
 # - A missing FILE is not an error.
 # - Blank lines, "#" comments and lines without "=" are ignored; a leading "export " is accepted.
 # - One pair of matching surrounding quotes is removed from the value.
 # - Variables already present in the process environment are left untouched.
 # - Keys that are not valid shell identifiers are skipped, because the generated
 #   text is passed to eval and an unchecked key could inject shell commands.
 load_env_defaults() {
  local file="$1" exports
  [[ -f "$file" ]] || return 0
  # Capture first, eval second: a failing python3 now fails this assignment and
  # trips `set -e` instead of being swallowed by eval of partial output.
  exports="$(
    python3 - "$file" <<'PY_LOAD_ENV'
 import os
 import re
 import shlex
 import sys
 VALID_KEY = re.compile(r'[A-Za-z_][A-Za-z0-9_]*')
 for line_number, raw in enumerate(open(sys.argv[1], 'r', encoding='utf-8'), 1):
    line = raw.strip()
    if not line or line.startswith('#'):
        continue
    if line.startswith('export '):
        line = line[len('export '):]
    if '=' not in line:
        continue
    key, value = line.split('=', 1)
    key = key.strip()
    value = value.strip()
    if not VALID_KEY.fullmatch(key):
        # Report the position only; the line may contain a secret.
        print(f'skipping invalid variable name on line {line_number} of {sys.argv[1]}', file=sys.stderr)
        continue
    if len(value) >= 2 and value[0] == value[-1] and value[0] in {'\"', "'"}:
        value = value[1:-1]
    if key in os.environ:
        continue
    print(f'export {key}={shlex.quote(value)}')
 PY_LOAD_ENV
  )"
  eval "$exports"
 }
 # Fail fast when a required CLI tool is not on PATH.
 require kubectl
 require python3
 require base64
 # Import platform settings from both env files; variables already exported are not overridden.
 load_env_defaults "$PLATFORM_ENV_FILE"
 load_env_defaults "$PLATFORM_RESOLVED_ENV_FILE"
 # Force orderbooks app identity after loading platform defaults. The platform
 # env file may describe the platform repo itself, not this app repo.
 # Only the ORDERBOOKS_* variables can change these values from here on.
 PROJECT_NAME="${ORDERBOOKS_PROJECT_NAME:-orderbooks}"
 PROJECT_NAMESPACE="${ORDERBOOKS_PROJECT_NAMESPACE:-orderbooks}"
 PROJECT_DEPLOYMENTS="${ORDERBOOKS_PROJECT_DEPLOYMENTS:-orderbooks-collector}"
 PROJECT_REGISTRY_SECRET_NAME="${ORDERBOOKS_PROJECT_REGISTRY_SECRET_NAME:-orderbooks-registry-creds}"
 RCLONE_SECRET_NAME="${ORDERBOOKS_RCLONE_SECRET_NAME:-orderbooks-rclone-config}"
 RCLONE_SECRET_KEY="${ORDERBOOKS_RCLONE_SECRET_KEY:-rclone.conf}"
 FORGEJO_REPO_OWNER="${ORDERBOOKS_FORGEJO_REPO_OWNER:-philipp}"
 FORGEJO_REPO_NAME="${ORDERBOOKS_FORGEJO_REPO_NAME:-orderbooks}"
 FORGEJO_REPO_PRIVATE="${ORDERBOOKS_FORGEJO_REPO_PRIVATE:-0}"
 # Both kubeconfig paths must be non-empty and point at existing files.
 : "${KUBECONFIG_PATH:?missing kubeconfig path}"
 : "${CI_KUBECONFIG_PATH:?missing CI kubeconfig path}"
 [[ -f "$KUBECONFIG_PATH" ]] || { echo "missing kubeconfig file" >&2; exit 1; }
 [[ -f "$CI_KUBECONFIG_PATH" ]] || { echo "missing in-cluster kubeconfig file" >&2; exit 1; }
 # kubectl calls below use KUBECONFIG_PATH; CI_KUBECONFIG_PATH is only handed to the Forgejo bootstrap.
 export KUBECONFIG="$KUBECONFIG_PATH"
 # Resolve the Forgejo base URL: FORGEJO_URL, else FORGEJO_ROOT_URL, else https://FORGEJO_DOMAIN.
 if [[ -z "${FORGEJO_URL:-}" ]]; then
  if [[ -n "${FORGEJO_ROOT_URL:-}" ]]; then
    FORGEJO_URL="$FORGEJO_ROOT_URL"
  elif [[ -n "${FORGEJO_DOMAIN:-}" ]]; then
    FORGEJO_URL="https://${FORGEJO_DOMAIN}"
  else
    echo "missing Forgejo URL" >&2
    exit 1
  fi
 fi
 # The admin username is always required; the admin password only when no FORGEJO_TOKEN is set.
 : "${FORGEJO_ADMIN_USERNAME:?missing Forgejo admin username}"
 if [[ -z "${FORGEJO_TOKEN:-}" ]]; then
  : "${FORGEJO_ADMIN_PASSWORD:?missing Forgejo password or token}"
 fi
 # Resolve the registry host: REGISTRY_HOST, else REGISTRY_DOMAIN.
 if [[ -z "${REGISTRY_HOST:-}" ]]; then
  if [[ -n "${REGISTRY_DOMAIN:-}" ]]; then
    REGISTRY_HOST="$REGISTRY_DOMAIN"
  else
    echo "missing registry host" >&2
    exit 1
  fi
 fi
 : "${REGISTRY_USERNAME:?missing registry username}"
 : "${REGISTRY_PASSWORD:?missing registry password}"
 # Render each object client-side and pipe it to `kubectl apply`, so a re-run
 # updates the existing namespace/secret rather than failing on "already exists".
 echo "ensuring namespace ${PROJECT_NAMESPACE}"
 kubectl create namespace "$PROJECT_NAMESPACE" --dry-run=client -o yaml | kubectl apply -f -
 echo "upserting registry secret ${PROJECT_REGISTRY_SECRET_NAME}"
 kubectl -n "$PROJECT_NAMESPACE" create secret docker-registry "$PROJECT_REGISTRY_SECRET_NAME" \
  --docker-server="$REGISTRY_HOST" \
  --docker-username="$REGISTRY_USERNAME" \
  --docker-password="$REGISTRY_PASSWORD" \
  --dry-run=client -o yaml | kubectl apply -f -
 # Prints rclone_secret_key_present or rclone_secret_key_missing without printing the value.
 # A missing key is only reported; a failing `kubectl get` (e.g. the secret does not exist)
 # aborts the script through `set -e`.
 echo "checking rclone secret key presence"
 kubectl -n "$PROJECT_NAMESPACE" get secret "$RCLONE_SECRET_NAME" \
  -o "go-template={{if index .data \"${RCLONE_SECRET_KEY}\"}}rclone_secret_key_present{{else}}rclone_secret_key_missing{{end}}{{\"\\n\"}}"
 echo "upserting Forgejo repo and Actions settings"
 # Authenticate with the token when one is set, otherwise with the admin username/password.
 forgejo_args=()
 if [[ -n "${FORGEJO_TOKEN:-}" ]]; then
  forgejo_args+=(--token "$FORGEJO_TOKEN")
 else
  forgejo_args+=(--admin-username "$FORGEJO_ADMIN_USERNAME" --admin-password "$FORGEJO_ADMIN_PASSWORD")
 fi
 # Only the literal values "1" and "true" request a private repository.
 if [[ "$FORGEJO_REPO_PRIVATE" == "1" || "$FORGEJO_REPO_PRIVATE" == "true" ]]; then
  forgejo_args+=(--repo-private)
 fi
 python3 "$ROOT_DIR/scripts/deploy/forgejo_repo_bootstrap.py" \
  --forgejo-url "$FORGEJO_URL" \
  --repo-owner "$FORGEJO_REPO_OWNER" \
  --repo-name "$FORGEJO_REPO_NAME" \
  --ci-kubeconfig "$CI_KUBECONFIG_PATH" \
  --registry-host "$REGISTRY_HOST" \
  --project-name "$PROJECT_NAME" \
  --project-namespace "$PROJECT_NAMESPACE" \
  --project-deployments "$PROJECT_DEPLOYMENTS" \
  --project-registry-secret-name "$PROJECT_REGISTRY_SECRET_NAME" \
  "${forgejo_args[@]}"
 echo "bootstrap complete for ${FORGEJO_REPO_OWNER}/${FORGEJO_REPO_NAME} in namespace ${PROJECT_NAMESPACE}"
--- a/scripts/deploy/forgejo_repo_bootstrap.py
+++ b/scripts/deploy/forgejo_repo_bootstrap.py
@ -0,0 +1,121 @@
 #!/usr/bin/env python3
 """Orderbooks-specific Forgejo repo bootstrap.
 Creates/updates the Forgejo repository plus Actions settings for the Kubernetes
 orderbooks deployment. This script deliberately does not print secret values.
 """
 from __future__ import annotations
 import argparse
 import base64
 import json
 import ssl
 import urllib.error
 import urllib.parse
 import urllib.request
 from pathlib import Path
 class ForgejoClient:
    def __init__(self, base_url: str, username: str | None = None, password: str | None = None, token: str | None = None):
        self.base_url = base_url.rstrip('/')
        self.username = username or ''
        self.headers = {'Accept': 'application/json', 'Content-Type': 'application/json'}
        if token:
            self.headers['Authorization'] = f'token {token}'
        elif username is not None and password is not None:
            credentials = base64.b64encode(f'{username}:{password}'.encode()).decode()
            self.headers['Authorization'] = f'Basic {credentials}'
        else:
            raise ValueError('ForgejoClient requires either token auth or username/password auth')
        self.ssl_context = ssl.create_default_context()
    def request(self, method: str, path: str, payload=None, expected=(200, 201, 204)):
        data = json.dumps(payload).encode() if payload is not None else None
        req = urllib.request.Request(f'{self.base_url}{path}', data=data, method=method)
        for key, value in self.headers.items():
            req.add_header(key, value)
        try:
            with urllib.request.urlopen(req, context=self.ssl_context) as response:
                body = response.read().decode() if response.length != 0 else ''
                if response.status not in expected:
                    raise RuntimeError(f'{method} {path} returned {response.status}: {body[:500]}')
                return json.loads(body) if body else None
        except urllib.error.HTTPError as exc:
            body = exc.read().decode()
            if exc.code not in expected:
                raise RuntimeError(f'{method} {path} returned {exc.code}: {body[:500]}') from exc
            return json.loads(body) if body else None
    def get_repo(self, owner: str, repo: str):
        try:
            return self.request('GET', f'/api/v1/repos/{urllib.parse.quote(owner)}/{urllib.parse.quote(repo)}')
        except RuntimeError as exc:
            if ' returned 404:' in str(exc):
                return None
            raise
    def create_repo(self, owner: str, name: str, private: bool):
        payload = {'name': name, 'private': private, 'auto_init': False, 'default_branch': 'main'}
        if owner == self.username:
            return self.request('POST', '/api/v1/user/repos', payload, expected=(201,))
        return self.request('POST', f'/api/v1/orgs/{urllib.parse.quote(owner)}/repos', payload, expected=(201,))
    def upsert_variable(self, owner: str, repo: str, name: str, value: str):
        path = f'/api/v1/repos/{urllib.parse.quote(owner)}/{urllib.parse.quote(repo)}/actions/variables/{urllib.parse.quote(name)}'
        try:
            self.request('POST', path, {'value': value}, expected=(201, 204))
        except RuntimeError as exc:
            if ' returned 409:' not in str(exc) and ' returned 422:' not in str(exc):
                raise
            self.request('PUT', path, {'value': value}, expected=(201, 204))
    def upsert_secret(self, owner: str, repo: str, name: str, value: str):
        path = f'/api/v1/repos/{urllib.parse.quote(owner)}/{urllib.parse.quote(repo)}/actions/secrets/{urllib.parse.quote(name)}'
        self.request('PUT', path, {'data': value}, expected=(201, 204))
 def main() -> None:
    """Parse the CLI options, then ensure the repo, its secret and its variables exist.
    Prints one progress line per step; the kubeconfig content itself is never printed.
    """
    parser = argparse.ArgumentParser(description='Bootstrap Forgejo Actions settings for orderbooks')
    parser.add_argument('--forgejo-url', required=True)
    for auth_flag in ('--admin-username', '--admin-password', '--token'):
        parser.add_argument(auth_flag)
    for repo_flag in ('--repo-owner', '--repo-name'):
        parser.add_argument(repo_flag, required=True)
    parser.add_argument('--repo-private', action='store_true')
    for required_flag in (
        '--ci-kubeconfig',
        '--registry-host',
        '--project-name',
        '--project-namespace',
        '--project-deployments',
        '--project-registry-secret-name',
    ):
        parser.add_argument(required_flag, required=True)
    opts = parser.parse_args()
    client = ForgejoClient(opts.forgejo_url, opts.admin_username, opts.admin_password, opts.token)
    owner, repo_name = opts.repo_owner, opts.repo_name
    existing = client.get_repo(owner, repo_name)
    if existing is not None:
        print(f'repo already exists: {existing["full_name"]}')
    else:
        created = client.create_repo(owner, repo_name, opts.repo_private)
        print(f'created repo {created["full_name"]}')
    # The kubeconfig file is stored base64-encoded as an Actions secret.
    encoded_kubeconfig = base64.b64encode(Path(opts.ci_kubeconfig).read_bytes()).decode()
    client.upsert_secret(owner, repo_name, 'KUBECONFIG_B64', encoded_kubeconfig)
    print('upserted repo action secret KUBECONFIG_B64')
    action_variables = (
        ('REGISTRY_HOST', opts.registry_host),
        ('PROJECT_NAME', opts.project_name),
        ('PROJECT_NAMESPACE', opts.project_namespace),
        ('PROJECT_DEPLOYMENTS', opts.project_deployments),
        ('PROJECT_REGISTRY_SECRET_NAME', opts.project_registry_secret_name),
    )
    for variable_name, variable_value in action_variables:
        client.upsert_variable(owner, repo_name, variable_name, variable_value)
    print('upserted repo action variables')
 if __name__ == '__main__':
    main()
--- a/scripts/discover_polymarket_btc_markets.py
+++ b/scripts/discover_polymarket_btc_markets.py
@ -0,0 +1,752 @@
 #!/usr/bin/env python3
 """Discover active Polymarket BTC up/down markets.
 Checkpoint 3 scope: fetch bounded public Gamma metadata, preserve raw responses,
 and write normalized market records with outcome-token mappings. This is not an
 order-book collector.
 """
 from __future__ import annotations
 import argparse
 import datetime as dt
 import hashlib
 import json
 import sys
 import time
 import urllib.error
 import urllib.parse
 import urllib.request
 from pathlib import Path
 from typing import Any
 # Public Gamma endpoint queried for events.
 GAMMA_EVENTS_URL = "https://gamma-api.polymarket.com/events"
 # Tag id sent as the `tag_id` query parameter (the Bitcoin tag per FILTER_RULES).
 BTC_TAG_ID = 235
 # Default artifact locations, relative to the current working directory.
 DEFAULT_OUTPUT_JSON = Path("data/discovery/polymarket_btc_markets_latest.json")
 DEFAULT_MANIFEST = Path("data/discovery/polymarket_btc_markets_manifest.json")
 DEFAULT_MARKDOWN = Path("data/discovery/polymarket_btc_markets.md")
 # Lower-case allow-list of response headers kept in the raw page envelope;
 # filter_headers drops every header not named here.
 SAFE_RESPONSE_HEADERS = {
    "age",
    "cache-control",
    "cf-cache-status",
    "cf-ray",
    "content-encoding",
    "content-length",
    "content-type",
    "date",
    "expires",
    "last-modified",
    "ratelimit-limit",
    "ratelimit-remaining",
    "ratelimit-reset",
    "retry-after",
    "server",
    "strict-transport-security",
    "x-ratelimit-limit",
    "x-ratelimit-remaining",
    "x-ratelimit-reset",
 }
 # Human-readable description of the filters; copied verbatim into the
 # discovery artifact under "filter_rules".
 FILTER_RULES = [
    "Use public Gamma /events with tag_id=235, related_tags=true, active=true, closed=false.",
    "Require event.active=true and event.closed=false.",
    "Require market.active=true and market.closed=false.",
    "Require market.enableOrderBook=true.",
    "Require market.acceptingOrders=true unless --allow-non-accepting-orders is used.",
    "Require market end time to be after the fetch time unless --allow-expired is used.",
    "Require outcomes to resolve to exactly Up and Down.",
    "Require clobTokenIds to resolve to exactly two token IDs.",
    "Require BTC/up-down evidence from seriesSlug, title/slug text, or tags.",
 ]
 def utc_now() -> dt.datetime:
    return dt.datetime.now(dt.UTC)
 def iso_z(value: dt.datetime | None = None) -> str:
    value = value or utc_now()
    return value.astimezone(dt.UTC).replace(microsecond=0).isoformat().replace("+00:00", "Z")
 def parse_iso(value: Any) -> dt.datetime | None:
    if not isinstance(value, str) or not value.strip():
        return None
    text = value.strip()
    if text.endswith("Z"):
        text = text[:-1] + "+00:00"
    try:
        parsed = dt.datetime.fromisoformat(text)
    except ValueError:
        return None
    if parsed.tzinfo is None:
        parsed = parsed.replace(tzinfo=dt.UTC)
    return parsed.astimezone(dt.UTC)
 def sha256_file(path: Path) -> str:
    """Return the hex SHA-256 digest of a file, read in 1 MiB blocks."""
    hasher = hashlib.sha256()
    block_size = 1024 * 1024
    with path.open("rb") as stream:
        while block := stream.read(block_size):
            hasher.update(block)
    return hasher.hexdigest()
 def filter_headers(headers: Any) -> dict[str, str]:
    """Keep only the headers whose lower-cased name is in SAFE_RESPONSE_HEADERS."""
    return {
        header_name: header_value
        for header_name, header_value in dict(headers).items()
        if header_name.lower() in SAFE_RESPONSE_HEADERS
    }
 def normalize_params(params: dict[str, Any]) -> dict[str, Any]:
    """Return a copy of ``params`` with booleans rendered as "true"/"false".
    All non-boolean values are passed through unchanged.
    """
    return {
        key: (("true" if raw else "false") if isinstance(raw, bool) else raw)
        for key, raw in params.items()
    }
 def build_url(url: str, params: dict[str, Any]) -> str:
    """Append the normalized, URL-encoded query parameters to ``url``."""
    encoded = urllib.parse.urlencode(normalize_params(params), doseq=True)
    return "?".join((url, encoded))
 def fetch_json_page(
    *,
    name: str,
    url: str,
    params: dict[str, Any],
    timeout_seconds: float,
 ) -> dict[str, Any]:
    """GET one JSON page and return an envelope describing request and response.
    Request failures are recorded in the envelope instead of being raised, so
    the caller always receives a dict.
    Args:
        name: Label stored in the envelope under "name".
        url: Endpoint URL without a query string.
        params: Query parameters; booleans are rendered as "true"/"false".
        timeout_seconds: Timeout passed to urlopen.
    Returns:
        A dict with "request", "response", timing fields, "error" and "ok".
        "ok" is true only when no error was recorded and the status is 2xx.
    """
    started_monotonic = time.monotonic()
    started_at_utc = iso_z()
    full_url = build_url(url, params)
    request = urllib.request.Request(
        full_url,
        headers={
            "Accept": "application/json",
            "User-Agent": "orderbooks-checkpoint-3-discovery/1.0",
        },
        method="GET",
    )
    status_code: int | None = None
    response_headers: dict[str, str] = {}
    response_text = ""
    error: str | None = None
    try:
        with urllib.request.urlopen(request, timeout=timeout_seconds) as response:
            status_code = response.status
            response_headers = filter_headers(response.headers)
            response_text = response.read().decode("utf-8", errors="replace")
    except urllib.error.HTTPError as exc:
        # HTTP error responses still carry a status, headers and a body worth keeping.
        status_code = exc.code
        response_headers = filter_headers(exc.headers)
        response_text = exc.read().decode("utf-8", errors="replace")
        error = f"HTTPError: {exc}"
    except Exception as exc:  # noqa: BLE001 - preserve probe failure evidence
        error = f"{type(exc).__name__}: {exc}"
    response_json: Any | None = None
    json_error: str | None = None
    if response_text:
        try:
            response_json = json.loads(response_text)
        except json.JSONDecodeError as exc:
            json_error = str(exc)
    return {
        "name": name,
        "started_at_utc": started_at_utc,
        "ended_at_utc": iso_z(),
        "duration_ms": round((time.monotonic() - started_monotonic) * 1000, 3),
        "request": {
            "method": "GET",
            "url": url,
            "full_url": full_url,
            "params": normalize_params(params),
        },
        "response": {
            "status_code": status_code,
            "headers": response_headers,
            "json": response_json,
            "json_error": json_error,
            # A text preview (first 1000 characters) is kept only when no JSON value was decoded.
            "text_preview": response_text[:1000] if response_json is None else None,
        },
        "ok": error is None and status_code is not None and 200 <= status_code < 300,
        "error": error,
    }
 def coerce_json_array(value: Any) -> list[Any]:
    """Return ``value`` as a list, decoding it first when it is a JSON string.
    Anything that is not a list (after optional decoding) yields [].
    """
    candidate = value
    if isinstance(candidate, str):
        try:
            candidate = json.loads(candidate)
        except json.JSONDecodeError:
            return []
    return candidate if isinstance(candidate, list) else []
 def lower_text(value: Any) -> str:
    """Lower-case ``str(value)``; every falsy value becomes the empty string."""
    rendered = str(value) if value else ""
    return rendered.lower()
 def event_tag_text(event: dict[str, Any]) -> str:
    """Join the slug and label of every dict-shaped tag into one lower-case string.
    Non-dict tags are ignored; missing slugs/labels contribute empty strings.
    """
    fragments = [
        str(tag.get(field) or "")
        for tag in (event.get("tags") or [])
        if isinstance(tag, dict)
        for field in ("slug", "label")
    ]
    return " ".join(fragments).lower()
 def has_btc_up_down_evidence(event: dict[str, Any], market: dict[str, Any]) -> bool:
    """Tell whether the event/market looks like a BTC up/down market.
    Three signals are checked and any one is sufficient: the series slug
    starts with "btc-up-or-down"; the combined event/market text mentions
    bitcoin/btc together with "up" and "down"; or the tags mention
    bitcoin/btc together with "up-or-down". All checks are plain substring
    tests on lower-cased text.
    """
    series_slug = lower_text(event.get("seriesSlug"))
    event_words = [lower_text(event.get(field)) for field in ("title", "slug", "ticker", "description")]
    market_words = [lower_text(market.get(field)) for field in ("question", "slug", "description")]
    haystack = " ".join(event_words) + " " + " ".join(market_words)
    tag_blob = event_tag_text(event)
    def _mentions_btc(blob: str) -> bool:
        return "bitcoin" in blob or "btc" in blob
    signals = (
        series_slug.startswith("btc-up-or-down"),
        _mentions_btc(haystack) and "up" in haystack and "down" in haystack,
        _mentions_btc(tag_blob) and "up-or-down" in tag_blob,
    )
    return any(signals)
 def is_up_down_outcomes(outcomes: list[str]) -> bool:
    """Return True when there are exactly two outcomes: "up" and "down" in any case or order."""
    if len(outcomes) != 2:
        return False
    return sorted(label.lower() for label in outcomes) == ["down", "up"]
 def normalize_market(
    *,
    event: dict[str, Any],
    market: dict[str, Any],
    page_index: int,
    event_index: int,
    market_index: int,
    fetched_at_utc: str,
    output_json_path: Path,
 ) -> dict[str, Any]:
    """Build the normalized record for one accepted market.
    Outcomes are paired with CLOB token ids by position, start/end times are
    rewritten to UTC 'Z' form when they parse (otherwise kept as received),
    and ``raw_ref`` records where the market sits inside the raw page dump.
    """
    def _canonical_time(raw: Any) -> Any:
        # Parse once; keep the raw value when it is not a valid timestamp.
        moment = parse_iso(raw)
        return iso_z(moment) if moment else raw
    outcome_labels = [str(label) for label in coerce_json_array(market.get("outcomes"))]
    clob_ids = [str(token) for token in coerce_json_array(market.get("clobTokenIds"))]
    token_records = [
        {"outcome": label, "token_id": clob_id, "outcome_index": position}
        for position, (label, clob_id) in enumerate(zip(outcome_labels, clob_ids))
    ]
    raw_start = (
        market.get("startDate")
        or market.get("startDateIso")
        or event.get("startDate")
        or event.get("creationDate")
    )
    raw_end = market.get("endDate") or market.get("endDateIso") or event.get("endDate")
    slug_of_event = event.get("slug")
    headline = event.get("title")
    question = market.get("question")
    source = {
        "name": "gamma_events_bitcoin_tag",
        "method": "GET",
        "url": GAMMA_EVENTS_URL,
        "params_basis": {
            "tag_id": BTC_TAG_ID,
            "related_tags": "true",
            "active": "true",
            "closed": "false",
            "order": "endDate",
            "ascending": "true",
        },
    }
    pointer = f"raw.gamma_events_pages[{page_index}].response.json[{event_index}].markets[{market_index}]"
    raw_reference = {
        "artifact_path": output_json_path.as_posix(),
        "section": "raw.gamma_events_pages",
        "page_index": page_index,
        "event_index": event_index,
        "market_index": market_index,
        "json_path": pointer,
    }
    return {
        "market_name": "polymarket",
        "market_slug": market.get("slug") or slug_of_event,
        "event_slug": slug_of_event,
        "title": headline or question,
        "question": question or headline,
        "condition_id": market.get("conditionId"),
        "tokens": token_records,
        "outcomes": outcome_labels,
        "start_time_utc": _canonical_time(raw_start),
        "end_time_utc": _canonical_time(raw_end),
        "active": market.get("active"),
        "closed": market.get("closed"),
        "event_active": event.get("active"),
        "event_closed": event.get("closed"),
        "accepting_orders": market.get("acceptingOrders"),
        "enable_order_book": market.get("enableOrderBook"),
        "endpoint_source": source,
        "fetched_at_utc": fetched_at_utc,
        "raw_ref": raw_reference,
    }
 def rejection_reasons(
    *,
    event: dict[str, Any],
    market: dict[str, Any],
    fetched_at: dt.datetime,
    require_accepting_orders: bool,
    require_future_end: bool,
 ) -> list[str]:
    """List every reason a market fails the discovery filters.
    Args:
        event: Gamma event object that contains the market.
        market: Gamma market object being evaluated.
        fetched_at: Reference instant for the "end time is in the future" check.
        require_accepting_orders: Reject markets whose acceptingOrders is not True.
        require_future_end: Reject markets whose end time is missing,
            unparseable, or not after ``fetched_at``.
    Returns:
        Reason codes in check order; an empty list means the market is accepted.
    """
    reasons: list[str] = []
    outcomes = [str(item) for item in coerce_json_array(market.get("outcomes"))]
    token_ids = [str(item) for item in coerce_json_array(market.get("clobTokenIds"))]
    # NOTE(review): normalize_market additionally falls back to market["endDateIso"]
    # when reporting end_time_utc; confirm which end time should drive this filter.
    end_time = parse_iso(market.get("endDate") or event.get("endDate"))
    # Flags must be the exact booleans; missing or string values are rejected.
    if event.get("active") is not True:
        reasons.append("event_not_active")
    if event.get("closed") is not False:
        reasons.append("event_closed")
    if market.get("active") is not True:
        reasons.append("market_not_active")
    if market.get("closed") is not False:
        reasons.append("market_closed")
    if market.get("enableOrderBook") is not True:
        reasons.append("order_book_not_enabled")
    if require_accepting_orders and market.get("acceptingOrders") is not True:
        reasons.append("not_accepting_orders")
    if require_future_end and (end_time is None or end_time <= fetched_at):
        reasons.append("not_future_end")
    if not is_up_down_outcomes(outcomes):
        reasons.append("not_up_down_outcomes")
    if len(token_ids) != 2:
        reasons.append("missing_two_clob_token_ids")
    # A record without a condition id cannot be de-duplicated or referenced
    # downstream; without this check the first such market was accepted and
    # later ones were miscounted as duplicates of the empty id.
    # NOTE(review): FILTER_RULES does not list this requirement yet.
    if not str(market.get("conditionId") or "").strip():
        reasons.append("missing_condition_id")
    if not has_btc_up_down_evidence(event, market):
        reasons.append("missing_btc_up_down_evidence")
    return reasons
 def discover(args: argparse.Namespace) -> dict[str, Any]:
    """Fetch Gamma event pages and build the discovery artifact.
    Pages are requested one after another with limit/offset until a request
    fails, a response is not a JSON list, a page holds fewer than
    ``args.limit`` events, or ``args.max_pages`` pages were fetched. Every
    market of every event is checked with rejection_reasons; accepted markets
    are de-duplicated by conditionId and normalized.
    Args:
        args: Parsed CLI namespace. Reads max_pages, limit, timeout,
            allow_non_accepting_orders, allow_expired, output_json and
            min_markets.
    Returns:
        The artifact dict with the normalized markets, the raw page envelopes
        and a summary whose status is "PASS" when at least
        ``args.min_markets`` markets were accepted and "FAIL" otherwise.
    """
    started_at_utc = iso_z()
    # One reference instant is used for every "end time in the future" check.
    fetched_at = utc_now()
    fetched_at_utc = iso_z(fetched_at)
    raw_pages: list[dict[str, Any]] = []
    normalized: list[dict[str, Any]] = []
    rejected_counts: dict[str, int] = {}
    warnings: list[str] = []
    seen_conditions: set[str] = set()
    for page_index in range(args.max_pages):
        offset = page_index * args.limit
        params = {
            "tag_id": BTC_TAG_ID,
            "related_tags": True,
            "active": True,
            "closed": False,
            "limit": args.limit,
            "offset": offset,
            "order": "endDate",
            "ascending": True,
        }
        page = fetch_json_page(
            name=f"gamma_events_bitcoin_tag_page_{page_index}",
            url=GAMMA_EVENTS_URL,
            params=params,
            timeout_seconds=args.timeout,
        )
        # The envelope is kept even when the request failed, so failures stay auditable.
        raw_pages.append(page)
        payload = page["response"]["json"]
        if not page["ok"]:
            warnings.append(
                f"Page {page_index} request failed with status {page['response']['status_code']}: {page['error']}"
            )
            break
        if not isinstance(payload, list):
            warnings.append(f"Page {page_index} response was not a JSON list.")
            break
        for event_index, event in enumerate(payload):
            if not isinstance(event, dict):
                rejected_counts["event_not_object"] = rejected_counts.get("event_not_object", 0) + 1
                continue
            markets = event.get("markets") or []
            if not isinstance(markets, list) or not markets:
                rejected_counts["missing_markets"] = rejected_counts.get("missing_markets", 0) + 1
                continue
            for market_index, market in enumerate(markets):
                if not isinstance(market, dict):
                    rejected_counts["market_not_object"] = rejected_counts.get("market_not_object", 0) + 1
                    continue
                reasons = rejection_reasons(
                    event=event,
                    market=market,
                    fetched_at=fetched_at,
                    require_accepting_orders=not args.allow_non_accepting_orders,
                    require_future_end=not args.allow_expired,
                )
                if reasons:
                    # A market can fail several filters; each reason is counted.
                    for reason in reasons:
                        rejected_counts[reason] = rejected_counts.get(reason, 0) + 1
                    continue
                # A missing conditionId is treated as the empty string for de-duplication.
                condition_id = str(market.get("conditionId") or "")
                if condition_id in seen_conditions:
                    rejected_counts["duplicate_condition_id"] = rejected_counts.get(
                        "duplicate_condition_id", 0
                    ) + 1
                    continue
                seen_conditions.add(condition_id)
                normalized.append(
                    normalize_market(
                        event=event,
                        market=market,
                        page_index=page_index,
                        event_index=event_index,
                        market_index=market_index,
                        fetched_at_utc=fetched_at_utc,
                        output_json_path=args.output_json,
                    )
                )
        # A short page means the listing is exhausted.
        if len(payload) < args.limit:
            break
    normalized.sort(key=lambda item: (item.get("end_time_utc") or "", item.get("market_slug") or ""))
    if raw_pages:
        last_payload = raw_pages[-1]["response"].get("json")
        # A full final page at the page cap means more events may exist upstream.
        if isinstance(last_payload, list) and len(last_payload) == args.limit and len(raw_pages) >= args.max_pages:
            warnings.append(
                "Discovery stopped at max_pages before exhausting Gamma pagination; output is bounded to the fetched pages."
            )
    if len(normalized) < args.min_markets:
        warnings.append(
            f"Only {len(normalized)} markets passed filters; min_markets={args.min_markets}."
        )
    status = "PASS" if len(normalized) >= args.min_markets else "FAIL"
    status_reason = (
        f"Discovered {len(normalized)} active BTC up/down markets with condition IDs and two token IDs."
        if status == "PASS"
        else "Did not discover enough active BTC up/down markets with condition IDs and two token IDs."
    )
    return {
        "schema_name": "polymarket_btc_market_discovery",
        "schema_version": 1,
        "artifact_status": "valid" if status == "PASS" else "partial",
        "checkpoint_id": 3,
        "checkpoint_name": "Minimal BTC Market Discovery",
        "started_at_utc": started_at_utc,
        "ended_at_utc": iso_z(),
        "fetched_at_utc": fetched_at_utc,
        "scope": "Bounded public Gamma metadata discovery only; no order-book collector.",
        "endpoint_basis": {
            "source_checkpoint": "Checkpoint 2",
            "source_report": "reports/checkpoints/checkpoint_002_polymarket_public_sources.md",
            "endpoint": GAMMA_EVENTS_URL,
            "method": "GET",
            "base_params": {
                "tag_id": BTC_TAG_ID,
                "related_tags": True,
                "active": True,
                "closed": False,
                "limit": args.limit,
                "order": "endDate",
                "ascending": True,
            },
        },
        "filter_rules": FILTER_RULES,
        "normalized_markets": normalized,
        "raw": {
            "gamma_events_pages": raw_pages,
        },
        "summary": {
            "status": status,
            "status_reason": status_reason,
            "raw_pages_fetched": len(raw_pages),
            # Only pages whose body decoded to a list contribute to the event count.
            "raw_events_fetched": sum(
                len(page["response"].get("json") or [])
                for page in raw_pages
                if isinstance(page["response"].get("json"), list)
            ),
            "normalized_market_count": len(normalized),
            "rejected_counts": dict(sorted(rejected_counts.items())),
            "warnings": warnings,
        },
        "fake_progress_risk": "Discovery can appear successful while silently missing markets if filters rely on stale text assumptions or bounded pagination. Raw pages and rejection counts are preserved so missed-market risk can be audited.",
        "next_step": "Checkpoint 4 should use this discovery output as input for a short, raw-first order-book snapshot sample; do not claim reliability until the later 24h soak test.",
    }
 def markdown_table_row(values: list[Any]) -> str:
    """Render ``values`` as one Markdown table row.
    Each value is converted with ``str``. Newlines become spaces and literal
    pipes are escaped as ``\\|`` so a cell can never split into extra columns.
    """
    cells = [str(value).replace("\n", " ").replace("|", "\\|") for value in values]
    return "| " + " | ".join(cells) + " |"
 def write_markdown(discovery: dict[str, Any], path: Path) -> None:
    """Write a human-readable Markdown report of the discovery artifact.
    The report holds the gate status, scope, endpoint, a summary table, one
    table row per normalized market, the warnings, the rejection counts and
    the artifact's risk/next-step notes.
    Args:
        discovery: Artifact dict as returned by discover.
        path: Destination file; parent directories are created when missing.
    """
    summary = discovery["summary"]
    rows = discovery["normalized_markets"]
    lines = [
        "# Polymarket BTC Markets Discovery",
        "",
        f"Artifact status: `{discovery['artifact_status']}`",
        "",
        "## Gate",
        "",
        f"Status: `{summary['status']}`",
        "",
        summary["status_reason"],
        "",
        "## Scope",
        "",
        "Bounded public Gamma metadata discovery only. No order-book collection, no trading, no private endpoints, no secrets.",
        "",
        "## Endpoint",
        "",
        f"- `GET {GAMMA_EVENTS_URL}`",
        "- Params: `tag_id=235`, `related_tags=true`, `active=true`, `closed=false`, `order=endDate`, `ascending=true`, bounded by `limit` and `max_pages`.",
        "",
        "## Summary",
        "",
        markdown_table_row(["Metric", "Value"]),
        markdown_table_row(["---", "---"]),
        markdown_table_row(["fetched_at_utc", discovery["fetched_at_utc"]]),
        markdown_table_row(["raw_pages_fetched", summary["raw_pages_fetched"]]),
        markdown_table_row(["raw_events_fetched", summary["raw_events_fetched"]]),
        markdown_table_row(["normalized_market_count", summary["normalized_market_count"]]),
        "",
        "## Markets",
        "",
        markdown_table_row(
            [
                "market_slug",
                "end_time_utc",
                "condition_id",
                "outcomes",
                "token_ids",
                "accepting_orders",
            ]
        ),
        markdown_table_row(["---", "---", "---", "---", "---", "---"]),
    ]
    # One table row per normalized market; list-valued cells are JSON-encoded.
    for row in rows:
        token_ids = [token["token_id"] for token in row["tokens"]]
        lines.append(
            markdown_table_row(
                [
                    row.get("market_slug"),
                    row.get("end_time_utc"),
                    row.get("condition_id"),
                    json.dumps(row.get("outcomes")),
                    json.dumps(token_ids),
                    row.get("accepting_orders"),
                ]
            )
        )
    lines.extend(
        [
            "",
            "## Warnings",
            "",
        ]
    )
    if summary["warnings"]:
        for warning in summary["warnings"]:
            lines.append(f"- {warning}")
    else:
        lines.append("- None.")
    lines.extend(
        [
            "",
            "## Rejection Counts",
            "",
            "```json",
            json.dumps(summary["rejected_counts"], indent=2, sort_keys=True),
            "```",
            "",
            "## Raw Preservation",
            "",
            "The latest JSON artifact stores raw Gamma response envelopes under `raw.gamma_events_pages`. Each normalized record has a `raw_ref` pointing back to the source event market.",
            "",
            "## Strongest Fake-Progress Risk",
            "",
            discovery["fake_progress_risk"],
            "",
            "## Next Smallest Step",
            "",
            discovery["next_step"],
            "",
        ]
    )
    path.parent.mkdir(parents=True, exist_ok=True)
    # The final empty list item makes the joined text end with a newline.
    path.write_text("\n".join(lines), encoding="utf-8")
 def write_manifest(
    *,
    discovery: dict[str, Any],
    manifest_path: Path,
    output_json: Path,
    markdown_path: Path,
    command: str,
 ) -> None:
    """Write the JSON manifest that indexes the discovery outputs.
    The manifest records the gate status, request and row counts, the ids of
    the discovered markets, and a SHA-256 entry for each output file.
    Args:
        discovery: Artifact dict as returned by discover.
        manifest_path: Destination file; parent directories are created.
        output_json: Path of the discovery JSON, expected to be written already.
        markdown_path: Path of the Markdown report, expected to be written already.
        command: Command line recorded verbatim under "command".
    """
    status = discovery["summary"]["status"]
    # A file is "valid" only when it exists and is non-empty; the hash is
    # computed whenever the file exists.
    output_files = [
        {
            "path": output_json.as_posix(),
            "kind": "latest_discovery_json",
            "status": "valid" if output_json.exists() and output_json.stat().st_size else "missing",
            "sha256": sha256_file(output_json) if output_json.exists() else None,
        },
        {
            "path": markdown_path.as_posix(),
            "kind": "discovery_markdown",
            "status": "valid" if markdown_path.exists() and markdown_path.stat().st_size else "missing",
            "sha256": sha256_file(markdown_path) if markdown_path.exists() else None,
        },
    ]
    # The script path is relative, so it is hashed only when it resolves from
    # the current working directory.
    script_path = Path("scripts/discover_polymarket_btc_markets.py")
    if script_path.exists():
        output_files.append(
            {
                "path": script_path.as_posix(),
                "kind": "discovery_script",
                "status": "valid",
                "sha256": sha256_file(script_path),
            }
        )
    # Count fetched pages per HTTP status; a request without a response is keyed "None".
    status_codes: dict[str, int] = {}
    for page in discovery["raw"]["gamma_events_pages"]:
        code = str(page["response"].get("status_code"))
        status_codes[code] = status_codes.get(code, 0) + 1
    manifest = {
        "schema_name": "polymarket_btc_markets_manifest",
        "schema_version": 1,
        "checkpoint_id": 3,
        "checkpoint_name": "Minimal BTC Market Discovery",
        "status": status,
        "started_at_utc": discovery["started_at_utc"],
        "ended_at_utc": discovery["ended_at_utc"],
        "scope": discovery["scope"],
        "command": command,
        "endpoint": discovery["endpoint_basis"],
        "request_counts": {
            "gamma_events_pages": discovery["summary"]["raw_pages_fetched"],
            "status_code_counts": dict(sorted(status_codes.items())),
        },
        "row_counts": {
            "raw_events_fetched": discovery["summary"]["raw_events_fetched"],
            "normalized_markets": discovery["summary"]["normalized_market_count"],
        },
        "market_ids": [
            {
                "market_slug": row.get("market_slug"),
                "condition_id": row.get("condition_id"),
                "token_ids": [token.get("token_id") for token in row.get("tokens", [])],
            }
            for row in discovery["normalized_markets"]
        ],
        "output_files": output_files,
        "warnings": discovery["summary"]["warnings"],
        "validation": {
            "summary": discovery["summary"]["status_reason"],
            "required_record_fields": [
                "market_name",
                "market_slug",
                "question",
                "condition_id",
                "tokens",
                "outcomes",
                "start_time_utc",
                "end_time_utc",
                "active",
                "closed",
                "accepting_orders",
                "enable_order_book",
                "endpoint_source",
                "fetched_at_utc",
                "raw_ref",
            ],
        },
        "fake_progress_risk": discovery["fake_progress_risk"],
        "next_step": discovery["next_step"],
    }
    manifest_path.parent.mkdir(parents=True, exist_ok=True)
    manifest_path.write_text(json.dumps(manifest, indent=2, sort_keys=True) + "\n", encoding="utf-8")
 def write_outputs(args: argparse.Namespace, discovery: dict[str, Any]) -> None:
    """Write the JSON artifact, then the Markdown report, then the manifest.
    The manifest is written last because it hashes the other two files.
    """
    json_path = args.output_json
    json_path.parent.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(discovery, indent=2, sort_keys=True)
    json_path.write_text(serialized + "\n", encoding="utf-8")
    write_markdown(discovery, args.markdown)
    # Record the invocation with a POSIX-style script path.
    invocation = [Path(sys.argv[0]).as_posix()] + sys.argv[1:]
    write_manifest(
        discovery=discovery,
        manifest_path=args.manifest,
        output_json=json_path,
        markdown_path=args.markdown,
        command=" ".join(invocation),
    )
 def parse_args() -> argparse.Namespace:
    """Parse the discovery CLI options from ``sys.argv``."""
    parser = argparse.ArgumentParser(
        description="Discover active BTC up/down Polymarket markets from public Gamma metadata."
    )
    path_options = (
        ("--output-json", DEFAULT_OUTPUT_JSON),
        ("--manifest", DEFAULT_MANIFEST),
        ("--markdown", DEFAULT_MARKDOWN),
    )
    for flag, default_path in path_options:
        parser.add_argument(flag, type=Path, default=default_path)
    numeric_options = (
        ("--limit", int, 100),
        ("--max-pages", int, 3),
        ("--timeout", float, 15.0),
        ("--min-markets", int, 1),
    )
    for flag, converter, default_value in numeric_options:
        parser.add_argument(flag, type=converter, default=default_value)
    for flag in ("--allow-expired", "--allow-non-accepting-orders"):
        parser.add_argument(flag, action="store_true")
    return parser.parse_args()
 def main() -> int:
    """Run discovery, write all outputs and print a JSON summary.
    Returns:
        0 when the discovery status is "PASS", otherwise 1.
    """
    args = parse_args()
    discovery = discover(args)
    write_outputs(args, discovery)
    summary = discovery["summary"]
    market_rows = [
        {
            "market_slug": row.get("market_slug"),
            "condition_id": row.get("condition_id"),
            "token_ids": [token.get("token_id") for token in row.get("tokens", [])],
            "end_time_utc": row.get("end_time_utc"),
        }
        for row in discovery["normalized_markets"]
    ]
    report = {
        "status": summary["status"],
        "status_reason": summary["status_reason"],
        "output_json": args.output_json.as_posix(),
        "manifest": args.manifest.as_posix(),
        "markdown": args.markdown.as_posix(),
        "normalized_market_count": summary["normalized_market_count"],
        "markets": market_rows,
        "warnings": summary["warnings"],
    }
    print(json.dumps(report, indent=2, sort_keys=True))
    if summary["status"] == "PASS":
        return 0
    return 1
 if __name__ == "__main__":
    raise SystemExit(main())
--- a/scripts/k8s_runtime_smoke_check.sh
+++ b/scripts/k8s_runtime_smoke_check.sh
@ -0,0 +1,466 @@
 #!/usr/bin/env bash
 # -u: expanding an unset variable is an error; pipefail: a pipeline fails when
 # any stage fails. NOTE(review): -e is not set, so a failing command does not
 # abort the script on its own.
 set -uo pipefail
 # Settings: each one takes its default from an ORDERBOOKS_* environment
 # variable and falls back to the literal after ":-" when that is unset or empty.
 NAMESPACE="${ORDERBOOKS_K8S_NAMESPACE:-orderbooks}"
 DEPLOYMENT="${ORDERBOOKS_K8S_COLLECTOR_DEPLOYMENT:-orderbooks-collector}"
 CRONJOB="${ORDERBOOKS_K8S_UPLOADER_CRONJOB:-orderbooks-uploader}"
 RAW_DIR="${ORDERBOOKS_K8S_RAW_DIR:-/var/lib/orderbooks/raw_orderbooks}"
 MANIFEST_DIR="${ORDERBOOKS_K8S_MANIFEST_DIR:-/var/lib/orderbooks/manifests}"
 WAIT_SECONDS="${ORDERBOOKS_K8S_SMOKE_WAIT_SECONDS:-1200}"
 UPLOAD_MIN_AGE_SECONDS="${ORDERBOOKS_UPLOAD_MIN_AGE_SECONDS:-600}"
 KUBECTL_BIN="${ORDERBOOKS_KUBECTL:-kubectl}"
 # UTC timestamp of this invocation; it makes the default evidence file name
 # unique per run.
 RUN_ID="$(date -u +%Y%m%dT%H%M%SZ)"
 EVIDENCE_PATH="${ORDERBOOKS_K8S_SMOKE_EVIDENCE_PATH:-data/manifests/k8s_runtime_smoke_${RUN_ID}.json}"
 # Print the command-line help text to stdout. The heredoc delimiter is quoted,
 # so the text is emitted literally with no variable expansion.
 usage() {
  cat <<'EOF'
 Usage: scripts/k8s_runtime_smoke_check.sh [options]
 Run after the orderbooks Kubernetes workload is deployed. The script uses
 kubectl, writes local JSON evidence, deletes one collector pod to force a
 Deployment restart, verifies raw gzip JSONL files and manifests on the PVC,
 then triggers the uploader CronJob and requires a verified upload manifest.
 Options:
  --namespace NAME       Namespace. Default: orderbooks.
  --deployment NAME      Collector deployment. Default: orderbooks-collector.
  --cronjob NAME         Uploader CronJob. Default: orderbooks-uploader.
  --raw-dir PATH         Raw path inside collector pod. Default: /var/lib/orderbooks/raw_orderbooks.
  --manifest-dir PATH    Manifest path inside collector pod. Default: /var/lib/orderbooks/manifests.
  --wait-seconds N       Max wait for collector/upload evidence. Default: 1200.
  --upload-min-age-seconds N
                         Wait for at least one raw/manifest file to be this old before upload. Default: 600.
  --evidence-path PATH   Local JSON evidence path.
  --kubectl PATH         kubectl binary. Default: kubectl.
  --help                 Show this help.
 This script does not read or print rclone config contents.
 EOF
 }
 # Parse command-line options; each one overrides the corresponding setting.
 while [[ $# -gt 0 ]]; do
  # Every option except --help consumes the following argument. Reject a
  # trailing option that has no value explicitly: under `set -u`, expanding the
  # missing "$2" would otherwise abort with a bare "unbound variable" error.
  case "$1" in
    --namespace|--deployment|--cronjob|--raw-dir|--manifest-dir|--wait-seconds|--upload-min-age-seconds|--evidence-path|--kubectl)
      if [[ $# -lt 2 ]]; then
        echo "Missing value for $1" >&2
        usage >&2
        exit 2
      fi
      ;;
  esac
  case "$1" in
    --namespace) NAMESPACE="$2"; shift 2 ;;
    --deployment) DEPLOYMENT="$2"; shift 2 ;;
    --cronjob) CRONJOB="$2"; shift 2 ;;
    --raw-dir) RAW_DIR="$2"; shift 2 ;;
    --manifest-dir) MANIFEST_DIR="$2"; shift 2 ;;
    --wait-seconds) WAIT_SECONDS="$2"; shift 2 ;;
    --upload-min-age-seconds) UPLOAD_MIN_AGE_SECONDS="$2"; shift 2 ;;
    --evidence-path) EVIDENCE_PATH="$2"; shift 2 ;;
    --kubectl) KUBECTL_BIN="$2"; shift 2 ;;
    --help) usage; exit 0 ;;
    *) echo "Unknown argument: $1" >&2; usage >&2; exit 2 ;;
  esac
 done
 # Make sure the directory for the evidence file exists before the check runs.
 mkdir -p "$(dirname "${EVIDENCE_PATH}")"
 # Run the smoke check as an inline Python program read from the heredoc that
 # follows; the settings are handed over as positional arguments sys.argv[1..9].
 PYTHONDONTWRITEBYTECODE=1 python3 - "$KUBECTL_BIN" "$NAMESPACE" "$DEPLOYMENT" "$CRONJOB" "$RAW_DIR" "$MANIFEST_DIR" "$WAIT_SECONDS" "$UPLOAD_MIN_AGE_SECONDS" "$EVIDENCE_PATH" <<'PY_SMOKE'
 import datetime as dt
 import json
 import subprocess
 import sys
 import time
 from pathlib import Path
 # Positional arguments supplied by the bash wrapper, in its fixed order.
 kubectl = sys.argv[1]
 namespace = sys.argv[2]
 deployment = sys.argv[3]
 cronjob = sys.argv[4]
 raw_dir = sys.argv[5]
 manifest_dir = sys.argv[6]
 # Both durations must be integer text; int() raises ValueError otherwise.
 wait_seconds = int(sys.argv[7])
 upload_min_age_seconds = int(sys.argv[8])
 evidence_path = Path(sys.argv[9])
 # Start time in UTC, truncated to whole seconds, with a 'Z' suffix.
 started_at = dt.datetime.now(dt.UTC).replace(microsecond=0).isoformat().replace('+00:00', 'Z')
 # One record per executed command; appended to by capture().
 checks = []
 # Failure messages collected by the exception handler of the main flow.
 failures = []
 def iso_now():
    return dt.datetime.now(dt.UTC).replace(microsecond=0).isoformat().replace('+00:00', 'Z')
 def capture(command, input_text=None, timeout=None):
    """Run a command, log its outcome in ``checks``, and return (process, record).

    Args:
        command: Argument list to execute.
        input_text: Optional text sent to the command's stdin.
        timeout: Optional limit in seconds; subprocess.TimeoutExpired
            propagates to the caller when it is exceeded.

    Returns:
        The completed process and the record appended to ``checks``. The record
        keeps only the last 6000 characters of stdout and stderr.
    """
    completed = subprocess.run(
        command,
        input=input_text,
        text=True,
        capture_output=True,
        timeout=timeout,
    )
    record = {}
    record['command'] = command
    record['exit_code'] = completed.returncode
    record['stdout_tail'] = completed.stdout[-6000:]
    record['stderr_tail'] = completed.stderr[-6000:]
    record['ran_at_utc'] = iso_now()
    checks.append(record)
    return completed, record
 def run(command, input_text=None, timeout=None):
    """Run a command through capture() and return only its recorded check entry."""
    return capture(command, input_text=input_text, timeout=timeout)[1]
 def run_json(command, input_text=None, timeout=None):
    """Run a command through capture() and return its stdout decoded as JSON.

    Raises:
        RuntimeError: if the command exits with a non-zero status.
    """
    completed, record = capture(command, input_text=input_text, timeout=timeout)
    if record['exit_code'] == 0:
        return json.loads(completed.stdout)
    raise RuntimeError(f"command failed: {' '.join(command)}")
 def pod_ready(pod):
    """Return True when a pod is Running, not terminating, and fully ready.

    A deleted pod keeps phase 'Running' with ready containers until its
    containers actually stop, but it carries metadata.deletionTimestamp. Such
    pods are rejected so that a lookup made right after
    `kubectl delete pod --wait=false` cannot select the old, terminating pod.

    Args:
        pod: Pod object as decoded from `kubectl get pods -o json`.
    """
    if pod.get('metadata', {}).get('deletionTimestamp'):
        return False
    status = pod.get('status', {})
    if status.get('phase') != 'Running':
        return False
    # A pod with no reported container statuses is not considered ready.
    statuses = status.get('containerStatuses') or []
    return bool(statuses) and all(item.get('ready') for item in statuses)
 def get_collector_pod():
    """Poll until a ready collector pod exists and return (name, pod object).

    Pods are selected by the fixed orderbooks/collector labels. When several
    are ready, the one with the latest creationTimestamp is returned. Polls
    every 10 seconds for up to ``wait_seconds``.

    Raises:
        TimeoutError: if no ready pod appears before the deadline.
    """
    selector = 'app.kubernetes.io/name=orderbooks,app.kubernetes.io/component=collector'
    give_up_at = time.time() + wait_seconds
    last = None
    while time.time() <= give_up_at:
        listing = run_json([kubectl, '-n', namespace, 'get', 'pods', '-l', selector, '-o', 'json'])
        items = listing.get('items', [])
        ready = sorted(
            (pod for pod in items if pod_ready(pod)),
            key=lambda pod: pod.get('metadata', {}).get('creationTimestamp', ''),
        )
        if ready:
            newest = ready[-1]
            return newest['metadata']['name'], newest
        last = items
        time.sleep(10)
    raise TimeoutError(f'no ready collector pod found; last pods={last}')
 def exec_python(pod, code, args):
    """Run a Python program inside a pod via `kubectl exec` and return its JSON output.

    The program text is piped to `python3 -` in the pod, followed by ``args``.
    The embedded validation programs report "ran, but the check did not pass"
    by printing a JSON object and exiting with status 2. That object is
    returned so callers can inspect its flags and diagnostics instead of
    losing them behind an error built from an empty stderr.

    Args:
        pod: Name of the pod to exec into.
        code: Python source sent on stdin.
        args: Extra command-line arguments for the program (strings).

    Returns:
        The decoded JSON document printed by the program.

    Raises:
        RuntimeError: for any other non-zero exit, or exit 2 without a JSON
            object on stdout; the message carries both output tails.
    """
    command = [kubectl, '-n', namespace, 'exec', '-i', pod, '--', 'python3', '-', *args]
    proc, item = capture(command, input_text=code, timeout=wait_seconds + 60)
    if item['exit_code'] == 0:
        return json.loads(proc.stdout)
    if item['exit_code'] == 2:
        try:
            payload = json.loads(proc.stdout)
        except ValueError:
            payload = None
        if isinstance(payload, dict):
            return payload
    raise RuntimeError(
        f"pod python command failed in {pod} (exit {item['exit_code']}): "
        f"stderr={item['stderr_tail']!r} stdout={item['stdout_tail']!r}"
    )
 def wait_for_valid_collector(pod, after_mtime, label):
    """Poll the pod until a collector manifest newer than ``after_mtime`` validates.

    Runs ``collector_validation_code`` in the pod every 15 seconds for up to
    ``wait_seconds``. Errors while polling are remembered and retried.

    Args:
        pod: Name of the collector pod to exec into.
        after_mtime: Only manifests modified after this timestamp are considered.
        label: Tag stored in the result as 'wait_label' and used in the
            timeout message.

    Returns:
        The validation result with 'wait_label' added.

    Raises:
        TimeoutError: if nothing validates in time; includes the last outcome.
    """
    program_args = [manifest_dir, raw_dir, str(after_mtime)]
    give_up_at = time.time() + wait_seconds
    last_error = None
    while time.time() <= give_up_at:
        try:
            outcome = exec_python(pod, collector_validation_code, program_args)
            if not outcome.get('valid'):
                last_error = outcome
            else:
                outcome['wait_label'] = label
                return outcome
        except Exception as exc:
            last_error = repr(exc)
        time.sleep(15)
    raise TimeoutError(f'no valid {label} collector manifest found before timeout: {last_error}')
 def wait_for_upload_eligible_files(pod):
    """Poll the pod until raw and manifest files old enough to upload exist.

    Runs ``upload_eligibility_code`` in the pod every 15 seconds for up to
    ``wait_seconds``. A failed poll (for example a transient `kubectl exec`
    error) is remembered and retried rather than aborting the wait, which is
    the same policy wait_for_valid_collector applies.

    Args:
        pod: Name of the collector pod to exec into.

    Returns:
        The first eligibility result whose 'eligible' flag is true.

    Raises:
        TimeoutError: if no poll reports eligible files in time; includes the
            last result or error.
    """
    deadline = time.time() + wait_seconds
    last = None
    while time.time() <= deadline:
        try:
            result = exec_python(pod, upload_eligibility_code, [raw_dir, manifest_dir, str(upload_min_age_seconds)])
            if result.get('eligible'):
                return result
            last = result
        except Exception as exc:
            # Keep polling; the deadline still bounds the total wait.
            last = repr(exc)
        time.sleep(15)
    raise TimeoutError(f'no upload-eligible raw/manifest files before timeout: {last}')
 # Program text executed inside the collector pod. Its argv is: manifest dir,
 # raw dir, minimum manifest mtime. It walks collector manifests newer than that
 # mtime, newest first, re-parsing every listed raw gzip JSONL file and comparing
 # row counts and SHA-256 digests with the manifest. It prints one JSON object
 # and exits 0 on the first valid manifest, otherwise exits 2.
 collector_validation_code = r'''
 import gzip
 import hashlib
 import json
 import sys
 from pathlib import Path
 manifest_dir = Path(sys.argv[1])
 raw_dir = Path(sys.argv[2])
 after_mtime = float(sys.argv[3])
 def sha256(path):
    digest = hashlib.sha256()
    with path.open('rb') as handle:
        for chunk in iter(lambda: handle.read(1024 * 1024), b''):
            digest.update(chunk)
    return digest.hexdigest()
 def parse_raw(path):
    rows = 0
    first_keys = []
    with gzip.open(path, 'rt', encoding='utf-8') as handle:
        for line in handle:
            if not line.strip():
                continue
            obj = json.loads(line)
            if rows == 0:
                first_keys = sorted(obj.keys())
            rows += 1
    return rows, first_keys
 def validate(path):
    manifest = json.loads(path.read_text(encoding='utf-8'))
    output_files = []
    for item in manifest.get('output_files', []):
        raw_path = Path(item['path'])
        rows, first_keys = parse_raw(raw_path)
        actual_sha = sha256(raw_path)
        output_files.append({
            'path': str(raw_path),
            'bytes': raw_path.stat().st_size,
            'mtime': raw_path.stat().st_mtime,
            'manifest_rows': item.get('rows'),
            'rows_parsed': rows,
            'row_count_matches_manifest': rows == item.get('rows'),
            'manifest_sha256': item.get('sha256'),
            'actual_sha256': actual_sha,
            'sha256_matches_manifest': actual_sha == item.get('sha256'),
            'under_raw_dir': raw_path.resolve().is_relative_to(raw_dir.resolve()),
            'first_row_keys': first_keys,
        })
    valid = (
        manifest.get('gate_status') == 'PASS'
        and manifest.get('rows_written', 0) > 0
        and manifest.get('failure_count') == 0
        and not manifest.get('failures')
        and bool(output_files)
        and all(item['rows_parsed'] > 0 and item['row_count_matches_manifest'] and item['sha256_matches_manifest'] and item['under_raw_dir'] for item in output_files)
    )
    return {
        'path': str(path),
        'mtime': path.stat().st_mtime,
        'manifest_summary': {
            'gate_status': manifest.get('gate_status'),
            'rows_written': manifest.get('rows_written'),
            'failure_count': manifest.get('failure_count'),
            'failures_present': bool(manifest.get('failures')),
            'output_file_count': len(manifest.get('output_files', [])),
            'started_at_utc': manifest.get('started_at_utc'),
            'ended_at_utc': manifest.get('ended_at_utc'),
        },
        'output_files': output_files,
        'valid': valid,
    }
 candidates = sorted(manifest_dir.glob('polymarket_orderbook_collector_*.json'), key=lambda p: p.stat().st_mtime)
 candidates = [path for path in candidates if path.stat().st_mtime > after_mtime]
 latest = None
 for path in reversed(candidates):
    try:
        result = validate(path)
    except Exception as exc:
        latest = {'path': str(path), 'valid': False, 'error': repr(exc)}
        continue
    latest = result
    if result['valid']:
        print(json.dumps(result, sort_keys=True))
        sys.exit(0)
 print(json.dumps(latest or {'valid': False, 'error': 'no collector manifest candidates'}, sort_keys=True))
 sys.exit(2)
 '''
 # Program text executed inside a pod to re-check a single raw file. Its argv
 # is: file path, expected SHA-256, expected row count. It re-parses the gzip
 # JSONL file, hashes it, and prints a JSON object with the actual values and
 # whether each one matches the expectation.
 raw_check_code = r'''
 import gzip
 import hashlib
 import json
 import sys
 from pathlib import Path
 path = Path(sys.argv[1])
 expected_sha = sys.argv[2]
 expected_rows = int(sys.argv[3])
 def sha256(path):
    digest = hashlib.sha256()
    with path.open('rb') as handle:
        for chunk in iter(lambda: handle.read(1024 * 1024), b''):
            digest.update(chunk)
    return digest.hexdigest()
 rows = 0
 with gzip.open(path, 'rt', encoding='utf-8') as handle:
    for line in handle:
        if line.strip():
            json.loads(line)
            rows += 1
 actual_sha = sha256(path)
 print(json.dumps({
    'path': str(path),
    'expected_sha256': expected_sha,
    'actual_sha256': actual_sha,
    'sha256_matches': actual_sha == expected_sha,
    'expected_rows': expected_rows,
    'actual_rows': rows,
    'row_count_matches': rows == expected_rows,
 }, sort_keys=True))
 '''
 # Program text executed inside a pod to inspect the upload result. Its argv
 # is: manifest dir, minimum manifest mtime. Only the newest
 # upload_archive_*.json at or after that mtime is examined. It prints a JSON
 # summary and exits 2 when there is no candidate or when the manifest is not a
 # passing, verified upload with at least one verified file.
 upload_validation_code = r'''
 import json
 import sys
 from pathlib import Path
 manifest_dir = Path(sys.argv[1])
 after_mtime = float(sys.argv[2])
 candidates = sorted(manifest_dir.glob('upload_archive_*.json'), key=lambda p: p.stat().st_mtime)
 candidates = [path for path in candidates if path.stat().st_mtime >= after_mtime]
 if not candidates:
    print(json.dumps({'valid': False, 'error': 'no upload manifest candidates'}, sort_keys=True))
    sys.exit(2)
 path = candidates[-1]
 manifest = json.loads(path.read_text(encoding='utf-8'))
 verified_count = manifest.get('counts', {}).get('verified', len(manifest.get('verified_files', [])))
 valid = (
    manifest.get('operation_status') == 'UPLOAD_VERIFIED'
    and manifest.get('gate_status') == 'PASS'
    and manifest.get('rclone', {}).get('copy_exit_code') == 0
    and manifest.get('rclone', {}).get('check_exit_code') == 0
    and verified_count > 0
 )
 verified_files = manifest.get('verified_files', [])
 print(json.dumps({
    'path': str(path),
    'mtime': path.stat().st_mtime,
    'manifest_summary': {
        'operation_status': manifest.get('operation_status'),
        'gate_status': manifest.get('gate_status'),
        'counts': manifest.get('counts', {}),
        'planned_file_count': len(manifest.get('planned_files', [])),
        'attempted_file_count': len(manifest.get('attempted_files', [])),
        'uploaded_file_count': len(manifest.get('uploaded_files', [])),
        'verified_file_count': verified_count,
        'rclone_copy_exit_code': manifest.get('rclone', {}).get('copy_exit_code'),
        'rclone_check_exit_code': manifest.get('rclone', {}).get('check_exit_code'),
        'started_at_utc': manifest.get('started_at_utc'),
        'ended_at_utc': manifest.get('ended_at_utc'),
    },
    'verified_count': verified_count,
    'verified_file_samples': [
        {
            'relative_path': item.get('relative_path'),
            'bytes': item.get('bytes'),
            'sha256': item.get('sha256'),
            'kind': item.get('kind'),
        }
        for item in verified_files[:5]
    ],
    'valid': valid,
 }, sort_keys=True))
 if not valid:
    sys.exit(2)
 '''
 # Program text executed inside a pod. Its argv is: raw dir, manifest dir,
 # minimum age in seconds. It lists raw *.jsonl.gz files and collector
 # manifests whose mtime is at least that old and prints a JSON object whose
 # 'eligible' flag is true only when both lists are non-empty.
 upload_eligibility_code = r'''
 import json
 import sys
 import time
 from pathlib import Path
 raw_dir = Path(sys.argv[1])
 manifest_dir = Path(sys.argv[2])
 min_age_seconds = int(sys.argv[3])
 now = time.time()
 def eligible_files(root, pattern):
    if not root.exists():
        return []
    items = []
    for path in sorted(root.rglob(pattern)):
        if not path.is_file():
            continue
        age = max(0, int(now - path.stat().st_mtime))
        if age >= min_age_seconds:
            items.append({'path': str(path), 'bytes': path.stat().st_size, 'age_seconds': age})
    return items
 raw_files = eligible_files(raw_dir, '*.jsonl.gz')
 manifest_files = eligible_files(manifest_dir, 'polymarket_orderbook_collector_*.json')
 print(json.dumps({
    'eligible': bool(raw_files) and bool(manifest_files),
    'min_age_seconds': min_age_seconds,
    'raw_eligible_count': len(raw_files),
    'manifest_eligible_count': len(manifest_files),
    'raw_sample': raw_files[:3],
    'manifest_sample': manifest_files[:3],
 }, sort_keys=True))
 '''
 # Evidence document that is written to evidence_path when the run ends. It
 # starts in the ERROR state; the main flow switches gate_status to PASS or
 # FAIL. 'checks' and 'failures' are the same list objects that are appended to
 # during the run, so later entries appear here automatically.
 summary = {
    'schema_name': 'k8s_runtime_smoke_result',
    'schema_version': 1,
    'started_at_utc': started_at,
    'ended_at_utc': None,
    'gate_status': 'ERROR',
    'production_ready': False,
    'namespace': namespace,
    'deployment': deployment,
    'cronjob': cronjob,
    'raw_dir': raw_dir,
    'manifest_dir': manifest_dir,
    'upload_min_age_seconds': upload_min_age_seconds,
    'checks': checks,
    'failures': failures,
 }
 # Main smoke flow. Any exception marks the run as FAIL; the evidence file is
 # written in every case and the process exits non-zero unless the gate passed.
 try:
    # Step 1: the collector Deployment must finish rolling out.
    rollout = run([kubectl, '-n', namespace, 'rollout', 'status', f'deployment/{deployment}', f'--timeout={wait_seconds}s'])
    if rollout['exit_code'] != 0:
        raise RuntimeError('collector deployment rollout is not healthy')
    # Step 2: find a ready collector pod and a manifest that validates against
    # its raw files (after_mtime 0 accepts any existing manifest).
    pod_name, pod_obj = get_collector_pod()
    before = wait_for_valid_collector(pod_name, 0, 'initial')
    before_mtime = before['mtime']
    old_file = before['output_files'][0]
    # Step 3: delete the pod without waiting and require the Deployment to recover.
    delete_pod = run([kubectl, '-n', namespace, 'delete', 'pod', pod_name, '--wait=false'])
    if delete_pod['exit_code'] != 0:
        raise RuntimeError('failed to delete collector pod for restart test')
    rollout_after = run([kubectl, '-n', namespace, 'rollout', 'status', f'deployment/{deployment}', f'--timeout={wait_seconds}s'])
    if rollout_after['exit_code'] != 0:
        raise RuntimeError('collector deployment did not recover after pod delete')
    new_pod, new_pod_obj = get_collector_pod()
    # Step 4: a raw file written before the restart must hash and parse
    # identically when read from the pod found after the restart.
    old_check = exec_python(new_pod, raw_check_code, [old_file['path'], old_file['actual_sha256'], str(old_file['rows_parsed'])])
    if not old_check.get('sha256_matches') or not old_check.get('row_count_matches'):
        raise RuntimeError('old raw file changed or stopped parsing after pod restart')
    # Step 5: a manifest newer than the pre-restart one must validate.
    after = wait_for_valid_collector(new_pod, before_mtime, 'post_restart')
    # Step 6: wait for files old enough to upload, then run the uploader once
    # through a Job created from the CronJob.
    upload_eligibility = wait_for_upload_eligible_files(new_pod)
    # Lower bound for the upload manifest mtime, with a 2-second margin.
    # NOTE(review): this compares the local clock with file mtimes seen in the
    # pod, so it assumes the two clocks are roughly in sync - confirm.
    upload_start_mtime = time.time() - 2
    job_name = 'orderbooks-uploader-smoke-' + dt.datetime.now(dt.UTC).strftime('%Y%m%dt%H%M%Sz').lower()
    # Remove a leftover job of the same name; the result is recorded but not checked.
    run([kubectl, '-n', namespace, 'delete', 'job', job_name, '--ignore-not-found=true'])
    create_job = run([kubectl, '-n', namespace, 'create', 'job', job_name, f'--from=cronjob/{cronjob}'])
    if create_job['exit_code'] != 0:
        raise RuntimeError('failed to create uploader smoke job from CronJob')
    wait_upload = run([kubectl, '-n', namespace, 'wait', '--for=condition=Complete', f'--timeout={wait_seconds}s', f'job/{job_name}'])
    # Logs are fetched before the completion check so they are recorded in
    # 'checks' even when the job did not complete.
    logs = run([kubectl, '-n', namespace, 'logs', f'job/{job_name}'])
    if wait_upload['exit_code'] != 0:
        raise RuntimeError('uploader smoke job did not complete')
    # Step 7: the upload manifest written by the job must verify.
    upload = exec_python(new_pod, upload_validation_code, [manifest_dir, str(upload_start_mtime)])
    if not upload.get('valid'):
        raise RuntimeError('upload manifest did not verify at least one file')
    # Detailed results are attached only when every step succeeded.
    summary.update({
        'initial_collector_pod': pod_name,
        'post_restart_collector_pod': new_pod,
        'before_restart_collector': before,
        'old_raw_file_after_restart': old_check,
        'after_restart_collector': after,
        'upload_eligibility': upload_eligibility,
        'uploader_job': job_name,
        'upload_result': upload,
        'uploader_log_check_exit_code': logs['exit_code'],
    })
    summary['gate_status'] = 'PASS'
 except Exception as exc:
    failures.append(str(exc))
    summary['exception'] = repr(exc)
    summary['gate_status'] = 'FAIL'
 finally:
    # Always persist the evidence, whether the run passed or failed.
    summary['ended_at_utc'] = iso_now()
    evidence_path.parent.mkdir(parents=True, exist_ok=True)
    evidence_path.write_text(json.dumps(summary, indent=2, sort_keys=True) + '\n', encoding='utf-8')
 # Machine-readable result lines for the caller, then the exit status.
 print(f'K8S_SMOKE_EVIDENCE={evidence_path}')
 print(f'K8S_SMOKE_GATE={summary["gate_status"]}')
 if summary['gate_status'] != 'PASS':
    sys.exit(1)
 PY_SMOKE
--- a/scripts/normalize_polymarket_orderbooks.py
+++ b/scripts/normalize_polymarket_orderbooks.py
@ -0,0 +1,496 @@
 #!/usr/bin/env python3
 """Normalize raw Polymarket order-book snapshots from the sample collector.
 Checkpoint 5 scope: derive a bounded normalized gzip JSONL sample from the raw
 Checkpoint 4 sample. Raw files remain the source of truth; every normalized row
 keeps the raw file path and gzip JSONL line number.
 """
 from __future__ import annotations
 import argparse
 import datetime as dt
 import gzip
 import hashlib
 import json
 import sys
 from decimal import Decimal, InvalidOperation, getcontext
 from pathlib import Path
 from typing import Any
 # Tool identity. NOTE(review): presumably recorded in the normalization
 # manifest - the use is outside this excerpt.
 NORMALIZER_NAME = "polymarket_orderbook_normalizer"
 NORMALIZER_VERSION = "0.1.0"
 # Schema identity stamped on every normalized row.
 SCHEMA_NAME = "normalized_orderbook_snapshot"
 SCHEMA_VERSION = 1
 # Default locations used when the matching command-line option is omitted.
 DEFAULT_INPUT_MANIFEST = Path("data/manifests/orderbook_collector_sample_manifest.json")
 DEFAULT_OUTPUT_DIR = Path("data/normalized_sample")
 DEFAULT_MANIFEST_PATH = Path("data/manifests/orderbook_normalization_sample_manifest.json")
 # Price distances from the best bid/ask for the "depth within" fields; the key
 # becomes the field-name suffix.
 CENT_OFFSETS = {
    "1c": Decimal("0.01"),
    "2c": Decimal("0.02"),
    "5c": Decimal("0.05"),
 }
 # Terms searched for (case-insensitively) by the secret scan. Each literal is
 # written as adjacent fragments that Python joins at compile time - presumably
 # so that this source file, which is itself scanned, never contains a term
 # verbatim.
 SECRET_PATTERNS = (
    "set-" "coo" "kie",
    "__cf" "_bm",
    "cf" "_bm",
    "author" "ization",
    "private" "_key",
    "api" "_secret",
    "poly" "_signature",
    "poly" "_passphrase",
    "poly" "_address",
    "bear" "er",
    "coo" "kie",
    "wallet" " material",
 )
 # Decimal arithmetic in this module uses 50 significant digits.
 getcontext().prec = 50
 def utc_now() -> dt.datetime:
    return dt.datetime.now(dt.UTC)
 def iso_z(value: dt.datetime | None = None) -> str:
    value = value or utc_now()
    return value.astimezone(dt.UTC).replace(microsecond=0).isoformat().replace("+00:00", "Z")
 def compact_timestamp(value: dt.datetime | None = None) -> str:
    value = value or utc_now()
    return value.astimezone(dt.UTC).strftime("%Y%m%dT%H%M%SZ")
 def sha256_file(path: Path) -> str:
    """Return the hexadecimal SHA-256 digest of a file, read in 1 MiB blocks."""
    hasher = hashlib.sha256()
    with path.open("rb") as stream:
        while True:
            block = stream.read(1024 * 1024)
            if not block:
                break
            hasher.update(block)
    return hasher.hexdigest()
 def decimal_from_raw(value: Any, field_name: str) -> Decimal:
    """Parse a raw string field into a finite Decimal.

    Args:
        value: Raw field value; it must be a string.
        field_name: Field label used as the prefix of error messages.

    Raises:
        ValueError: if the value is not a string, is not decimal text, or is
            NaN or infinite.
    """
    if isinstance(value, str):
        try:
            parsed = Decimal(value)
        except InvalidOperation as exc:
            raise ValueError(f"{field_name} is not a decimal: {value!r}") from exc
        if parsed.is_finite():
            return parsed
        raise ValueError(f"{field_name} is not finite: {value!r}")
    raise ValueError(f"{field_name} is not a string: {value!r}")
 def decimal_to_json(value: Decimal | None) -> str | None:
    """Render a Decimal as plain fixed-point text for JSON output.

    None passes through unchanged, any zero becomes "0", and other values are
    normalized (trailing zeros removed) and written without an exponent.
    """
    if value is None:
        return None
    return "0" if value == 0 else format(value.normalize(), "f")
 def load_json(path: Path) -> dict[str, Any]:
    """Read a UTF-8 JSON file and return its top-level object.

    Raises:
        ValueError: if the document is not a JSON object.
    """
    document = json.loads(path.read_text(encoding="utf-8"))
    if isinstance(document, dict):
        return document
    raise ValueError(f"{path} did not contain a JSON object")
 def resolve_repo_path(path_text: str) -> Path:
    """Return an absolute path, anchoring relative text at the working directory."""
    candidate = Path(path_text)
    return candidate if candidate.is_absolute() else Path.cwd() / candidate
 def normalize_side(levels: Any, side_name: str) -> list[tuple[Decimal, Decimal]]:
    """Validate one side of a raw order book and return (price, size) pairs.

    Args:
        levels: Raw list of level objects with string "price" and "size".
        side_name: "bids" or "asks"; used only in error messages.

    Raises:
        ValueError: if levels is not a list, a level is not an object, a price
            or size fails decimal parsing, or a size is negative.
    """
    if not isinstance(levels, list):
        raise ValueError(f"raw.{side_name} is not a list")
    pairs: list[tuple[Decimal, Decimal]] = []
    for position, entry in enumerate(levels):
        where = f"raw.{side_name}[{position}]"
        if not isinstance(entry, dict):
            raise ValueError(f"{where} is not an object")
        price = decimal_from_raw(entry.get("price"), f"{where}.price")
        size = decimal_from_raw(entry.get("size"), f"{where}.size")
        if size < 0:
            raise ValueError(f"{where}.size is negative")
        pairs.append((price, size))
    return pairs
 def sum_sizes(levels: list[tuple[Decimal, Decimal]]) -> Decimal:
    """Return the total size across (price, size) levels; zero for an empty list."""
    total = Decimal("0")
    for _price, size in levels:
        total = total + size
    return total
 def normalize_raw_row(raw_row: dict[str, Any], raw_file: str, raw_line_number: int) -> dict[str, Any]:
    """Build one normalized snapshot row from a raw collector row.

    The raw row must hold "raw" (the order book), "market" and "collection"
    objects. Best bid/ask, spread, midpoint, total depth, and depth within each
    CENT_OFFSETS distance of the best price are derived and stored as decimal
    strings. raw_file and raw_line_number are carried through as provenance.

    Raises:
        ValueError: if a required section is not an object or a book side
            fails validation.
    """
    raw_book = raw_row.get("raw")
    market = raw_row.get("market")
    collection = raw_row.get("collection")
    for section_name, section in (("raw", raw_book), ("market", market), ("collection", collection)):
        if not isinstance(section, dict):
            raise ValueError(f"{section_name} is not an object")
    bids = normalize_side(raw_book.get("bids"), "bids")
    asks = normalize_side(raw_book.get("asks"), "asks")
    best_bid = max((price for price, _ in bids), default=None)
    best_ask = min((price for price, _ in asks), default=None)
    # Spread and midpoint exist only when both sides have at least one level.
    if best_bid is None or best_ask is None:
        spread = None
        midpoint = None
    else:
        spread = best_ask - best_bid
        midpoint = (best_bid + best_ask) / Decimal("2")
    bid_depth_total = sum_sizes(bids)
    ask_depth_total = sum_sizes(asks)
    row: dict[str, Any] = {
        "schema_name": SCHEMA_NAME,
        "schema_version": SCHEMA_VERSION,
        "market_name": market.get("market_name"),
        "market_slug": market.get("market_slug"),
        "condition_id": market.get("condition_id"),
        "token_id": market.get("token_id"),
        "outcome": market.get("outcome"),
        "collected_at_utc": collection.get("collected_at_utc"),
        "best_bid": decimal_to_json(best_bid),
        "best_ask": decimal_to_json(best_ask),
        "spread": decimal_to_json(spread),
        "midpoint": decimal_to_json(midpoint),
        "bid_depth_total": decimal_to_json(bid_depth_total),
        "ask_depth_total": decimal_to_json(ask_depth_total),
        "raw_file": raw_file,
        "raw_line_number": raw_line_number,
    }
    for label, offset in CENT_OFFSETS.items():
        # An empty side contributes zero depth at every offset.
        if best_bid is None:
            near_bids = []
        else:
            floor = best_bid - offset
            near_bids = [size for price, size in bids if price >= floor]
        if best_ask is None:
            near_asks = []
        else:
            ceiling = best_ask + offset
            near_asks = [size for price, size in asks if price <= ceiling]
        row[f"bid_depth_within_{label}"] = decimal_to_json(sum(near_bids, Decimal("0")))
        row[f"ask_depth_within_{label}"] = decimal_to_json(sum(near_asks, Decimal("0")))
    return row
 def summarize_output(path: Path, rows: int) -> dict[str, Any]:
    """Describe a written output file for the manifest.

    Args:
        path: Output file; it must exist.
        rows: Number of rows written to it.

    Returns:
        A dict with the display path, row count, size in bytes, SHA-256 digest
        and a "valid" status. An absolute path is shown relative to the working
        directory when it lies beneath it, otherwise it is kept absolute.
    """
    display_path = path
    if path.is_absolute():
        try:
            display_path = path.relative_to(Path.cwd())
        except ValueError:
            # The output lives outside the working directory (for example an
            # absolute --output-dir); report the absolute path instead of failing.
            display_path = path
    return {
        "path": str(display_path),
        "rows": rows,
        "bytes": path.stat().st_size,
        "sha256": sha256_file(path),
        "status": "valid",
    }
 def build_input_file_summary(manifest: dict[str, Any]) -> list[dict[str, Any]]:
    """Check the raw files listed in a collector manifest and summarize each one.

    Every "output_files" entry is resolved against the working directory,
    hashed, and compared with the digest recorded in the manifest.

    Raises:
        ValueError: if the manifest lists no files or an entry is malformed.
        FileNotFoundError: if a listed file does not exist.
    """
    files = manifest.get("output_files")
    if not (isinstance(files, list) and files):
        raise ValueError("input manifest has no output_files")

    def describe(file_entry: Any) -> dict[str, Any]:
        if not isinstance(file_entry, dict):
            raise ValueError("input manifest output_files entry is not an object")
        path_text = file_entry.get("path")
        if not (isinstance(path_text, str) and path_text):
            raise ValueError("input manifest output_files entry lacks path")
        path = resolve_repo_path(path_text)
        if not path.exists():
            raise FileNotFoundError(path)
        actual_sha = sha256_file(path)
        expected_sha = file_entry.get("sha256")
        matches = expected_sha == actual_sha
        return {
            "path": path_text,
            "rows_expected": file_entry.get("rows"),
            "bytes": path.stat().st_size,
            "sha256": actual_sha,
            "input_manifest_sha256": expected_sha,
            "checksum_match": matches,
            "status": "valid" if matches else "invalid",
        }

    return [describe(entry) for entry in files]
 def read_and_normalize(
    input_files: list[dict[str, Any]],
    output_path: Path,
 ) -> tuple[int, int, list[dict[str, Any]], dict[str, Any]]:
    """Normalize every raw row of the input files into one gzip JSONL output.

    Args:
        input_files: Entries with a "path" key naming a raw gzip JSONL file.
        output_path: Destination gzip JSONL file; parent directories are created.

    Returns:
        (raw_rows_read, normalized_rows_written, errors, sanity). errors holds
        one record per failed file or row. sanity holds the consistency flags
        gathered while processing; "gzip_jsonl_parseable" turns False when a
        raw file cannot be decompressed/decoded or a line is not valid JSON.
    """
    raw_rows_read = 0
    normalized_rows_written = 0
    errors: list[dict[str, Any]] = []
    sanity = {
        "raw_file_refs_present": True,
        "raw_files_exist": True,
        "spread_non_negative": True,
        "midpoint_between_bid_ask": True,
        "depth_totals_non_negative": True,
        "outcomes_seen": [],
        "gzip_jsonl_parseable": True,
        "row_count_match": None,
    }
    outcomes_seen: set[str] = set()
    depth_fields = (
        "bid_depth_total",
        "ask_depth_total",
        "bid_depth_within_1c",
        "ask_depth_within_1c",
        "bid_depth_within_2c",
        "ask_depth_within_2c",
        "bid_depth_within_5c",
        "ask_depth_within_5c",
    )
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with gzip.open(output_path, "wt", encoding="utf-8", compresslevel=9) as output:
        for file_entry in input_files:
            raw_file = file_entry["path"]
            raw_path = resolve_repo_path(raw_file)
            if not raw_path.exists():
                sanity["raw_files_exist"] = False
                errors.append({"raw_file": raw_file, "error": "raw file missing"})
                continue
            try:
                with gzip.open(raw_path, "rt", encoding="utf-8") as raw_handle:
                    for raw_line_number, line in enumerate(raw_handle, 1):
                        raw_rows_read += 1
                        try:
                            raw_row = json.loads(line)
                        except Exception as exc:  # noqa: BLE001 - preserve row-level failure evidence.
                            # The line is not JSON: record it and flag the input as unparseable.
                            sanity["gzip_jsonl_parseable"] = False
                            errors.append(
                                {
                                    "raw_file": raw_file,
                                    "raw_line_number": raw_line_number,
                                    "error": str(exc),
                                }
                            )
                            continue
                        try:
                            normalized = normalize_raw_row(raw_row, raw_file, raw_line_number)
                            output.write(json.dumps(normalized, sort_keys=True, separators=(",", ":")) + "\n")
                            normalized_rows_written += 1
                            if not normalized.get("raw_file") or not normalized.get("raw_line_number"):
                                sanity["raw_file_refs_present"] = False
                            if not resolve_repo_path(str(normalized["raw_file"])).exists():
                                sanity["raw_files_exist"] = False
                            outcome = normalized.get("outcome")
                            if isinstance(outcome, str):
                                outcomes_seen.add(outcome)
                            best_bid = Decimal(normalized["best_bid"]) if normalized["best_bid"] is not None else None
                            best_ask = Decimal(normalized["best_ask"]) if normalized["best_ask"] is not None else None
                            spread = Decimal(normalized["spread"]) if normalized["spread"] is not None else None
                            midpoint = Decimal(normalized["midpoint"]) if normalized["midpoint"] is not None else None
                            if best_bid is not None and best_ask is not None:
                                if spread is None or spread < 0:
                                    sanity["spread_non_negative"] = False
                                if midpoint is None or midpoint < best_bid or midpoint > best_ask:
                                    sanity["midpoint_between_bid_ask"] = False
                            for field in depth_fields:
                                if Decimal(normalized[field]) < 0:
                                    sanity["depth_totals_non_negative"] = False
                        except Exception as exc:  # noqa: BLE001 - preserve row-level failure evidence.
                            errors.append(
                                {
                                    "raw_file": raw_file,
                                    "raw_line_number": raw_line_number,
                                    "error": str(exc),
                                }
                            )
            except Exception as exc:  # noqa: BLE001 - a corrupt raw file belongs in the evidence.
                # Opening or decompressing/decoding the raw file failed (for
                # example a truncated or corrupt gzip stream). Record it and
                # continue with the next file instead of aborting the run.
                sanity["gzip_jsonl_parseable"] = False
                errors.append({"raw_file": raw_file, "error": f"raw file unreadable: {exc!r}"})
    sanity["outcomes_seen"] = sorted(outcomes_seen)
    sanity["has_up_and_down"] = {"Up", "Down"}.issubset(outcomes_seen)
    sanity["row_count_match"] = raw_rows_read == normalized_rows_written + len(errors)
    return raw_rows_read, normalized_rows_written, errors, sanity
 def validate_output_gzip_jsonl(path: Path) -> tuple[bool, int, list[str]]:
    """Check that a gzip JSONL file decompresses and that every line is JSON.

    Returns:
        (ok, parsed_rows, errors): ok is True when no error occurred,
        parsed_rows counts the lines parsed before any failure, and errors
        holds the message of the failure that stopped the scan, if any.
    """
    problems: list[str] = []
    parsed_rows = 0
    try:
        with gzip.open(path, "rt", encoding="utf-8") as handle:
            for line in handle:
                json.loads(line)
                parsed_rows += 1
    except Exception as exc:  # noqa: BLE001 - validation result belongs in manifest.
        problems.append(str(exc))
    return not problems, parsed_rows, problems
 def scan_for_secret_terms(paths: list[Path]) -> dict[str, Any]:
    """Search files line by line for any SECRET_PATTERNS term, case-insensitively.

    Files ending in ".gz" are read through gzip; missing files are skipped. At
    most one match is reported per line, identified by the 1-based index of the
    first matching term, so the term itself is never written to the report.

    Returns:
        A dict with "passed" (True when nothing matched), "checked_term_count"
        and "matches" (path, line_number, term_index). An absolute path is
        shown relative to the working directory when it lies beneath it,
        otherwise it is kept absolute.
    """
    matches: list[dict[str, Any]] = []
    lowered_patterns = tuple(pattern.lower() for pattern in SECRET_PATTERNS)
    for path in paths:
        if not path.exists():
            continue
        display_path = path
        if path.is_absolute():
            try:
                display_path = path.relative_to(Path.cwd())
            except ValueError:
                # The file lives outside the working directory (for example
                # this script run from elsewhere); report the absolute path
                # instead of failing while recording a match.
                display_path = path
        opener = gzip.open if path.suffix == ".gz" else open
        with opener(path, "rt", encoding="utf-8", errors="replace") as handle:  # type: ignore[arg-type]
            for line_number, line in enumerate(handle, 1):
                lower = line.lower()
                for pattern_index, pattern in enumerate(lowered_patterns, 1):
                    if pattern in lower:
                        matches.append(
                            {
                                "path": str(display_path),
                                "line_number": line_number,
                                "term_index": pattern_index,
                            }
                        )
                        break
    return {
        "passed": not matches,
        "checked_term_count": len(SECRET_PATTERNS),
        "matches": matches,
    }
 def parse_args(argv: list[str]) -> argparse.Namespace:
    """Parse the normalizer's command-line options.

    Args:
        argv: Argument strings, without the program name.

    Returns:
        Namespace with input_manifest, output_dir and manifest_path as Path
        values, each falling back to its module-level default.
    """
    parser = argparse.ArgumentParser(
        description="Normalize Checkpoint 4 raw Polymarket order-book snapshots.",
    )
    # (flag, default, help text that precedes the default) for each Path option.
    path_options = (
        ("--input-manifest", DEFAULT_INPUT_MANIFEST, "Raw collector manifest path."),
        ("--output-dir", DEFAULT_OUTPUT_DIR, "Normalized sample base directory."),
        ("--manifest-path", DEFAULT_MANIFEST_PATH, "Normalization manifest path."),
    )
    for flag, default, summary in path_options:
        parser.add_argument(
            flag,
            type=Path,
            default=default,
            help=f"{summary} Default: {default}",
        )
    return parser.parse_args(argv)
 def main(argv: list[str]) -> int:
    """Run one normalization pass and write its manifest.

    Loads the raw collector manifest, hands its input files to
    ``read_and_normalize`` to produce one gzip JSONL output under a
    timestamped run directory, validates that output, scans the script and
    the output for secret terms, and writes a manifest describing the run.
    A short JSON summary is also printed to stdout.

    Args:
        argv: Command-line arguments without the program name.

    Returns:
        0 when the computed gate status is "PASS", otherwise 1.
    """
    args = parse_args(argv)
    started = utc_now()
    input_manifest = load_json(args.input_manifest)
    input_files = build_input_file_summary(input_manifest)
    # The run id is derived from the start time and names both the run
    # directory and the output file.
    run_id = compact_timestamp(started)
    output_path = (
        args.output_dir
        / "polymarket"
        / "orderbooks"
        / run_id
        / f"polymarket_orderbooks_normalized_{run_id}.jsonl.gz"
    )
    raw_rows_read, normalized_rows_written, row_errors, sanity = read_and_normalize(input_files, output_path)
    gzip_ok, gzip_rows, gzip_errors = validate_output_gzip_jsonl(output_path)
    output_summary = summarize_output(output_path, normalized_rows_written)
    sanity.update(
        {
            # Row counts only count as equal when no row produced an error.
            "output_row_count_equals_raw_input_row_count": normalized_rows_written == raw_rows_read
            if not row_errors
            else False,
            "gzip_jsonl_decompresses_and_parses": gzip_ok,
            "gzip_jsonl_rows_parsed": gzip_rows,
            "gzip_jsonl_errors": gzip_errors,
            # Re-hash the output and compare with the checksum recorded in the summary.
            "manifest_checksum_matches_output": output_summary["sha256"] == sha256_file(output_path),
            "all_input_file_checksums_match": all(file_entry["checksum_match"] for file_entry in input_files),
        }
    )
    # The scan covers this script itself as well as the generated output.
    secret_scan = scan_for_secret_terms([Path(__file__), output_path])
    sanity["checkpoint5_secret_scan_passed"] = secret_scan["passed"]
    # Every entry must be truthy for the gate to pass.
    gate_checks = [
        normalized_rows_written == raw_rows_read,
        not row_errors,
        sanity["raw_file_refs_present"],
        sanity["raw_files_exist"],
        sanity["spread_non_negative"],
        sanity["midpoint_between_bid_ask"],
        sanity["depth_totals_non_negative"],
        sanity["has_up_and_down"],
        gzip_ok,
        sanity["manifest_checksum_matches_output"],
        secret_scan["passed"],
        all(file_entry["checksum_match"] for file_entry in input_files),
    ]
    # An empty output never passes, even if all individual checks hold.
    gate_status = "PASS" if all(gate_checks) and normalized_rows_written > 0 else "FAIL"
    ended = utc_now()
    manifest = {
        "schema_name": "orderbook_normalization_sample_manifest",
        "schema_version": 1,
        "checkpoint_id": 5,
        "checkpoint_name": "Normalized Snapshot Extract",
        "normalizer": {
            "name": NORMALIZER_NAME,
            "version": NORMALIZER_VERSION,
        },
        "started_at_utc": iso_z(started),
        "ended_at_utc": iso_z(ended),
        "run_duration_seconds": round((ended - started).total_seconds(), 3),
        "command": "scripts/normalize_polymarket_orderbooks.py",
        "input_manifest": {
            "path": str(args.input_manifest),
            "sha256": sha256_file(args.input_manifest),
            "collector_manifest_schema_name": input_manifest.get("schema_name"),
            "collector_gate_status": input_manifest.get("gate_status"),
        },
        "input_files": input_files,
        "output_files": [output_summary],
        "raw_rows_read": raw_rows_read,
        "normalized_rows_written": normalized_rows_written,
        "skipped_rows": len(row_errors),
        "error_rows": row_errors,
        "numeric_encoding": "Exact decimal values are emitted as JSON strings; missing price-derived values are null.",
        "sanity_checks": sanity,
        "secret_scan": secret_scan,
        "warnings": [],
        "known_gaps": [
            "This is a derived sample extract only; raw gzip JSONL remains the source of truth.",
            "No upload, daemon runtime, systemd unit, dashboard, database, strategy, backtest, or trading behavior is included.",
            "The sample proves normalization logic on one bounded raw run, not long-run schema stability.",
        ],
        "fake_progress_risk": "A clean normalized sample can hide raw collection gaps and endpoint schema drift; every row is therefore traceable to raw_file and raw_line_number, and reliability remains gated on later soak testing.",
        "next_step": "Checkpoint 6 should package the raw collector for a VPS runtime, or the orchestrator can request review of this normalized sample first.",
        "gate_status": gate_status,
    }
    args.manifest_path.parent.mkdir(parents=True, exist_ok=True)
    args.manifest_path.write_text(json.dumps(manifest, indent=2, sort_keys=True) + "\n", encoding="utf-8")
    # Compact machine-readable summary for whoever invoked the script.
    print(
        json.dumps(
            {
                "gate_status": gate_status,
                "manifest_path": str(args.manifest_path),
                "output_path": str(output_path),
                "raw_rows_read": raw_rows_read,
                "normalized_rows_written": normalized_rows_written,
                "skipped_rows": len(row_errors),
                "sha256": output_summary["sha256"],
            },
            indent=2,
            sort_keys=True,
        )
    )
    return 0 if gate_status == "PASS" else 1
 # Script entry point: main()'s return value becomes the process exit status.
 if __name__ == "__main__":
    raise SystemExit(main(sys.argv[1:]))
--- a/scripts/probe_polymarket_public_sources.py
+++ b/scripts/probe_polymarket_public_sources.py
--- a/scripts/run_polymarket_24h_soak.sh
+++ b/scripts/run_polymarket_24h_soak.sh
@ -0,0 +1,362 @@
 #!/usr/bin/env bash
 # No `-e`: command failures are captured and handled explicitly below.
 set -uo pipefail
 # --- Configuration: every value can be overridden through an ORDERBOOKS_* env var.
 APP_DIR="${ORDERBOOKS_APP_DIR:-$(pwd)}"
 PYTHON_BIN="${ORDERBOOKS_PYTHON:-python3}"
 RCLONE_BIN="${ORDERBOOKS_RCLONE_BIN:-/usr/bin/rclone}"
 RCLONE_DEST_BASE="${ORDERBOOKS_RCLONE_DEST:-gdrive:orderbooks/polymarket/soak-test}"
 SOAK_DATE="${ORDERBOOKS_SOAK_DATE:-$(date -u +%F)}"
 SOAK_ID="${ORDERBOOKS_SOAK_ID:-soak_test_${SOAK_DATE}}"
 # Total soak duration (default 86400 s = 24 h) and the length of one collect/upload cycle.
 SOAK_SECONDS="${ORDERBOOKS_SOAK_SECONDS:-86400}"
 CYCLE_SECONDS="${ORDERBOOKS_SOAK_CYCLE_SECONDS:-300}"
 # Values below are passed straight through to the collector and discovery scripts.
 INTERVAL_SECONDS="${ORDERBOOKS_SOAK_INTERVAL_SECONDS:-30}"
 MARKET_LIMIT="${ORDERBOOKS_SOAK_MARKET_LIMIT:-2}"
 MARKET_END_SAFETY_SECONDS="${ORDERBOOKS_SOAK_MARKET_END_SAFETY_SECONDS:-420}"
 REQUEST_TIMEOUT_SECONDS="${ORDERBOOKS_SOAK_REQUEST_TIMEOUT_SECONDS:-15}"
 MAX_RETRIES="${ORDERBOOKS_SOAK_MAX_RETRIES:-2}"
 BACKOFF_SECONDS="${ORDERBOOKS_SOAK_BACKOFF_SECONDS:-2}"
 DISCOVERY_LIMIT="${ORDERBOOKS_SOAK_DISCOVERY_LIMIT:-100}"
 DISCOVERY_MAX_PAGES="${ORDERBOOKS_SOAK_DISCOVERY_MAX_PAGES:-3}"
 DISCOVERY_TIMEOUT="${ORDERBOOKS_SOAK_DISCOVERY_TIMEOUT:-15}"
 # --- Local layout (relative paths are resolved against APP_DIR after the cd below).
 LOCAL_ROOT="${ORDERBOOKS_SOAK_LOCAL_ROOT:-data/soak_test/${SOAK_DATE}}"
 MANIFEST_ROOT="${ORDERBOOKS_SOAK_MANIFEST_ROOT:-data/manifests/${SOAK_ID}}"
 START_MANIFEST="${ORDERBOOKS_SOAK_START_MANIFEST:-data/manifests/${SOAK_ID}_start.json}"
 FINAL_MANIFEST="${ORDERBOOKS_SOAK_FINAL_MANIFEST:-data/manifests/${SOAK_ID}_final.json}"
 DISCOVERY_DIR="${LOCAL_ROOT}/discovery"
 LIVE_DIR="${LOCAL_ROOT}/live_sample"
 LOG_DIR="${LOCAL_ROOT}/logs"
 PID_FILE="${LOCAL_ROOT}/soak.pid"
 CYCLES_JSONL="${MANIFEST_ROOT}/cycles.jsonl"
 LOG_FILE="${LOG_DIR}/soak.log"
 # Remote destination is the base with any trailing slash removed, plus the soak date.
 REMOTE_DEST="${RCLONE_DEST_BASE%/}/${SOAK_DATE}"
 # --- Mutable controller state shared between the main loop and the signal/exit handlers.
 STOP_REQUESTED=0
 STOP_SIGNAL=""
 CURRENT_CHILD_PID=""
 CURRENT_PHASE="initializing"
 CURRENT_CYCLE_ID=""
 START_WRITTEN=0
 FINAL_WRITTEN=0
 cd "${APP_DIR}" || exit 2
 mkdir -p "${DISCOVERY_DIR}" "${LIVE_DIR}" "${LOG_DIR}" "${MANIFEST_ROOT}" "$(dirname "${START_MANIFEST}")" "$(dirname "${FINAL_MANIFEST}")"
 STARTED_AT="$(date -u +%Y-%m-%dT%H:%M:%SZ)"
 START_EPOCH="$(date -u +%s)"
 END_EPOCH="$((START_EPOCH + SOAK_SECONDS))"
 # Format the end time with `date -d`; if that fails (e.g. a date without -d support),
 # fall back to python3 to produce the same ISO-8601 "Z" timestamp.
 EXPECTED_COMPLETION_AT="$(date -u -d "@${END_EPOCH}" +%Y-%m-%dT%H:%M:%SZ 2>/dev/null || python3 - <<PY
 import datetime as dt
 print(dt.datetime.fromtimestamp(${END_EPOCH}, dt.UTC).replace(microsecond=0).isoformat().replace("+00:00", "Z"))
 PY
 )"
 # Append one UTC-timestamped line to the log file only (nothing on stdout).
 # Always succeeds, so it is safe to call from a signal handler.
 safe_log() {
  local stamp
  stamp="$(date -u +%Y-%m-%dT%H:%M:%SZ)"
  printf '%s %s\n' "${stamp}" "$*" >> "${LOG_FILE}" 2>/dev/null || true
 }
 # Print one UTC-timestamped line to stdout and append the same line to the log file.
 log() {
  local stamp
  stamp="$(date -u +%Y-%m-%dT%H:%M:%SZ)"
  printf '%s %s\n' "${stamp}" "$*" | tee -a "${LOG_FILE}"
 }
 # Signal handler: record the stop request and pass the same signal on to the
 # currently running child process, if there is one.
 #   $1 - signal name as registered by the traps (SIGINT, SIGTERM or SIGHUP)
 handle_signal() {
  local received="$1"
  STOP_REQUESTED=1
  STOP_SIGNAL="${received}"
  safe_log "SIGNAL received=${received} phase=${CURRENT_PHASE} cycle_id=${CURRENT_CYCLE_ID:-none}"
  # Nothing to forward when no child is tracked or it has already gone away.
  [[ -n "${CURRENT_CHILD_PID}" ]] || return 0
  kill -0 "${CURRENT_CHILD_PID}" 2>/dev/null || return 0
  local forward=""
  case "${received}" in
    SIGINT) forward="-INT" ;;
    SIGTERM) forward="-TERM" ;;
    SIGHUP) forward="-HUP" ;;
  esac
  if [[ -n "${forward}" ]]; then
    kill "${forward}" "${CURRENT_CHILD_PID}" 2>/dev/null || true
  fi
 }
 # Write the soak start marker manifest atomically (temp file, then os.replace)
 # and set START_WRITTEN=1.
 #
 # Shell values reach Python through environment variables and a quoted heredoc
 # instead of being spliced into the Python source, so paths or IDs containing
 # quotes or backslashes can neither break the generated program nor inject code.
 write_start_manifest() {
  local tmp_path="${START_MANIFEST}.tmp"
  MF_STARTED_AT="${STARTED_AT}" \
  MF_EXPECTED_COMPLETION_AT="${EXPECTED_COMPLETION_AT}" \
  MF_SOAK_SECONDS="${SOAK_SECONDS}" \
  MF_CYCLE_SECONDS="${CYCLE_SECONDS}" \
  MF_PID="$$" \
  MF_PID_FILE="${PID_FILE}" \
  MF_LOG_FILE="${LOG_FILE}" \
  MF_LOCAL_ROOT="${LOCAL_ROOT}" \
  MF_MANIFEST_ROOT="${MANIFEST_ROOT}" \
  MF_REMOTE_DEST="${REMOTE_DEST}" \
  MF_LIVE_DIR="${LIVE_DIR}" \
  MF_DISCOVERY_DIR="${DISCOVERY_DIR}" \
  MF_CYCLES_JSONL="${CYCLES_JSONL}" \
  python3 - "$tmp_path" "$START_MANIFEST" <<'PY'
 import json
 import os
 import pathlib
 import sys
 env = os.environ
 tmp_path = pathlib.Path(sys.argv[1])
 final_path = pathlib.Path(sys.argv[2])
 manifest = {
    "schema_name": "soak_test_start_manifest",
    "schema_version": 1,
    "checkpoint_id": 8,
    "checkpoint_name": "24h Soak Test Plan",
    "status": "STARTED",
    "started_at_utc": env["MF_STARTED_AT"],
    "expected_completion_at_utc": env["MF_EXPECTED_COMPLETION_AT"],
    "soak_seconds": int(env["MF_SOAK_SECONDS"]),
    "cycle_seconds": int(env["MF_CYCLE_SECONDS"]),
    "pid": int(env["MF_PID"]),
    "pid_file": env["MF_PID_FILE"],
    "log_file": env["MF_LOG_FILE"],
    "local_root": env["MF_LOCAL_ROOT"],
    "manifest_root": env["MF_MANIFEST_ROOT"],
    "remote_dest": env["MF_REMOTE_DEST"],
    "raw_output_dir": env["MF_LIVE_DIR"],
    "discovery_dir": env["MF_DISCOVERY_DIR"],
    "cycles_jsonl": env["MF_CYCLES_JSONL"],
    "gate_status": "IN_PROGRESS",
    "production_ready": False,
    "notes": [
        "This is a real 24h soak start marker, not a completion report.",
        "Checkpoint 8 cannot pass until 24 real hours elapse and final metrics are validated.",
    ],
 }
 tmp_path.parent.mkdir(parents=True, exist_ok=True)
 tmp_path.write_text(json.dumps(manifest, indent=2, sort_keys=True) + "\n", encoding="utf-8")
 os.replace(tmp_path, final_path)
 PY
  # Set unconditionally, as before: the caller checks the manifest file itself.
  START_WRITTEN=1
 }
 # Append one cycle record as a single line to the cycles JSONL file.
 #   $1 - the record text (the caller passes a JSON object)
 write_cycle_record() {
  local json_line="$1"
  printf '%s\n' "${json_line}" >> "${CYCLES_JSONL}"
 }
 # Write the final soak manifest atomically (temp file, then os.replace) and
 # set FINAL_WRITTEN=1. Every non-blank line of the cycles JSONL file is
 # embedded in the manifest together with per-status counts.
 #   $1 - final status, $2 - gate status, $3 - exit reason
 #
 # Shell values reach Python through environment variables and a quoted heredoc
 # instead of being spliced into the Python source, so values containing quotes
 # or backslashes can neither break the generated program nor inject code.
 write_final_manifest() {
  local final_status="$1"
  local gate_status="$2"
  local exit_reason="$3"
  local ended_at
  local tmp_path
  ended_at="$(date -u +%Y-%m-%dT%H:%M:%SZ)"
  tmp_path="${FINAL_MANIFEST}.tmp"
  MF_FINAL_STATUS="${final_status}" \
  MF_GATE_STATUS="${gate_status}" \
  MF_EXIT_REASON="${exit_reason}" \
  MF_STARTED_AT="${STARTED_AT}" \
  MF_ENDED_AT="${ended_at}" \
  MF_EXPECTED_COMPLETION_AT="${EXPECTED_COMPLETION_AT}" \
  MF_SOAK_SECONDS="${SOAK_SECONDS}" \
  MF_CYCLE_SECONDS="${CYCLE_SECONDS}" \
  MF_CYCLES_JSONL="${CYCLES_JSONL}" \
  MF_PID="$$" \
  MF_PID_FILE="${PID_FILE}" \
  MF_LOG_FILE="${LOG_FILE}" \
  MF_LOCAL_ROOT="${LOCAL_ROOT}" \
  MF_MANIFEST_ROOT="${MANIFEST_ROOT}" \
  MF_REMOTE_DEST="${REMOTE_DEST}" \
  MF_STOP_REQUESTED="${STOP_REQUESTED}" \
  MF_STOP_SIGNAL="${STOP_SIGNAL}" \
  MF_CURRENT_PHASE="${CURRENT_PHASE}" \
  MF_CURRENT_CYCLE_ID="${CURRENT_CYCLE_ID}" \
  python3 - "$tmp_path" "$FINAL_MANIFEST" <<'PY'
 import json
 import os
 import pathlib
 import sys
 env = os.environ
 tmp_path = pathlib.Path(sys.argv[1])
 final_path = pathlib.Path(sys.argv[2])
 cycles_path = pathlib.Path(env["MF_CYCLES_JSONL"])
 cycles = []
 if cycles_path.exists():
    cycles = [json.loads(line) for line in cycles_path.read_text(encoding="utf-8").splitlines() if line.strip()]
 manifest = {
    "schema_name": "soak_test_final_manifest",
    "schema_version": 1,
    "checkpoint_id": 8,
    "checkpoint_name": "24h Soak Test Plan",
    "status": env["MF_FINAL_STATUS"],
    "gate_status": env["MF_GATE_STATUS"],
    "exit_reason": env["MF_EXIT_REASON"],
    "started_at_utc": env["MF_STARTED_AT"],
    "ended_at_utc": env["MF_ENDED_AT"],
    "expected_completion_at_utc": env["MF_EXPECTED_COMPLETION_AT"],
    "soak_seconds": int(env["MF_SOAK_SECONDS"]),
    "cycle_seconds": int(env["MF_CYCLE_SECONDS"]),
    "cycles": cycles,
    "cycle_count": len(cycles),
    "ok_cycle_count": sum(1 for cycle in cycles if cycle.get("status") == "OK"),
    "error_cycle_count": sum(1 for cycle in cycles if cycle.get("status") == "ERROR"),
    "interrupted_cycle_count": sum(1 for cycle in cycles if cycle.get("status") == "INTERRUPTED"),
    "pid": int(env["MF_PID"]),
    "pid_file": env["MF_PID_FILE"],
    "log_file": env["MF_LOG_FILE"],
    "local_root": env["MF_LOCAL_ROOT"],
    "manifest_root": env["MF_MANIFEST_ROOT"],
    "remote_dest": env["MF_REMOTE_DEST"],
    "stop_requested": bool(int(env["MF_STOP_REQUESTED"])),
    "stop_signal": env["MF_STOP_SIGNAL"],
    "current_phase_at_exit": env["MF_CURRENT_PHASE"],
    "current_cycle_id_at_exit": env["MF_CURRENT_CYCLE_ID"],
    "production_ready": False,
    "notes": [
        "This marker is written by the soak controller on completion, interruption, or error.",
        "Checkpoint 8 cannot be PASS until 24 real hours elapse and final metrics are validated.",
    ],
 }
 tmp_path.parent.mkdir(parents=True, exist_ok=True)
 tmp_path.write_text(json.dumps(manifest, indent=2, sort_keys=True) + "\n", encoding="utf-8")
 os.replace(tmp_path, final_path)
 PY
  # Set unconditionally, as before, so the EXIT handler does not write a second marker.
  FINAL_WRITTEN=1
 }
 # EXIT trap: make sure a final manifest exists whenever a start manifest was
 # written, remove the PID file if it still belongs to this process, and exit
 # with the status the script was already exiting with.
 cleanup_on_exit() {
  # Must be first so $? still holds the script's exit status.
  local exit_status=$?
  local marker_status="ERROR"
  local marker_reason="exited_without_final_marker"
  if [[ "${START_WRITTEN}" -eq 1 && "${FINAL_WRITTEN}" -eq 0 ]]; then
    if [[ "${STOP_REQUESTED}" -eq 1 ]]; then
      marker_status="INTERRUPTED"
      marker_reason="${STOP_SIGNAL:-signal}"
    elif [[ "${exit_status}" -ne 0 ]]; then
      marker_reason="exit_code_${exit_status}"
    fi
    write_final_manifest "${marker_status}" "${marker_status}" "${marker_reason}"
  fi
  # Only remove the PID file when it records our own PID.
  local recorded_pid=""
  if [[ -f "${PID_FILE}" ]]; then
    recorded_pid="$(cat "${PID_FILE}" 2>/dev/null)"
    if [[ "${recorded_pid}" == "$$" ]]; then
      rm -f "${PID_FILE}"
    fi
  fi
  exit "${exit_status}"
 }
 # Run a command in the background with stdout/stderr appended to the log file,
 # wait for it, and return its exit status. The child's PID is published in
 # CURRENT_CHILD_PID while it runs so handle_signal can forward signals to it.
 #   $@ - the command and its arguments
 run_logged() {
  "$@" >> "${LOG_FILE}" 2>&1 &
  CURRENT_CHILD_PID="$!"
  wait "${CURRENT_CHILD_PID}"
  local rc=$?
  # If a stop was requested and the child is still alive, the first wait did
  # not collect the child's real exit status; wait once more for it to finish.
  if [[ "${STOP_REQUESTED}" -eq 1 ]] && kill -0 "${CURRENT_CHILD_PID}" 2>/dev/null; then
    wait "${CURRENT_CHILD_PID}"
    rc=$?
  fi
  CURRENT_CHILD_PID=""
  return "${rc}"
 }
 # Signals only record a stop request (and are forwarded to the child); the
 # EXIT trap guarantees a final manifest and PID-file cleanup.
 trap 'handle_signal SIGINT' INT
 trap 'handle_signal SIGTERM' TERM
 trap 'handle_signal SIGHUP' HUP
 trap cleanup_on_exit EXIT
 echo "$$" > "${PID_FILE}"
 write_start_manifest
 # Abort if the start manifest is missing or empty.
 test -s "${START_MANIFEST}" || exit 3
 log "START soak_id=${SOAK_ID} pid=$$ expected_completion=${EXPECTED_COMPLETION_AT}"
 cycle_index=0
 error_seen=0
 # Main loop: one discovery -> collector -> upload cycle per iteration until the
 # soak window has elapsed or a stop signal was received.
 while true; do
  now_epoch="$(date -u +%s)"
  remaining="$((END_EPOCH - now_epoch))"
  if [[ "${remaining}" -le 0 ]]; then
    break
  fi
  if [[ "${STOP_REQUESTED}" -eq 1 ]]; then
    break
  fi
  # Do not start a cycle when fewer than 30 seconds of the window remain.
  if [[ "${remaining}" -lt 30 ]]; then
    log "SKIP final tiny remaining window seconds=${remaining}"
    break
  fi
  cycle_index="$((cycle_index + 1))"
  cycle_id="$(date -u +%Y%m%dT%H%M%SZ)"
  CURRENT_CYCLE_ID="${cycle_id}"
  # The last cycle is shortened so it does not run past the end of the window.
  run_seconds="${CYCLE_SECONDS}"
  if [[ "${remaining}" -lt "${run_seconds}" ]]; then
    run_seconds="${remaining}"
  fi
  discovery_json="${DISCOVERY_DIR}/polymarket_btc_markets_${cycle_id}.json"
  discovery_manifest="${DISCOVERY_DIR}/polymarket_btc_markets_manifest_${cycle_id}.json"
  discovery_markdown="${DISCOVERY_DIR}/polymarket_btc_markets_${cycle_id}.md"
  collector_manifest="${MANIFEST_ROOT}/collector_${cycle_id}.json"
  upload_manifest="${MANIFEST_ROOT}/upload_${cycle_id}.json"
  cycle_started_at="$(date -u +%Y-%m-%dT%H:%M:%SZ)"
  log "CYCLE ${cycle_index} start id=${cycle_id} run_seconds=${run_seconds}"
  # Phase 1: market discovery.
  discovery_exit=0
  CURRENT_PHASE="discovery"
  run_logged "${PYTHON_BIN}" scripts/discover_polymarket_btc_markets.py \
    --output-json "${discovery_json}" \
    --manifest "${discovery_manifest}" \
    --markdown "${discovery_markdown}" \
    --limit "${DISCOVERY_LIMIT}" \
    --max-pages "${DISCOVERY_MAX_PAGES}" \
    --timeout "${DISCOVERY_TIMEOUT}" || discovery_exit=$?
  # Phase 2: collector. Synthetic exit codes mark phases that never ran:
  # 98 = skipped because a stop was requested, 99 = skipped because the
  # previous phase failed.
  collector_exit=0
  if [[ "${STOP_REQUESTED}" -eq 1 ]]; then
    collector_exit=98
  elif [[ "${discovery_exit}" -eq 0 ]]; then
    CURRENT_PHASE="collector"
    run_logged "${PYTHON_BIN}" scripts/collect_polymarket_orderbooks.py \
      --config config/polymarket_collector.vps.example.yaml \
      --discovery-path "${discovery_json}" \
      --output-dir "${LIVE_DIR}" \
      --manifest-path "${collector_manifest}" \
      --market-limit "${MARKET_LIMIT}" \
      --interval-seconds "${INTERVAL_SECONDS}" \
      --duration-seconds "${run_seconds}" \
      --request-timeout-seconds "${REQUEST_TIMEOUT_SECONDS}" \
      --max-retries "${MAX_RETRIES}" \
      --backoff-seconds "${BACKOFF_SECONDS}" \
      --market-end-safety-seconds "${MARKET_END_SAFETY_SECONDS}" || collector_exit=$?
  else
    collector_exit=99
  fi
  # Phase 3: upload, using the same 98/99 convention as the collector phase.
  upload_exit=0
  if [[ "${STOP_REQUESTED}" -eq 1 ]]; then
    upload_exit=98
  elif [[ "${collector_exit}" -eq 0 ]]; then
    CURRENT_PHASE="upload"
    run_logged scripts/upload_archive_rclone.sh \
      --execute \
      --data-dir "${LOCAL_ROOT}" \
      --raw-dir "${LIVE_DIR}" \
      --source-manifest-dir "${MANIFEST_ROOT}" \
      --manifest-dir "${MANIFEST_ROOT}" \
      --manifest-path "${upload_manifest}" \
      --dest "${REMOTE_DEST}" \
      --min-age-seconds 0 \
      --rclone-bin "${RCLONE_BIN}" || upload_exit=$?
  else
    upload_exit=99
  fi
  cycle_ended_at="$(date -u +%Y-%m-%dT%H:%M:%SZ)"
  # A stop request takes precedence over the individual phase results.
  if [[ "${STOP_REQUESTED}" -eq 1 ]]; then
    cycle_status="INTERRUPTED"
  elif [[ "${discovery_exit}" -eq 0 && "${collector_exit}" -eq 0 && "${upload_exit}" -eq 0 ]]; then
    cycle_status="OK"
  else
    cycle_status="ERROR"
    error_seen=1
  fi
  # Build this cycle's JSON record and append it to the cycles JSONL file.
  record="$(python3 - <<PY
 import json
 print(json.dumps({
    "cycle_index": ${cycle_index},
    "cycle_id": "${cycle_id}",
    "started_at_utc": "${cycle_started_at}",
    "ended_at_utc": "${cycle_ended_at}",
    "run_seconds": int("${run_seconds}"),
    "discovery_manifest": "${discovery_manifest}",
    "collector_manifest": "${collector_manifest}",
    "upload_manifest": "${upload_manifest}",
    "discovery_exit": int("${discovery_exit}"),
    "collector_exit": int("${collector_exit}"),
    "upload_exit": int("${upload_exit}"),
    "status": "${cycle_status}",
    "stop_signal": "${STOP_SIGNAL}",
 }, sort_keys=True))
 PY
 )"
  write_cycle_record "${record}"
  log "CYCLE ${cycle_index} end id=${cycle_id} status=${cycle_status} discovery_exit=${discovery_exit} collector_exit=${collector_exit} upload_exit=${upload_exit}"
  CURRENT_PHASE="sleep"
  CURRENT_CYCLE_ID=""
  if [[ "${STOP_REQUESTED}" -eq 1 ]]; then
    break
  fi
  # Short pause between cycles; the sleep is tracked as the current child so
  # handle_signal can signal it.
  sleep 5 &
  CURRENT_CHILD_PID="$!"
  wait "${CURRENT_CHILD_PID}" || true
  CURRENT_CHILD_PID=""
 done
 CURRENT_PHASE="finalizing"
 CURRENT_CYCLE_ID=""
 # Final marker: interruption wins over cycle errors; a clean run still needs review.
 if [[ "${STOP_REQUESTED}" -eq 1 ]]; then
  write_final_manifest "INTERRUPTED" "INTERRUPTED" "${STOP_SIGNAL:-signal}"
 elif [[ "${error_seen}" -eq 1 ]]; then
  write_final_manifest "ERROR" "ERROR" "cycle_error"
 else
  write_final_manifest "COMPLETED_NEEDS_REVIEW" "NEEDS_REVIEW" "elapsed"
 fi
 log "END soak_id=${SOAK_ID} final_manifest=${FINAL_MANIFEST} status_written=1"
--- a/scripts/run_polymarket_collector_cycle.sh
+++ b/scripts/run_polymarket_collector_cycle.sh
@ -0,0 +1,39 @@
 #!/usr/bin/env bash
 # Runs one collector cycle: market discovery followed by the order-book
 # collector. With `set -e`, a failed discovery aborts before the collector starts.
 set -euo pipefail
 # Every path and limit below can be overridden through an ORDERBOOKS_* env var.
 APP_DIR="${ORDERBOOKS_APP_DIR:-/opt/orderbooks}"
 PYTHON_BIN="${ORDERBOOKS_PYTHON:-${APP_DIR}/.venv/bin/python}"
 DATA_DIR="${ORDERBOOKS_DATA_DIR:-/var/lib/orderbooks}"
 COLLECTOR_CONFIG="${ORDERBOOKS_COLLECTOR_CONFIG:-/etc/orderbooks/polymarket_collector.vps.yaml}"
 DISCOVERY_DIR="${ORDERBOOKS_DISCOVERY_DIR:-${DATA_DIR}/discovery}"
 OUTPUT_DIR="${ORDERBOOKS_OUTPUT_DIR:-${DATA_DIR}/raw_orderbooks}"
 MANIFEST_DIR="${ORDERBOOKS_MANIFEST_DIR:-${DATA_DIR}/manifests}"
 # Discovery outputs use fixed file names, so each cycle overwrites the previous ones.
 DISCOVERY_JSON="${ORDERBOOKS_DISCOVERY_JSON:-${DISCOVERY_DIR}/polymarket_btc_markets_latest.json}"
 DISCOVERY_MANIFEST="${ORDERBOOKS_DISCOVERY_MANIFEST:-${DISCOVERY_DIR}/polymarket_btc_markets_manifest.json}"
 DISCOVERY_MARKDOWN="${ORDERBOOKS_DISCOVERY_MARKDOWN:-${DISCOVERY_DIR}/polymarket_btc_markets.md}"
 DISCOVERY_LIMIT="${ORDERBOOKS_DISCOVERY_LIMIT:-100}"
 DISCOVERY_MAX_PAGES="${ORDERBOOKS_DISCOVERY_MAX_PAGES:-3}"
 DISCOVERY_TIMEOUT="${ORDERBOOKS_DISCOVERY_TIMEOUT:-15}"
 # The collector manifest name embeds the cycle start time (UTC) unless overridden.
 cycle_id="$(date -u +%Y%m%dT%H%M%SZ)"
 COLLECTOR_MANIFEST="${ORDERBOOKS_COLLECTOR_MANIFEST:-${MANIFEST_DIR}/polymarket_orderbook_collector_${cycle_id}.json}"
 mkdir -p "${DISCOVERY_DIR}" "${OUTPUT_DIR}" "${MANIFEST_DIR}"
 # The script paths below are relative to APP_DIR.
 cd "${APP_DIR}"
 "${PYTHON_BIN}" scripts/discover_polymarket_btc_markets.py \
  --output-json "${DISCOVERY_JSON}" \
  --manifest "${DISCOVERY_MANIFEST}" \
  --markdown "${DISCOVERY_MARKDOWN}" \
  --limit "${DISCOVERY_LIMIT}" \
  --max-pages "${DISCOVERY_MAX_PAGES}" \
  --timeout "${DISCOVERY_TIMEOUT}"
 # exec replaces this shell with the collector, so the collector receives
 # signals sent to this process and its exit status becomes the cycle's.
 exec "${PYTHON_BIN}" scripts/collect_polymarket_orderbooks.py \
  --config "${COLLECTOR_CONFIG}" \
  --discovery-path "${DISCOVERY_JSON}" \
  --output-dir "${OUTPUT_DIR}" \
  --manifest-path "${COLLECTOR_MANIFEST}"
--- a/scripts/run_polymarket_collector_loop.sh
+++ b/scripts/run_polymarket_collector_loop.sh
@ -0,0 +1,90 @@
 #!/usr/bin/env bash
 # Runs collector cycles back to back until a stop signal arrives.
 # No `-e`: a failed cycle is recorded and the loop carries on.
 set -uo pipefail
 APP_DIR="${ORDERBOOKS_APP_DIR:-/app}"
 # Manifest directory defaults to "manifests" under the data directory.
 MANIFEST_DIR="${ORDERBOOKS_MANIFEST_DIR:-${ORDERBOOKS_DATA_DIR:-/var/lib/orderbooks}/manifests}"
 # Pause between cycles, in seconds.
 LOOP_SLEEP_SECONDS="${ORDERBOOKS_LOOP_SLEEP_SECONDS:-15}"
 # State shared between the main loop and the signal handler.
 STOP_REQUESTED=0
 CHILD_PID=""
 # Print the current UTC time in compact form, e.g. 20260418T092328Z.
 utc_compact() {
  local compact_format='+%Y%m%dT%H%M%SZ'
  date -u "${compact_format}"
 }
 # Print the current UTC time as an ISO-8601 timestamp, e.g. 2026-04-18T09:23:28Z.
 utc_iso() {
  local iso_format='+%Y-%m-%dT%H:%M:%SZ'
  date -u "${iso_format}"
 }
 # Write a small JSON event file describing a loop-level event into MANIFEST_DIR.
 # The file name embeds the current UTC time at one-second resolution.
 #   $1 - event status, $2 - numeric exit code, $3 - free-text message
 write_loop_event() {
  local event_status="$1"
  local event_exit_code="$2"
  local event_message="$3"
  local event_path
  event_path="${MANIFEST_DIR%/}/collector_loop_$(utc_compact).json"
  mkdir -p "${MANIFEST_DIR}"
  # All values travel as argv into a quoted heredoc, so no shell text is
  # interpolated into the Python source.
  PYTHONDONTWRITEBYTECODE=1 python3 - "$event_path" "$event_status" "$event_exit_code" "$event_message" <<'PY_LOOP_EVENT'
 import datetime as dt
 import json
 import pathlib
 import sys
 target, status, raw_exit_code, message = sys.argv[1:5]
 written_at = dt.datetime.now(dt.UTC).replace(microsecond=0).isoformat().replace('+00:00', 'Z')
 event = {
    'schema_name': 'collector_loop_event',
    'schema_version': 1,
    'written_at_utc': written_at,
    'status': status,
    'exit_code': int(raw_exit_code),
    'message': message,
 }
 serialized = json.dumps(event, indent=2, sort_keys=True) + '\n'
 pathlib.Path(target).write_text(serialized, encoding='utf-8')
 PY_LOOP_EVENT
 }
 # Signal handler: flag the loop to stop and send SIGTERM to the running cycle, if any.
 request_stop() {
  STOP_REQUESTED=1
  # No child tracked: nothing to terminate.
  if [[ -z "${CHILD_PID}" ]]; then
    return 0
  fi
  if kill -0 "${CHILD_PID}" >/dev/null 2>&1; then
    kill -TERM "${CHILD_PID}" >/dev/null 2>&1 || true
  fi
 }
 # Main loop: run one collector cycle at a time until a stop is requested.
 # Failed cycles are recorded as events and retried after a pause.
 trap request_stop INT TERM
 mkdir -p "${MANIFEST_DIR}"
 cd "${APP_DIR}" || exit 1
 echo "collector loop started at $(utc_iso)"
 while [[ "${STOP_REQUESTED}" -eq 0 ]]; do
  cycle_started="$(utc_iso)"
  echo "collector cycle starting at ${cycle_started}"
  # Run the cycle in the background so the traps can fire while we wait.
  /bin/bash scripts/run_polymarket_collector_cycle.sh &
  CHILD_PID="$!"
  wait "${CHILD_PID}"
  cycle_exit="$?"
  # A trapped signal makes `wait` return early (status > 128) while the child
  # may still be shutting down after the SIGTERM sent by request_stop. Keep
  # waiting until it is really gone so the loop does not exit underneath it
  # and the recorded exit code is the child's own.
  while [[ "${STOP_REQUESTED}" -ne 0 ]] && kill -0 "${CHILD_PID}" >/dev/null 2>&1; do
    wait "${CHILD_PID}"
    cycle_exit="$?"
  done
  CHILD_PID=""
  if [[ "${STOP_REQUESTED}" -ne 0 ]]; then
    write_loop_event "INTERRUPTED" "${cycle_exit}" "collector loop received stop request during or after cycle"
    break
  fi
  if [[ "${cycle_exit}" -ne 0 ]]; then
    write_loop_event "CYCLE_FAILED" "${cycle_exit}" "collector cycle exited nonzero; loop will continue after sleep"
    echo "collector cycle failed with exit ${cycle_exit}; continuing after ${LOOP_SLEEP_SECONDS}s" >&2
  else
    echo "collector cycle completed at $(utc_iso)"
  fi
  # Sleep in one-second steps so a stop request is noticed within about a second.
  for ((i = 0; i < LOOP_SLEEP_SECONDS; i++)); do
    if [[ "${STOP_REQUESTED}" -ne 0 ]]; then
      break
    fi
    sleep 1
  done
 done
 echo "collector loop stopped at $(utc_iso)"
--- a/scripts/upload_archive_rclone.sh
+++ b/scripts/upload_archive_rclone.sh
@ -0,0 +1,462 @@
 #!/usr/bin/env bash
 # No `-e`: rclone exit codes are captured and turned into manifest statuses.
 set -uo pipefail
 SCRIPT_NAME="orderbooks_rclone_uploader"
 SCRIPT_VERSION="0.1.0"
 # Safe defaults: dry-run mode and no local cleanup unless requested via flags.
 MODE="dry-run"
 CLEANUP_AFTER_VERIFY=0
 # Environment-provided defaults; command-line options parsed below override them.
 DATA_DIR="${ORDERBOOKS_UPLOAD_DATA_DIR:-${ORDERBOOKS_DATA_DIR:-/var/lib/orderbooks}}"
 # Left empty here; derived from DATA_DIR after argument parsing when still unset.
 RAW_DIR="${ORDERBOOKS_UPLOAD_RAW_DIR:-}"
 SOURCE_MANIFEST_DIR="${ORDERBOOKS_UPLOAD_SOURCE_MANIFEST_DIR:-}"
 MANIFEST_DIR="${ORDERBOOKS_UPLOAD_MANIFEST_DIR:-}"
 MANIFEST_PATH="${ORDERBOOKS_UPLOAD_MANIFEST_PATH:-}"
 DEST="${ORDERBOOKS_RCLONE_DEST:-}"
 RCLONE_BIN="${ORDERBOOKS_RCLONE_BIN:-rclone}"
 MIN_AGE_SECONDS="${ORDERBOOKS_UPLOAD_MIN_AGE_SECONDS:-600}"
 RETENTION_DAYS="${ORDERBOOKS_UPLOAD_RETENTION_DAYS:-7}"
 # Passed to rclone copy as --transfers / --checkers; settable via environment only.
 TRANSFERS="${ORDERBOOKS_RCLONE_TRANSFERS:-4}"
 CHECKERS="${ORDERBOOKS_RCLONE_CHECKERS:-8}"
 usage() {
  cat <<'EOF'
 Usage: scripts/upload_archive_rclone.sh [options]
 Uploads closed raw collector archive files and manifests with rclone.
 Default mode is dry-run. Real upload requires --execute and a destination.
 Options:
  --dry-run                  Plan and run rclone copy with --dry-run (default).
  --execute                  Run real rclone copy and rclone check.
  --cleanup-after-verify     Delete uploaded local files older than retention only after verification.
  --data-dir DIR             Base data directory. Default: /var/lib/orderbooks.
  --raw-dir DIR              Raw collector output directory. Default: DATA_DIR/raw_orderbooks.
  --source-manifest-dir DIR  Source collector manifest directory. Default: DATA_DIR/manifests.
  --manifest-dir DIR         Upload manifest output directory. Default: DATA_DIR/manifests.
  --manifest-path PATH       Exact upload manifest path.
  --dest REMOTE:PATH         rclone destination. Or set ORDERBOOKS_RCLONE_DEST.
  --min-age-seconds N        Skip files modified within N seconds. Default: 600.
  --retention-days N         Keep at least N days locally. Default: 7.
  --rclone-bin PATH          rclone binary path. Default: rclone.
  --help                     Show this help.
 EOF
 }
 # Fail with a clear usage error when a value-taking option is the last
 # argument. Without this check `set -u` aborts with an opaque
 # "$2: unbound variable" message.
 #   $1 - option name, $2 - number of arguments still to parse (including the option)
 _require_option_value() {
  if [[ "$2" -lt 2 ]]; then
    echo "Missing value for option: $1" >&2
    usage >&2
    exit 2
  fi
 }
 # Parse command-line options; they override the environment-derived defaults.
 while [[ $# -gt 0 ]]; do
  case "$1" in
    --dry-run)
      MODE="dry-run"
      shift
      ;;
    --execute)
      MODE="execute"
      shift
      ;;
    --cleanup-after-verify)
      CLEANUP_AFTER_VERIFY=1
      shift
      ;;
    --data-dir)
      _require_option_value "$1" "$#"
      DATA_DIR="$2"
      shift 2
      ;;
    --raw-dir)
      _require_option_value "$1" "$#"
      RAW_DIR="$2"
      shift 2
      ;;
    --source-manifest-dir)
      _require_option_value "$1" "$#"
      SOURCE_MANIFEST_DIR="$2"
      shift 2
      ;;
    --manifest-dir)
      _require_option_value "$1" "$#"
      MANIFEST_DIR="$2"
      shift 2
      ;;
    --manifest-path)
      _require_option_value "$1" "$#"
      MANIFEST_PATH="$2"
      shift 2
      ;;
    --dest)
      _require_option_value "$1" "$#"
      DEST="$2"
      shift 2
      ;;
    --min-age-seconds)
      _require_option_value "$1" "$#"
      MIN_AGE_SECONDS="$2"
      shift 2
      ;;
    --retention-days)
      _require_option_value "$1" "$#"
      RETENTION_DAYS="$2"
      shift 2
      ;;
    --rclone-bin)
      _require_option_value "$1" "$#"
      RCLONE_BIN="$2"
      shift 2
      ;;
    --help)
      usage
      exit 0
      ;;
    *)
      echo "Unknown argument: $1" >&2
      usage >&2
      exit 2
      ;;
  esac
 done
 # Fill in directories that were given neither by environment nor by option,
 # relative to DATA_DIR (with any trailing slash removed).
 if [[ -z "${RAW_DIR}" ]]; then
  RAW_DIR="${DATA_DIR%/}/raw_orderbooks"
 fi
 if [[ -z "${SOURCE_MANIFEST_DIR}" ]]; then
  SOURCE_MANIFEST_DIR="${DATA_DIR%/}/manifests"
 fi
 if [[ -z "${MANIFEST_DIR}" ]]; then
  MANIFEST_DIR="${DATA_DIR%/}/manifests"
 fi
 STARTED_AT="$(date -u +%Y-%m-%dT%H:%M:%SZ)"
 RUN_ID="$(date -u +%Y%m%dT%H%M%SZ)"
 # Default manifest name embeds the run start time (UTC).
 if [[ -z "${MANIFEST_PATH}" ]]; then
  MANIFEST_PATH="${MANIFEST_DIR%/}/upload_archive_${RUN_ID}.json"
 fi
 # Private scratch directory for the plan, rclone logs, and staged copies;
 # removed again when the script exits.
 # NOTE(review): TMPDIR is also the conventional temp-dir environment variable;
 # if it is already exported, child processes will see this value - confirm intended.
 TMPDIR="$(mktemp -d)"
 trap 'rm -rf "${TMPDIR}"' EXIT
 PLAN_PATH="${TMPDIR}/plan.json"
 RCLONE_COPY_LOG="${TMPDIR}/rclone_copy.log"
 RCLONE_CHECK_LOG="${TMPDIR}/rclone_check.log"
 CLEANUP_PATH="${TMPDIR}/cleanup.json"
 STAGING_DIR="${TMPDIR}/stage"
 mkdir -p "$(dirname "${MANIFEST_PATH}")" "${STAGING_DIR}"
 # Planning step: walk the raw and source-manifest directories, decide which
 # files to upload, copy them into the staging directory, and write the plan
 # (selected files, skipped files, warnings) as JSON to PLAN_PATH.
 python3 - "$DATA_DIR" "$RAW_DIR" "$SOURCE_MANIFEST_DIR" "$MANIFEST_PATH" "$MIN_AGE_SECONDS" "$STAGING_DIR" "$PLAN_PATH" <<'PY'
 import datetime as dt
 import hashlib
 import json
 import os
 import shutil
 import sys
 from pathlib import Path
 data_dir = Path(sys.argv[1])
 raw_dir = Path(sys.argv[2])
 source_manifest_dir = Path(sys.argv[3])
 manifest_path = Path(sys.argv[4]).resolve()
 min_age_seconds = int(sys.argv[5])
 staging_dir = Path(sys.argv[6])
 plan_path = Path(sys.argv[7])
 now = dt.datetime.now(dt.UTC)
 def iso_z_from_ts(ts: float) -> str:
    """Format a POSIX timestamp as a UTC ISO-8601 string ending in 'Z'."""
    return dt.datetime.fromtimestamp(ts, dt.UTC).replace(microsecond=0).isoformat().replace("+00:00", "Z")
 def sha256_file(path: Path) -> str:
    """Return the hex SHA-256 of a file, read in 1 MiB chunks."""
    digest = hashlib.sha256()
    with path.open("rb") as handle:
        for chunk in iter(lambda: handle.read(1024 * 1024), b""):
            digest.update(chunk)
    return digest.hexdigest()
 def rel_for(path: Path) -> str:
    """Return the path relative to data_dir, or just the file name when outside it."""
    resolved = path.resolve()
    try:
        return resolved.relative_to(data_dir.resolve()).as_posix()
    except ValueError:
        return resolved.name
 def iter_files(root: Path):
    """Yield every regular file below root in sorted order."""
    if not root.exists():
        return
    for path in sorted(root.rglob("*")):
        if path.is_file():
            yield path
 selected = []
 skipped = []
 warnings = []
 seen = set()
 # Maps each staged relative path to the local file that claimed it first.
 staged_sources = {}
 for root, kind in [(raw_dir, "raw"), (source_manifest_dir, "manifest")]:
    if not root.exists():
        warnings.append(f"{kind} source directory does not exist: {root}")
        continue
    for path in iter_files(root):
        resolved = path.resolve()
        # The two roots may overlap; handle each physical file once.
        if resolved in seen:
            continue
        seen.add(resolved)
        rel = rel_for(path)
        stat = path.stat()
        age_seconds = max(0, int(now.timestamp() - stat.st_mtime))
        base = {
            "local_path": str(path),
            "relative_path": rel,
            "kind": kind,
            "bytes": stat.st_size,
            "mtime_utc": iso_z_from_ts(stat.st_mtime),
            "age_seconds": age_seconds,
        }
        if resolved == manifest_path:
            skipped.append({**base, "reason": "current_upload_manifest"})
            continue
        if age_seconds < min_age_seconds:
            skipped.append({**base, "reason": "modified_within_min_age_seconds"})
            continue
        # Files outside data_dir are staged under their bare name, so two of
        # them can map to the same staged path. Staging both would silently
        # overwrite the first copy while still reporting it as uploaded, so
        # the later file is skipped and flagged instead.
        if rel in staged_sources:
            warnings.append(
                f"relative path collision for {rel}: {path} conflicts with {staged_sources[rel]}"
            )
            skipped.append({**base, "reason": "relative_path_collision"})
            continue
        staged_sources[rel] = str(path)
        staged_path = staging_dir / rel
        staged_path.parent.mkdir(parents=True, exist_ok=True)
        shutil.copy2(path, staged_path)
        # Hash and size come from the staged copy, i.e. the exact bytes rclone
        # uploads, so the manifest cannot drift from the upload if the source
        # file changes while it is being planned.
        checksum = sha256_file(staged_path)
        selected.append(
            {
                **base,
                "bytes": staged_path.stat().st_size,
                "sha256": checksum,
                "staged_path": str(staged_path),
            }
        )
 plan = {
    "selected_files": selected,
    "skipped_files": skipped,
    "warnings": warnings,
 }
 plan_path.write_text(json.dumps(plan, indent=2, sort_keys=True) + "\n", encoding="utf-8")
 PY
 # Probe for the rclone binary and record the first line of its version output.
 RCLONE_AVAILABLE=0
 RCLONE_VERSION=""
 if command -v "${RCLONE_BIN}" >/dev/null 2>&1; then
  RCLONE_AVAILABLE=1
  RCLONE_VERSION="$("${RCLONE_BIN}" version 2>/dev/null | head -n 1 || true)"
 fi
 DEST_CONFIGURED=0
 if [[ -n "${DEST}" ]]; then
  DEST_CONFIGURED=1
 fi
 # Exit codes stay empty when the corresponding rclone command never ran.
 COPY_EXIT_CODE=""
 CHECK_EXIT_CODE=""
 COPY_ATTEMPTED=0
 CHECK_ATTEMPTED=0
 # Defaults: the gate only becomes PASS after a real upload that also verifies.
 OPERATION_STATUS="PLANNED"
 GATE_STATUS="BLOCKED_REAL_UPLOAD"
 if [[ "${DEST_CONFIGURED}" -eq 0 ]]; then
  OPERATION_STATUS="BLOCKED_DEST_MISSING"
 elif [[ "${RCLONE_AVAILABLE}" -eq 0 ]]; then
  OPERATION_STATUS="BLOCKED_RCLONE_UNAVAILABLE"
 else
  COPY_ATTEMPTED=1
  # Copy the staged tree (not the live data directory) to the destination.
  copy_args=(copy "${STAGING_DIR}/" "${DEST%/}/" --checksum --transfers "${TRANSFERS}" --checkers "${CHECKERS}")
  if [[ "${MODE}" == "dry-run" ]]; then
    copy_args+=(--dry-run)
  fi
  "${RCLONE_BIN}" "${copy_args[@]}" >"${RCLONE_COPY_LOG}" 2>&1
  COPY_EXIT_CODE=$?
  if [[ "${COPY_EXIT_CODE}" -eq 0 && "${MODE}" == "dry-run" ]]; then
    # A successful dry run leaves the gate at BLOCKED_REAL_UPLOAD.
    OPERATION_STATUS="DRY_RUN_PASS"
  elif [[ "${COPY_EXIT_CODE}" -eq 0 ]]; then
    # Real upload succeeded: verify the destination against the staged files.
    CHECK_ATTEMPTED=1
    "${RCLONE_BIN}" check "${STAGING_DIR}/" "${DEST%/}/" --one-way --checksum >"${RCLONE_CHECK_LOG}" 2>&1
    CHECK_EXIT_CODE=$?
    if [[ "${CHECK_EXIT_CODE}" -eq 0 ]]; then
      OPERATION_STATUS="UPLOAD_VERIFIED"
      GATE_STATUS="PASS"
    else
      OPERATION_STATUS="VERIFY_FAILED"
      GATE_STATUS="FAIL"
    fi
  else
    OPERATION_STATUS="COPY_FAILED"
    GATE_STATUS="FAIL"
  fi
 fi
 # Cleanup step: after a verified real upload, and only when requested, delete
 # local files older than the retention window. The outcome for every selected
 # file (retained with a reason, or deleted) is written as JSON to CLEANUP_PATH.
 python3 - "$PLAN_PATH" "$CLEANUP_PATH" "$MODE" "$CLEANUP_AFTER_VERIFY" "$RETENTION_DAYS" "$OPERATION_STATUS" "$GATE_STATUS" <<'PY'
 import datetime as dt
 import json
 import sys
 from pathlib import Path
 plan_path = Path(sys.argv[1])
 cleanup_path = Path(sys.argv[2])
 mode = sys.argv[3]
 cleanup_after_verify = sys.argv[4] == "1"
 retention_days = int(sys.argv[5])
 operation_status = sys.argv[6]
 gate_status = sys.argv[7]
 plan = json.loads(plan_path.read_text())
 now = dt.datetime.now(dt.UTC)
 cutoff = now - dt.timedelta(days=retention_days)
 retained = []
 deleted = []
 if mode == "execute" and cleanup_after_verify and operation_status == "UPLOAD_VERIFIED":
    for item in plan["selected_files"]:
        path = Path(item["local_path"])
        # A file that vanished since planning must be reported as missing.
        # Previously its mtime defaulted to "now", which mislabeled it as
        # "within_retention_window" and left "missing_before_cleanup" unreachable.
        try:
            mtime = dt.datetime.fromtimestamp(path.stat().st_mtime, dt.UTC)
        except FileNotFoundError:
            retained.append({**item, "reason": "missing_before_cleanup"})
            continue
        if mtime < cutoff:
            # missing_ok: tolerate the file disappearing between stat and unlink.
            path.unlink(missing_ok=True)
            deleted.append({**item, "deleted_at_utc": now.replace(microsecond=0).isoformat().replace("+00:00", "Z")})
        else:
            retained.append({**item, "reason": "within_retention_window"})
 else:
    # Nothing is deleted; record why every selected file was kept.
    reason = "cleanup_not_requested"
    if mode != "execute":
        reason = "dry_run"
    elif operation_status != "UPLOAD_VERIFIED":
        reason = "not_verified"
    for item in plan["selected_files"]:
        retained.append({**item, "reason": reason})
 cleanup_path.write_text(
    json.dumps({"retained_local_files": retained, "deleted_local_files": deleted}, indent=2, sort_keys=True) + "\n",
    encoding="utf-8",
 )
 PY
 # Stamp the end time, then publish everything the manifest writer reads from
 # its environment.
 ENDED_AT="$(date -u '+%Y-%m-%dT%H:%M:%SZ')"
 export \
   SCRIPT_NAME SCRIPT_VERSION STARTED_AT ENDED_AT \
   MODE OPERATION_STATUS GATE_STATUS \
   RCLONE_BIN RCLONE_AVAILABLE RCLONE_VERSION DEST \
   COPY_ATTEMPTED CHECK_ATTEMPTED COPY_EXIT_CODE CHECK_EXIT_CODE \
   DATA_DIR RAW_DIR SOURCE_MANIFEST_DIR MIN_AGE_SECONDS RETENTION_DAYS CLEANUP_AFTER_VERIFY
 # Assemble the upload manifest from the plan, the cleanup report and the
 # exported run state, write it to MANIFEST_PATH, and print a short JSON summary.
 python3 - "$PLAN_PATH" "$CLEANUP_PATH" "$MANIFEST_PATH" <<'PY'
 import json
 import os
 import sys
 from pathlib import Path
 plan = json.loads(Path(sys.argv[1]).read_text())
 cleanup = json.loads(Path(sys.argv[2]).read_text())
 manifest_path = Path(sys.argv[3])
 # Run state arrives through exported environment variables; a missing one raises KeyError.
 mode = os.environ["MODE"]
 operation_status = os.environ["OPERATION_STATUS"]
 gate_status = os.environ["GATE_STATUS"]
 copy_attempted = os.environ["COPY_ATTEMPTED"] == "1"
 check_attempted = os.environ["CHECK_ATTEMPTED"] == "1"
 # Exit codes stay strings here; an empty string is reported as null in the manifest.
 copy_exit_code = os.environ["COPY_EXIT_CODE"]
 check_exit_code = os.environ["CHECK_EXIT_CODE"]
 dest = os.environ["DEST"]
 def public_item(item):
    # Copy of a plan/cleanup entry without the "staged_path" key.
    public = dict(item)
    public.pop("staged_path", None)
    return public
 selected = [public_item(item) for item in plan["selected_files"]]
 skipped = [public_item(item) for item in plan["skipped_files"]]
 retained_local = [public_item(item) for item in cleanup["retained_local_files"]]
 deleted_local = [public_item(item) for item in cleanup["deleted_local_files"]]
 # Each list is either the full selection or empty, depending on how far the run got.
 attempted_files = selected if copy_attempted else []
 # VERIFY_FAILED still counts as uploaded: the copy succeeded, only the check failed.
 uploaded_files = selected if mode == "execute" and operation_status in {"UPLOAD_VERIFIED", "VERIFY_FAILED"} else []
 verified_files = selected if mode == "execute" and operation_status == "UPLOAD_VERIFIED" else []
 dry_run_files = selected if mode == "dry-run" and operation_status == "DRY_RUN_PASS" else []
 manifest = {
    "schema_name": "upload_archive_manifest",
    "schema_version": 1,
    "checkpoint_id": 7,
    "checkpoint_name": "Google Drive Offload",
    "uploader": {
        "name": os.environ["SCRIPT_NAME"],
        "version": os.environ["SCRIPT_VERSION"],
    },
    "started_at_utc": os.environ["STARTED_AT"],
    "ended_at_utc": os.environ["ENDED_AT"],
    "command_mode": mode,
    "operation_status": operation_status,
    "gate_status": gate_status,
    "rclone": {
        "binary": os.environ["RCLONE_BIN"],
        "available": os.environ["RCLONE_AVAILABLE"] == "1",
        "version": os.environ["RCLONE_VERSION"],
        "destination_configured": bool(dest),
        "destination": dest if dest else None,
        "copy_attempted": copy_attempted,
        "copy_exit_code": int(copy_exit_code) if copy_exit_code else None,
        "check_attempted": check_attempted,
        "check_exit_code": int(check_exit_code) if check_exit_code else None,
    },
    "config": {
        "data_dir": os.environ["DATA_DIR"],
        "raw_dir": os.environ["RAW_DIR"],
        "source_manifest_dir": os.environ["SOURCE_MANIFEST_DIR"],
        "manifest_path": str(manifest_path),
        "min_age_seconds": int(os.environ["MIN_AGE_SECONDS"]),
        "retention_days": int(os.environ["RETENTION_DAYS"]),
        "cleanup_after_verify": os.environ["CLEANUP_AFTER_VERIFY"] == "1",
    },
    "planned_files": selected,
    "attempted_files": attempted_files,
    "dry_run_files": dry_run_files,
    "uploaded_files": uploaded_files,
    "verified_files": verified_files,
    "skipped_open_or_recent_files": [
        item for item in skipped if item.get("reason") == "modified_within_min_age_seconds"
    ],
    "skipped_files": skipped,
    "retained_local_files": retained_local,
    "deleted_local_files": deleted_local,
    "counts": {
        "planned": len(selected),
        "attempted": len(attempted_files),
        "dry_run": len(dry_run_files),
        "uploaded": len(uploaded_files),
        "verified": len(verified_files),
        "skipped": len(skipped),
        "retained_local": len(retained_local),
        "deleted_local": len(deleted_local),
    },
    # Same list object as plan["warnings"]; the appends below extend it in place.
    "warnings": plan["warnings"],
    "known_gaps": [
        "A dry-run does not prove remote write access.",
        "Real upload requires a configured rclone remote outside the repository.",
        "Local files are retained unless --cleanup-after-verify is used after successful verification.",
    ],
 }
 # Status-specific warnings explaining why nothing was uploaded.
 if operation_status == "BLOCKED_RCLONE_UNAVAILABLE":
    manifest["warnings"].append("rclone binary was not available; copy and verification were not attempted.")
 if operation_status == "BLOCKED_DEST_MISSING":
    manifest["warnings"].append("No rclone destination was configured; set --dest or ORDERBOOKS_RCLONE_DEST.")
 if mode == "dry-run":
    manifest["warnings"].append("Dry-run mode does not perform a real upload; checkpoint real-upload gate remains blocked.")
 manifest_path.parent.mkdir(parents=True, exist_ok=True)
 manifest_path.write_text(json.dumps(manifest, indent=2, sort_keys=True) + "\n", encoding="utf-8")
 # Short machine-readable summary on stdout.
 print(
    json.dumps(
        {
            "gate_status": gate_status,
            "operation_status": operation_status,
            "manifest_path": str(manifest_path),
            "planned_files": len(selected),
            "attempted_files": len(attempted_files),
            "uploaded_files": len(uploaded_files),
            "verified_files": len(verified_files),
            "skipped_files": len(plan["skipped_files"]),
        },
        indent=2,
        sort_keys=True,
    )
 )
 PY
 # Translate the final operation status into the script's exit code:
 # 0 success, 2 destination missing, 3 rclone missing, 1 any other failure.
 if [[ "${OPERATION_STATUS}" == "UPLOAD_VERIFIED" || "${OPERATION_STATUS}" == "DRY_RUN_PASS" ]]; then
   exit 0
 elif [[ "${OPERATION_STATUS}" == "BLOCKED_DEST_MISSING" ]]; then
   echo "No rclone destination configured. Set --dest or ORDERBOOKS_RCLONE_DEST." >&2
   exit 2
 elif [[ "${OPERATION_STATUS}" == "BLOCKED_RCLONE_UNAVAILABLE" ]]; then
   echo "rclone is not available. Install rclone before running dry-run or execute mode." >&2
   exit 3
 else
   echo "Upload operation failed with status: ${OPERATION_STATUS}" >&2
   exit 1
 fi
--- a/scripts/vps_preflight_check.sh
+++ b/scripts/vps_preflight_check.sh
@ -0,0 +1,285 @@
 #!/usr/bin/env bash
 # -u: unset variables are errors; pipefail: a pipeline fails if any stage fails.
 # -e is not enabled, so a failing check is counted instead of aborting the run.
 set -uo pipefail
 # Repository checkout to inspect; overridable with --app-dir.
 APP_DIR="$(pwd)"
 # Tool locations and the optional rclone destination, taken from the environment.
 PYTHON_BIN="${ORDERBOOKS_PYTHON:-python3}"
 RCLONE_BIN="${ORDERBOOKS_RCLONE_BIN:-rclone}"
 RCLONE_REMOTE="${ORDERBOOKS_RCLONE_DEST:-}"
 # Optional target directories; empty means the corresponding check is skipped.
 DATA_DIR=""
 MANIFEST_DIR=""
 LOG_DIR=""
 # Thresholds for the disk-space and remote-read checks.
 MIN_FREE_GIB="${ORDERBOOKS_PREFLIGHT_MIN_FREE_GIB:-5}"
 REMOTE_TIMEOUT_SECONDS="${ORDERBOOKS_PREFLIGHT_REMOTE_TIMEOUT_SECONDS:-30}"
 # Running totals maintained by log_fail / log_warn and reported in the summary.
 FAILURES=0
 WARNINGS=0
 # Print the command-line help text to stdout (callers redirect it as needed).
 usage() {
  cat <<'EOF'
 Usage: scripts/vps_preflight_check.sh [options]
 Read-only VPS cutover preflight for the Polymarket order-book collector.
 Default behavior checks the repository, local tooling, unit syntax, disk space,
 and rclone availability. It does not print rclone config and does not require
 secrets.
 Options:
  --app-dir DIR              Repository checkout path. Default: current directory.
  --python-bin PATH          Python interpreter. Default: ORDERBOOKS_PYTHON or python3.
  --rclone-bin PATH          rclone binary. Default: ORDERBOOKS_RCLONE_BIN or rclone.
  --rclone-remote REMOTE     Optional remote/path to check read-only, e.g. gdrive:orderbooks/polymarket.
  --data-dir DIR             Optional target data directory to create/check writable.
  --manifest-dir DIR         Optional target manifest directory to create/check writable.
  --log-dir DIR              Optional target log directory to create/check writable.
  --min-free-gib N           Minimum free GiB for checked filesystems. Default: 5.
  --remote-timeout-seconds N Timeout for rclone remote read check. Default: 30.
  --help                     Show this help.
 Directory options intentionally create missing directories before checking
 writability. Omit them for a repo-only read-only check.
 EOF
 }
 # Result reporters: each prints one tagged line; the WARN and FAIL variants
 # also bump the counter reported in the final summary.
 log_pass() {
   printf 'PASS %s\n' "$*"
 }
 log_info() {
   printf 'INFO %s\n' "$*"
 }
 log_warn() {
   WARNINGS=$((WARNINGS + 1))
   printf 'WARN %s\n' "$*"
 }
 log_fail() {
   FAILURES=$((FAILURES + 1))
   printf 'FAIL %s\n' "$*"
 }
 # Run a command with all output discarded; the exit status is passed through.
 run_quiet() {
   "$@" >/dev/null 2>&1
 }
 # Abort with a usage error when an option that takes a value is the last
 # argument. Without this guard `set -u` stops the script with an opaque
 # "$2: unbound variable" message.
 #   $1 - option name, $2 - number of arguments still to be parsed
 require_option_value() {
   if [[ "$2" -lt 2 ]]; then
     log_fail "option requires a value: $1"
     usage >&2
     exit 2
   fi
 }
 # Parse command-line options; unknown options are a usage error (exit 2).
 while [[ $# -gt 0 ]]; do
   case "$1" in
     --app-dir) require_option_value "$1" "$#"; APP_DIR="$2"; shift 2 ;;
     --python-bin) require_option_value "$1" "$#"; PYTHON_BIN="$2"; shift 2 ;;
     --rclone-bin) require_option_value "$1" "$#"; RCLONE_BIN="$2"; shift 2 ;;
     --rclone-remote) require_option_value "$1" "$#"; RCLONE_REMOTE="$2"; shift 2 ;;
     --data-dir) require_option_value "$1" "$#"; DATA_DIR="$2"; shift 2 ;;
     --manifest-dir) require_option_value "$1" "$#"; MANIFEST_DIR="$2"; shift 2 ;;
     --log-dir) require_option_value "$1" "$#"; LOG_DIR="$2"; shift 2 ;;
     --min-free-gib) require_option_value "$1" "$#"; MIN_FREE_GIB="$2"; shift 2 ;;
     --remote-timeout-seconds) require_option_value "$1" "$#"; REMOTE_TIMEOUT_SECONDS="$2"; shift 2 ;;
     --help) usage; exit 0 ;;
     *) log_fail "unknown argument: $1"; usage >&2; exit 2 ;;
   esac
 done
 # MIN_FREE_GIB is later used inside $(( ... )); reject anything that is not a
 # plain non-negative integer (whether it came from the flag or the environment).
 if [[ ! "${MIN_FREE_GIB}" =~ ^[0-9]+$ ]]; then
   log_fail "--min-free-gib must be a non-negative integer: ${MIN_FREE_GIB}"
   exit 2
 fi
 # Force base 10 so a value such as "08" is not rejected as invalid octal.
 MIN_FREE_GIB="$((10#${MIN_FREE_GIB}))"
 # Drop one trailing slash from the checkout path, but keep a bare "/" intact:
 # stripping it would leave an empty path that always fails the directory test.
 if [[ "${APP_DIR}" != "/" ]]; then
   APP_DIR="${APP_DIR%/}"
 fi
 # Every later check uses paths relative to the checkout, so failing to enter
 # it is fatal: report, print the summary line, and stop.
 if [[ ! -d "${APP_DIR}" ]]; then
   log_fail "app directory does not exist: ${APP_DIR}"
   printf 'SUMMARY failures=%s warnings=%s\n' "${FAILURES}" "${WARNINGS}"
   exit 1
 fi
 cd "${APP_DIR}" || {
   log_fail "could not cd to app directory: ${APP_DIR}"
   printf 'SUMMARY failures=%s warnings=%s\n' "${FAILURES}" "${WARNINGS}"
   exit 1
 }
 # Report whether the configured Python interpreter can be found, including
 # its version string when it can.
 check_python() {
   # Keep the version string function-local, like the other checks' variables.
   local version
   if command -v "${PYTHON_BIN}" >/dev/null 2>&1; then
     # "|| true": a failing --version call must not affect the result.
     version="$("${PYTHON_BIN}" --version 2>&1 || true)"
     log_pass "python available: ${PYTHON_BIN} (${version})"
   else
     log_fail "python not found: ${PYTHON_BIN}"
   fi
 }
 # Verify that every file the deployment depends on exists in the checkout.
 # Logs one PASS/FAIL line per file; returns 1 if any file is missing, else 0.
 check_required_files() {
   local status=0 candidate
   local -a expected=(
     "scripts/discover_polymarket_btc_markets.py"
     "scripts/collect_polymarket_orderbooks.py"
     "scripts/normalize_polymarket_orderbooks.py"
     "scripts/run_polymarket_collector_cycle.sh"
     "scripts/upload_archive_rclone.sh"
     "scripts/vps_runtime_smoke_check.sh"
     "config/polymarket_collector.vps.example.yaml"
     "docs/VPS_DEPLOYMENT.md"
     "docs/GOOGLE_DRIVE_OFFLOAD.md"
     "systemd/polymarket-orderbook-collector.service"
     "systemd/polymarket-orderbook-uploader.service"
     "systemd/polymarket-orderbook-uploader.timer"
   )
   for candidate in "${expected[@]}"; do
     if [[ ! -f "${candidate}" ]]; then
       status=1
       log_fail "required file missing: ${candidate}"
       continue
     fi
     log_pass "required file exists: ${candidate}"
   done
   return "${status}"
 }
 # Syntax-check the three collector-side Python scripts by compiling their
 # source in memory, so no bytecode files are written into the checkout.
 check_python_compile() {
   command -v "${PYTHON_BIN}" >/dev/null 2>&1 || {
     log_fail "cannot compile Python scripts because Python is missing"
     return
   }
   # The inline program is fed on stdin; its output is discarded and only the
   # exit status decides the result.
   if ! run_quiet "${PYTHON_BIN}" - <<'PY'
 from pathlib import Path
 for name in (
    "scripts/discover_polymarket_btc_markets.py",
    "scripts/collect_polymarket_orderbooks.py",
    "scripts/normalize_polymarket_orderbooks.py",
 ):
    compile(Path(name).read_text(encoding="utf-8"), name, "exec")
 PY
   then
     log_fail "Python no-bytecode compile check failed"
     return
   fi
   log_pass "collector/discovery/normalization Python scripts compile without bytecode writes"
 }
 # Parse every scripts/*.sh file with `bash -n` (syntax check, no execution).
 # Logs one line per script; returns 1 if any script fails to parse, else 0.
 check_shell_syntax() {
   local status=0 candidate
   for candidate in scripts/*.sh; do
     # An unmatched glob stays literal; skip anything that is not a regular file.
     if [[ ! -f "${candidate}" ]]; then
       continue
     fi
     if ! bash -n "${candidate}" >/dev/null 2>&1; then
       status=1
       log_fail "bash syntax failed: ${candidate}"
       continue
     fi
     log_pass "bash syntax ok: ${candidate}"
   done
   return "${status}"
 }
 # Validate the shipped unit files with `systemd-analyze verify` when that tool
 # exists; on hosts without it the check is skipped with a warning.
 check_systemd_units() {
   if ! command -v systemd-analyze >/dev/null 2>&1; then
     log_warn "systemd-analyze unavailable; skipped unit parse check"
     return
   fi
   local -a unit_files=(
     "systemd/polymarket-orderbook-collector.service"
     "systemd/polymarket-orderbook-uploader.service"
     "systemd/polymarket-orderbook-uploader.timer"
   )
   if systemd-analyze verify "${unit_files[@]}" >/dev/null 2>&1; then
     log_pass "systemd units parse with systemd-analyze"
   else
     log_fail "systemd-analyze verify failed for one or more units"
   fi
 }
 # Print the remote-name prefix of an rclone destination, including the
 # trailing colon ("gdrive:path" -> "gdrive:"). Prints an empty line when the
 # destination contains no colon.
 remote_name_from_dest() {
   local target="$1"
   if [[ "${target}" == *:* ]]; then
     printf '%s:\n' "${target%%:*}"
   else
     printf '\n'
   fi
 }
 # Run a command under `timeout` (limit: REMOTE_TIMEOUT_SECONDS) when that
 # utility exists; otherwise run it directly. The exit status is passed through.
 run_with_timeout() {
   if ! command -v timeout >/dev/null 2>&1; then
     "$@"
     return
   fi
   timeout "${REMOTE_TIMEOUT_SECONDS}" "$@"
 }
 # Check that rclone is installed and, when a destination was given, that its
 # remote is configured and can be listed. Nothing is written to the remote and
 # rclone's configuration is never printed.
 check_rclone() {
   local version remote_name configured_remotes
   if [[ -x "${RCLONE_BIN}" ]] || command -v "${RCLONE_BIN}" >/dev/null 2>&1; then
     # "|| true": the version banner is informational only.
     version="$("${RCLONE_BIN}" version 2>/dev/null | head -n 1 || true)"
     log_pass "rclone available: ${RCLONE_BIN} (${version})"
   else
     log_fail "rclone not found: ${RCLONE_BIN}"
     return
   fi
   if [[ -z "${RCLONE_REMOTE}" ]]; then
     log_info "no rclone remote provided; skipped remote access check"
     return
   fi
   remote_name="$(remote_name_from_dest "${RCLONE_REMOTE}")"
   if [[ -z "${remote_name}" ]]; then
     log_fail "rclone remote must include a remote name ending in ':': ${RCLONE_REMOTE}"
     return
   fi
   # Capture the remote list before matching. Piping straight into `grep -q`
   # under pipefail can fail spuriously: grep exits on the first match, rclone
   # may then die from SIGPIPE, and the pipeline reports that failure.
   if configured_remotes="$("${RCLONE_BIN}" listremotes 2>/dev/null)" \
     && grep -Fxq -- "${remote_name}" <<<"${configured_remotes}"; then
     log_pass "rclone remote is configured: ${remote_name}"
   else
     log_fail "rclone remote is not configured or not visible to this user: ${remote_name}"
     return
   fi
   # Read-only probe of the destination, bounded by run_with_timeout.
   if run_with_timeout "${RCLONE_BIN}" lsf --max-depth 1 "${RCLONE_REMOTE}" >/dev/null 2>&1; then
     log_pass "rclone remote read check succeeded without printing config: ${RCLONE_REMOTE}"
   else
     log_fail "rclone remote read check failed or timed out: ${RCLONE_REMOTE}"
   fi
 }
 # Ensure an optional target directory exists (creating it if needed) and is
 # writable by the current user.
 #   $1 - human-readable label used in log lines
 #   $2 - directory path; empty means "not requested" and the check is skipped
 check_target_dir() {
   local kind="$1" dir="$2"
   if [[ -z "${dir}" ]]; then
     log_info "no ${kind} directory provided; skipped create/write check"
     return
   fi
   if mkdir -p "${dir}" >/dev/null 2>&1 && [[ -d "${dir}" && -w "${dir}" ]]; then
     log_pass "${kind} directory exists and is writable: ${dir}"
     return
   fi
   log_fail "${kind} directory cannot be created or is not writable: ${dir}"
 }
 # Compare the free space of the filesystem holding a path with MIN_FREE_GIB.
 #   $1 - path to inspect
 #   $2 - label used in log lines
 check_disk_free() {
   local target="$1" label="$2"
   local free_kib required_kib
   if [[ ! -e "${target}" ]]; then
     log_warn "disk target does not exist, skipping ${label}: ${target}"
     return
   fi
   # POSIX-format df in KiB blocks: the second output line, fourth column is
   # the available space.
   free_kib="$(df -Pk "${target}" | awk 'NR==2 {print $4}')"
   required_kib=$((MIN_FREE_GIB * 1024 * 1024))
   if [[ -n "${free_kib}" && "${free_kib}" -ge "${required_kib}" ]]; then
     log_pass "disk free ok for ${label}: available_kib=${free_kib} min_gib=${MIN_FREE_GIB}"
     return
   fi
   log_fail "disk free below threshold for ${label}: available_kib=${free_kib:-unknown} min_gib=${MIN_FREE_GIB}"
 }
 # Scan the runtime config, units and scripts for words that suggest a secret
 # is required (API keys, private keys, wallets, passwords, tokens).
 check_secret_requirements() {
   local scan_status
   local files=(
     "config/polymarket_collector.vps.example.yaml"
     "systemd/polymarket-orderbook-collector.service"
     "systemd/polymarket-orderbook-uploader.service"
     "systemd/polymarket-orderbook-uploader.timer"
     "scripts/run_polymarket_collector_cycle.sh"
     "scripts/upload_archive_rclone.sh"
   )
   # -q makes a match win (exit 0) even when another file could not be read.
   # Without it grep exits 2 in that case and a real match would be reported
   # as a pass.
   grep -E -i -q -- '(api[_-]?key|private[_-]?key|mnemonic|wallet|password|client[_-]?secret|access[_-]?token|refresh[_-]?token)' "${files[@]}" 2>/dev/null
   scan_status=$?
   case "${scan_status}" in
     0)
       log_fail "secret-like credential requirement found in runtime config, units, or scripts"
       ;;
     1)
       log_pass "no API keys, private keys, mnemonics, wallets, or passwords are required by runtime files"
       ;;
     *)
       # No match, but at least one file was missing or unreadable: the scan
       # is incomplete and must not count as a pass.
       log_fail "secret requirement scan could not read one or more runtime files (grep exit ${scan_status})"
       ;;
   esac
   log_info "rclone credentials, if used, must remain machine-local outside the repository"
 }
 # Run every check in a fixed order. Individual return codes are ignored; the
 # FAILURES counter decides the exit status.
 check_python
 check_required_files
 check_python_compile
 check_shell_syntax
 check_systemd_units
 check_rclone
 check_target_dir "data" "${DATA_DIR}"
 check_target_dir "manifest" "${MANIFEST_DIR}"
 check_target_dir "log" "${LOG_DIR}"
 check_disk_free "." "repository"
 # The data directory may sit on another filesystem; check it only when it was
 # requested and exists.
 [[ -z "${DATA_DIR}" || ! -d "${DATA_DIR}" ]] || check_disk_free "${DATA_DIR}" "data directory"
 check_secret_requirements
 printf 'SUMMARY failures=%s warnings=%s\n' "${FAILURES}" "${WARNINGS}"
 if [[ "${FAILURES}" -ne 0 ]]; then
   exit 1
 fi
 exit 0
--- a/scripts/vps_runtime_smoke_check.sh
+++ b/scripts/vps_runtime_smoke_check.sh
@ -0,0 +1,279 @@
 #!/usr/bin/env bash
 # -u: unset variables are errors; pipefail: a pipeline fails if any stage fails.
 set -uo pipefail
 # Install and data locations; each can be overridden through its ORDERBOOKS_* variable.
 APP_DIR="${ORDERBOOKS_APP_DIR:-/opt/orderbooks}"
 DATA_DIR="${ORDERBOOKS_DATA_DIR:-/var/lib/orderbooks}"
 # Raw output and manifest directories default to sub-directories of DATA_DIR.
 RAW_DIR="${ORDERBOOKS_OUTPUT_DIR:-${DATA_DIR}/raw_orderbooks}"
 MANIFEST_DIR="${ORDERBOOKS_MANIFEST_DIR:-${DATA_DIR}/manifests}"
 # systemd units exercised by the smoke test.
 COLLECTOR_SERVICE="${ORDERBOOKS_COLLECTOR_SERVICE:-polymarket-orderbook-collector.service}"
 UPLOADER_SERVICE="${ORDERBOOKS_UPLOADER_SERVICE:-polymarket-orderbook-uploader.service}"
 # Maximum time to wait for a valid collector manifest, in seconds.
 WAIT_SECONDS="${ORDERBOOKS_SMOKE_WAIT_SECONDS:-900}"
 # UTC timestamp that makes the default evidence file name unique per run.
 RUN_ID="$(date -u +%Y%m%dT%H%M%SZ)"
 # NOTE: this default is derived from MANIFEST_DIR as it is at this point,
 # i.e. before any command-line option has been parsed.
 EVIDENCE_PATH="${ORDERBOOKS_SMOKE_EVIDENCE_PATH:-${MANIFEST_DIR}/vps_runtime_smoke_${RUN_ID}.json}"
 PYTHON_BIN="${ORDERBOOKS_PYTHON:-python3}"
 # Print the command-line help text to stdout (callers redirect it as needed).
 usage() {
  cat <<'EOF'
 Usage: scripts/vps_runtime_smoke_check.sh [options]
 Run on the VPS after installing collector/uploader systemd units. The check
 records durable JSON evidence, forces one collector service restart, verifies
 old raw gzip files still parse and keep their checksum, waits for a later valid
 collector cycle, then starts the uploader service and records upload evidence.
 Options:
  --app-dir DIR             App checkout. Default: /opt/orderbooks.
  --data-dir DIR            Data root. Default: /var/lib/orderbooks.
  --raw-dir DIR             Raw output dir. Default: DATA_DIR/raw_orderbooks.
  --manifest-dir DIR        Manifest dir. Default: DATA_DIR/manifests.
  --collector-service NAME  systemd collector service name.
  --uploader-service NAME   systemd uploader service name.
  --wait-seconds N          Max wait for valid cycles. Default: 900.
  --evidence-path PATH      JSON evidence output path.
  --help                    Show this help.
 This script does not delete raw files or manifests. Failures are written to the
 evidence JSON and should be preserved for review.
 EOF
 }
 # Remember which paths were given explicitly so that derived defaults can be
 # recomputed after parsing without depending on option order.
 DATA_DIR_OVERRIDDEN=0
 RAW_DIR_EXPLICIT=0
 MANIFEST_DIR_EXPLICIT=0
 EVIDENCE_PATH_EXPLICIT=0
 # Abort with a usage error when an option that takes a value is the last
 # argument. Without this guard `set -u` stops the script with an opaque
 # "$2: unbound variable" message.
 #   $1 - option name, $2 - number of arguments still to be parsed
 require_option_value() {
   if [[ "$2" -lt 2 ]]; then
     echo "Option requires a value: $1" >&2
     usage >&2
     exit 2
   fi
 }
 while [[ $# -gt 0 ]]; do
   case "$1" in
     --app-dir) require_option_value "$1" "$#"; APP_DIR="$2"; shift 2 ;;
     --data-dir) require_option_value "$1" "$#"; DATA_DIR="$2"; DATA_DIR_OVERRIDDEN=1; shift 2 ;;
     --raw-dir) require_option_value "$1" "$#"; RAW_DIR="$2"; RAW_DIR_EXPLICIT=1; shift 2 ;;
     --manifest-dir) require_option_value "$1" "$#"; MANIFEST_DIR="$2"; MANIFEST_DIR_EXPLICIT=1; shift 2 ;;
     --collector-service) require_option_value "$1" "$#"; COLLECTOR_SERVICE="$2"; shift 2 ;;
     --uploader-service) require_option_value "$1" "$#"; UPLOADER_SERVICE="$2"; shift 2 ;;
     --wait-seconds) require_option_value "$1" "$#"; WAIT_SECONDS="$2"; shift 2 ;;
     --evidence-path) require_option_value "$1" "$#"; EVIDENCE_PATH="$2"; EVIDENCE_PATH_EXPLICIT=1; shift 2 ;;
     --help) usage; exit 0 ;;
     *) echo "Unknown argument: $1" >&2; usage >&2; exit 2 ;;
   esac
 done
 # A new data root moves the derived directories, but an explicit --raw-dir or
 # --manifest-dir wins regardless of where it appeared on the command line.
 if [[ "${DATA_DIR_OVERRIDDEN}" -eq 1 ]]; then
   if [[ "${RAW_DIR_EXPLICIT}" -eq 0 ]]; then
     RAW_DIR="${ORDERBOOKS_OUTPUT_DIR:-${DATA_DIR}/raw_orderbooks}"
   fi
   if [[ "${MANIFEST_DIR_EXPLICIT}" -eq 0 ]]; then
     MANIFEST_DIR="${ORDERBOOKS_MANIFEST_DIR:-${DATA_DIR}/manifests}"
   fi
 fi
 # The default evidence path was computed before parsing; rebuild it so it
 # follows the final manifest directory unless the caller chose a path.
 if [[ "${EVIDENCE_PATH_EXPLICIT}" -eq 0 && -z "${ORDERBOOKS_SMOKE_EVIDENCE_PATH:-}" ]]; then
   EVIDENCE_PATH="${MANIFEST_DIR}/vps_runtime_smoke_${RUN_ID}.json"
 fi
 # Reject a malformed wait time here; otherwise the embedded Python fails on
 # int() before any evidence file is written.
 if [[ ! "${WAIT_SECONDS}" =~ ^[0-9]+$ ]]; then
   echo "--wait-seconds must be a non-negative integer: ${WAIT_SECONDS}" >&2
   exit 2
 fi
 # Make sure the evidence file's directory exists before the check starts.
 mkdir -p "$(dirname "${EVIDENCE_PATH}")"
 # The smoke test itself is the inline Python program below, fed on stdin with
 # all settings passed as positional arguments; bytecode writing is disabled.
 PYTHONDONTWRITEBYTECODE=1 "${PYTHON_BIN}" - "$APP_DIR" "$DATA_DIR" "$RAW_DIR" "$MANIFEST_DIR" "$COLLECTOR_SERVICE" "$UPLOADER_SERVICE" "$WAIT_SECONDS" "$EVIDENCE_PATH" <<'PY_SMOKE'
 import datetime as dt
 import gzip
 import hashlib
 import json
 import subprocess
 import sys
 import time
 from pathlib import Path
 # Positional arguments, in the order given on the command line above.
 app_dir = Path(sys.argv[1])
 data_dir = Path(sys.argv[2])
 raw_dir = Path(sys.argv[3])
 manifest_dir = Path(sys.argv[4])
 collector_service = sys.argv[5]
 uploader_service = sys.argv[6]
 wait_seconds = int(sys.argv[7])
 evidence_path = Path(sys.argv[8])
 started = dt.datetime.now(dt.UTC).replace(microsecond=0)
 # Shared accumulators: every command run() executes, and every failure message.
 checks = []
 failures = []
 def iso_now():
    return dt.datetime.now(dt.UTC).replace(microsecond=0).isoformat().replace('+00:00', 'Z')
 def run(command):
    """Run a command, record its outcome in ``checks`` and return that record.

    The record holds the command, its exit code, the last 4000 characters of
    stdout and stderr, and the UTC time the record was created.
    """
    completed = subprocess.run(command, text=True, capture_output=True)
    record = {}
    record['command'] = command
    record['exit_code'] = completed.returncode
    record['stdout_tail'] = completed.stdout[-4000:]
    record['stderr_tail'] = completed.stderr[-4000:]
    record['ran_at_utc'] = iso_now()
    checks.append(record)
    return record
 def sha256(path):
    """Return the hex SHA-256 digest of a file, read in 1 MiB chunks."""
    hasher = hashlib.sha256()
    with path.open('rb') as stream:
        while True:
            block = stream.read(1024 * 1024)
            if not block:
                break
            hasher.update(block)
    return hasher.hexdigest()
 def parse_raw(path):
    """Parse a gzip-compressed JSON-lines file.

    Returns ``(row_count, first_row_keys)``: the number of non-blank lines and
    the sorted keys of the first row (an empty list when there are no rows).
    A line that is not valid JSON raises.
    """
    count = 0
    keys_of_first = []
    with gzip.open(path, 'rt', encoding='utf-8') as stream:
        payload_lines = (text for text in stream if text.strip())
        for index, text in enumerate(payload_lines):
            record = json.loads(text)
            if index == 0:
                keys_of_first = sorted(record.keys())
            count = index + 1
    return count, keys_of_first
 def collector_manifests():
    """Return the collector manifest paths in ``manifest_dir``, oldest first by mtime.

    Returns an empty list when the directory does not exist.
    """
    if manifest_dir.exists():
        found = list(manifest_dir.glob('polymarket_orderbook_collector_*.json'))
        found.sort(key=lambda candidate: candidate.stat().st_mtime)
        return found
    return []
 def validate_collector(path):
    """Re-check a collector manifest against the raw files it lists.

    Every listed output file is re-parsed and re-hashed. The result holds the
    manifest path, the parsed manifest, the per-file findings and ``valid``,
    which is True only when the manifest reports a clean PASS with rows
    written and every file matches its recorded row count and checksum, lives
    under ``raw_dir`` and has no ``live_sample`` path component.
    """
    manifest = json.loads(path.read_text(encoding='utf-8'))
    inspected = []
    for entry in manifest.get('output_files', []):
        raw_path = Path(entry['path'])
        parsed_rows, first_keys = parse_raw(raw_path)
        digest = sha256(raw_path)
        expected_rows = entry.get('rows')
        expected_digest = entry.get('sha256')
        inspected.append({
            'path': str(raw_path),
            'bytes': raw_path.stat().st_size,
            'manifest_rows': expected_rows,
            'rows_parsed': parsed_rows,
            'row_count_matches_manifest': parsed_rows == expected_rows,
            'manifest_sha256': expected_digest,
            'actual_sha256': digest,
            'sha256_matches_manifest': digest == expected_digest,
            'first_row_keys': first_keys,
            'under_raw_dir': raw_path.resolve().is_relative_to(raw_dir.resolve()),
            'uses_live_sample_path': 'live_sample' in raw_path.parts,
        })

    def file_is_clean(record):
        # One file passes when it has rows, matches the manifest and sits in
        # the production raw directory.
        return (
            record['rows_parsed'] > 0
            and record['row_count_matches_manifest']
            and record['sha256_matches_manifest']
            and record['under_raw_dir']
            and not record['uses_live_sample_path']
        )

    manifest_is_clean = (
        manifest.get('gate_status') == 'PASS'
        and manifest.get('rows_written', 0) > 0
        and manifest.get('failure_count') == 0
        and not manifest.get('failures')
    )
    valid = manifest_is_clean and bool(inspected) and all(file_is_clean(record) for record in inspected)
    return {
        'path': str(path),
        'manifest': manifest,
        'output_files': inspected,
        'valid': valid,
    }
 def latest_valid_after(after_mtime=0):
    """Wait for a valid collector manifest newer than ``after_mtime`` and return it.

    Manifests are scanned newest first and the first one that validates is
    returned (the dict produced by ``validate_collector``). The directory is
    polled until ``wait_seconds`` have elapsed.

    The scan always runs at least once, so ``wait_seconds == 0`` means "check
    now, do not wait", and the final sleep is shortened so one last scan
    happens at the deadline rather than overshooting it.

    Raises:
        TimeoutError: no valid manifest appeared in time; the message carries
            the most recent validation problem when there was one.
    """
    deadline = time.time() + wait_seconds
    last_error = None
    while True:
        for path in reversed(collector_manifests()):
            try:
                # stat() is inside the try so a manifest removed between
                # listing and inspection is skipped instead of aborting.
                if path.stat().st_mtime <= after_mtime:
                    continue
                result = validate_collector(path)
            except Exception as exc:
                last_error = str(exc)
                continue
            if result['valid']:
                return result
            last_error = f"latest candidate invalid: {path}"
        remaining = deadline - time.time()
        if remaining <= 0:
            break
        time.sleep(min(10, remaining))
    raise TimeoutError(last_error or f'no valid collector manifest after mtime {after_mtime}')
 def latest_upload_after(after_mtime=0):
    """Load the newest upload manifest modified at or after ``after_mtime``.

    Returns the manifest path, its parsed content, the verified-file count and
    ``valid``, which is True only for a verified upload that passed its gate,
    has zero rclone copy and check exit codes, and verified at least one file.

    Raises:
        FileNotFoundError: no upload manifest is recent enough.
    """
    recent = [
        candidate
        for candidate in sorted(
            manifest_dir.glob('upload_archive_*.json'),
            key=lambda candidate: candidate.stat().st_mtime,
        )
        if candidate.stat().st_mtime >= after_mtime
    ]
    if not recent:
        raise FileNotFoundError('no upload_archive_*.json manifest found after uploader run')
    newest = recent[-1]
    manifest = json.loads(newest.read_text(encoding='utf-8'))
    # Prefer the recorded count; fall back to the length of the verified list.
    listed_verified = len(manifest.get('verified_files', []))
    verified_count = manifest.get('counts', {}).get('verified', listed_verified)
    rclone_info = manifest.get('rclone', {})
    is_valid = (
        manifest.get('operation_status') == 'UPLOAD_VERIFIED'
        and manifest.get('gate_status') == 'PASS'
        and rclone_info.get('copy_exit_code') == 0
        and rclone_info.get('check_exit_code') == 0
        and verified_count > 0
    )
    return {
        'path': str(newest),
        'manifest': manifest,
        'verified_count': verified_count,
        'valid': is_valid,
    }
 # Evidence document written at the end of the run. It starts out pessimistic
 # (gate 'ERROR', not production ready) and is updated as the run progresses.
 summary = {
    'schema_name': 'vps_runtime_smoke_result',
    'schema_version': 1,
    'started_at_utc': started.isoformat().replace('+00:00', 'Z'),
    'ended_at_utc': None,
    'gate_status': 'ERROR',
    'production_ready': False,
    'app_dir': str(app_dir),
    'data_dir': str(data_dir),
    'raw_dir': str(raw_dir),
    'manifest_dir': str(manifest_dir),
    'collector_service': collector_service,
    'uploader_service': uploader_service,
    'wait_seconds': wait_seconds,
    # Same list objects as the module-level accumulators, so later appends
    # show up in the written evidence.
    'checks': checks,
    'failures': failures,
 }
 # Main sequence: collector must be active -> find a valid baseline cycle ->
 # restart the collector -> wait for a newer valid cycle -> confirm the old raw
 # file is untouched -> run the uploader -> collect journal tails.
 try:
    active = run(['systemctl', 'is-active', collector_service])
    if active['exit_code'] != 0:
        failures.append('collector service is not active under systemd')
        raise RuntimeError('collector service not active')
    # Baseline: a valid manifest from before the restart (waits up to wait_seconds).
    before = latest_valid_after(0)
    before_mtime = Path(before['path']).stat().st_mtime
    # A valid result always has at least one output file, so [0] is safe.
    old_raw = before['output_files'][0]
    old_raw_sha = old_raw['actual_sha256']
    old_raw_path = Path(old_raw['path'])
    restart = run(['systemctl', 'restart', collector_service])
    if restart['exit_code'] != 0:
        failures.append('collector service restart command failed')
        raise RuntimeError('restart failed')
    active_after = run(['systemctl', 'is-active', collector_service])
    if active_after['exit_code'] != 0:
        failures.append('collector service is not active after restart')
        raise RuntimeError('collector inactive after restart')
    # Only a manifest newer than the baseline shows the restarted service works.
    after = latest_valid_after(before_mtime)
    # The pre-restart raw file must still parse and hash exactly as before.
    old_rows_after, _ = parse_raw(old_raw_path)
    old_file_unchanged = sha256(old_raw_path) == old_raw_sha and old_rows_after == old_raw['rows_parsed']
    if not old_file_unchanged:
        failures.append('raw file from before restart changed or stopped parsing')
    upload_start_mtime = time.time()
    upload_run = run(['systemctl', 'start', uploader_service])
    if upload_run['exit_code'] != 0:
        failures.append('uploader service start failed')
    try:
        # Two seconds of slack when matching manifest mtimes against the start time.
        upload = latest_upload_after(upload_start_mtime - 2)
        if not upload.get('valid'):
            failures.append('uploader did not produce a verified upload manifest with at least one verified file')
    except Exception as exc:
        # A missing or unreadable upload manifest is recorded as a failure, not raised.
        upload = {'path': None, 'valid': False, 'error': str(exc)}
        failures.append(str(exc))
    # Journal tails are captured into ``checks`` as supporting evidence.
    collector_logs = run(['journalctl', '-u', collector_service, '-n', '80', '--no-pager'])
    uploader_logs = run(['journalctl', '-u', uploader_service, '-n', '80', '--no-pager'])
    summary.update({
        'before_restart_collector': before,
        'after_restart_collector': after,
        'old_raw_file_unchanged_after_restart': old_file_unchanged,
        'upload_result': upload,
        'collector_log_check_exit_code': collector_logs['exit_code'],
        'uploader_log_check_exit_code': uploader_logs['exit_code'],
    })
    if after['valid'] and old_file_unchanged and upload.get('valid') and not failures:
        summary['gate_status'] = 'PASS'
    else:
        summary['gate_status'] = 'FAIL'
 except Exception as exc:
    # Any exception leaves gate_status at its initial 'ERROR' value.
    failures.append(str(exc))
    summary['exception'] = repr(exc)
 finally:
    # The evidence file is written on every path, including failures.
    summary['ended_at_utc'] = iso_now()
    evidence_path.parent.mkdir(parents=True, exist_ok=True)
    evidence_path.write_text(json.dumps(summary, indent=2, sort_keys=True) + '\n', encoding='utf-8')
 # Machine-readable result lines; exit non-zero unless the gate passed.
 print(f"SMOKE_EVIDENCE={evidence_path}")
 print(f"SMOKE_GATE={summary['gate_status']}")
 if summary['gate_status'] != 'PASS':
    sys.exit(1)
 PY_SMOKE
--- a/systemd/polymarket-orderbook-collector.service
+++ b/systemd/polymarket-orderbook-collector.service
@ -0,0 +1,38 @@
 # Collector service: runs the collector cycle script as the unprivileged
 # "orderbooks" user and restarts it whenever it exits.
 [Unit]
 Description=Polymarket raw order-book collector cycle
 Documentation=file:/opt/orderbooks/docs/VPS_DEPLOYMENT.md
 After=network-online.target
 Wants=network-online.target
 # Give up restarting after 20 starts within 10 minutes.
 StartLimitIntervalSec=10min
 StartLimitBurst=20
 [Service]
 Type=simple
 User=orderbooks
 Group=orderbooks
 WorkingDirectory=/opt/orderbooks
 Environment=PYTHONUNBUFFERED=1
 Environment=ORDERBOOKS_APP_DIR=/opt/orderbooks
 Environment=ORDERBOOKS_DATA_DIR=/var/lib/orderbooks
 Environment=ORDERBOOKS_OUTPUT_DIR=/var/lib/orderbooks/raw_orderbooks
 Environment=ORDERBOOKS_PYTHON=/opt/orderbooks/.venv/bin/python
 Environment=ORDERBOOKS_COLLECTOR_CONFIG=/etc/orderbooks/polymarket_collector.vps.yaml
 # Optional machine-local overrides; the leading "-" makes a missing file harmless.
 EnvironmentFile=-/etc/orderbooks/polymarket-orderbook-collector.env
 ExecStart=/bin/bash /opt/orderbooks/scripts/run_polymarket_collector_cycle.sh
 # Restart 30 seconds after every exit, whether it succeeded or failed.
 Restart=always
 RestartSec=30s
 # On stop, send SIGTERM to the whole control group and allow 90 seconds.
 TimeoutStopSec=90s
 KillSignal=SIGTERM
 KillMode=control-group
 StandardOutput=journal
 StandardError=journal
 SyslogIdentifier=polymarket-orderbook-collector
 # Sandboxing: no privilege escalation, private /tmp, read-only system
 # directories and no access to home directories. Only the data root is writable.
 NoNewPrivileges=true
 PrivateTmp=true
 ProtectSystem=strict
 ProtectHome=true
 ReadWritePaths=/var/lib/orderbooks
 # Creates /var/lib/orderbooks for the service user if it does not exist.
 StateDirectory=orderbooks
 [Install]
 WantedBy=multi-user.target
--- a/systemd/polymarket-orderbook-uploader.service
+++ b/systemd/polymarket-orderbook-uploader.service
@ -0,0 +1,29 @@
 # Uploader service: one-shot run of the rclone archive upload in execute mode.
 # It has no [Install] section; the timer unit's Unit= setting names this service.
 [Unit]
 Description=Orderbooks archive upload via rclone
 Documentation=file:/opt/orderbooks/docs/GOOGLE_DRIVE_OFFLOAD.md
 After=network-online.target
 Wants=network-online.target
 [Service]
 # oneshot: the unit counts as started only once the script has finished.
 Type=oneshot
 User=orderbooks
 Group=orderbooks
 WorkingDirectory=/opt/orderbooks
 Environment=ORDERBOOKS_UPLOAD_DATA_DIR=/var/lib/orderbooks
 Environment=ORDERBOOKS_UPLOAD_MANIFEST_DIR=/var/lib/orderbooks/manifests
 Environment=ORDERBOOKS_UPLOAD_RAW_DIR=/var/lib/orderbooks/raw_orderbooks
 Environment=ORDERBOOKS_UPLOAD_MIN_AGE_SECONDS=600
 Environment=ORDERBOOKS_UPLOAD_RETENTION_DAYS=7
 Environment=ORDERBOOKS_RCLONE_BIN=/usr/bin/rclone
 # Optional machine-local settings; the leading "-" makes a missing file harmless.
 # No rclone destination is set in this unit, so presumably it is supplied
 # here -- TODO confirm against the deployment docs.
 EnvironmentFile=-/etc/orderbooks/orderbook-uploader.env
 ExecStart=/bin/bash /opt/orderbooks/scripts/upload_archive_rclone.sh --execute
 StandardOutput=journal
 StandardError=journal
 SyslogIdentifier=polymarket-orderbook-uploader
 # Sandboxing: no privilege escalation, private /tmp, read-only system
 # directories and no access to home directories. Only the data root is writable.
 # NOTE(review): with ProtectHome=true an rclone config stored under /home,
 # /root or /run/user would not be readable -- confirm where it lives.
 NoNewPrivileges=true
 PrivateTmp=true
 ProtectSystem=strict
 ProtectHome=true
 ReadWritePaths=/var/lib/orderbooks
 StateDirectory=orderbooks
--- a/systemd/polymarket-orderbook-uploader.timer
+++ b/systemd/polymarket-orderbook-uploader.timer
@ -0,0 +1,12 @@
 # Timer that triggers the uploader service once per hour.
 [Unit]
 Description=Run orderbooks archive upload periodically
 Documentation=file:/opt/orderbooks/docs/GOOGLE_DRIVE_OFFLOAD.md
 [Timer]
 OnCalendar=hourly
 # Spread each run by a random delay of up to 10 minutes.
 RandomizedDelaySec=10min
 # Catch up on a run that was missed while the machine was off.
 Persistent=true
 Unit=polymarket-orderbook-uploader.service
 [Install]
 WantedBy=timers.target