Prepare Kubernetes orderbooks deployment
Some checks failed
deploy / deploy (push) Has been cancelled

This commit is contained in:
philipp 2026-04-18 11:23:28 +02:00
commit 284e465588
42 changed files with 8640 additions and 0 deletions

26
.dockerignore Normal file
View file

@ -0,0 +1,26 @@
# .dockerignore patterns are matched relative to the build-context root, so a
# bare `*.pem` or `__pycache__/` only excludes top-level entries. The Dockerfile
# copies config/, docs/ and scripts/, therefore cache and secret patterns carry
# a `**/` prefix (which also matches the root) to apply at any depth.

# Top-level directories that never belong in the image
.git/
.venv/
artifacts/
data/
reports/
orchestration/

# Python/cache noise at any depth
**/__pycache__/
**/*.pyc
**/*.pyo
**/.pytest_cache/
**/.mypy_cache/
**/.ruff_cache/

# Environment files and secret material at any depth
**/.env
**/*.env
rclone.conf
**/rclone.conf
**/*.pem
**/*.key
**/*.p12
**/*.pfx
**/id_rsa*
**/id_ed25519*
**/*mnemonic*
**/*wallet*
**/*credential*
**/*secret*

View file

@ -0,0 +1,162 @@
# Deploy pipeline: clones the pushed commit, builds and pushes the image inside
# the cluster with kaniko, applies the kustomize base with the freshly built
# image reference, and waits for the listed deployments to roll out.
name: deploy

on:
  push:
    branches:
      - main
  workflow_dispatch:

jobs:
  deploy:
    runs-on: linux-amd64
    env:
      IMAGE_TAG: ${{ github.sha }}
      REGISTRY_HOST: ${{ vars.REGISTRY_HOST }}
      PROJECT_NAME: ${{ vars.PROJECT_NAME || 'orderbooks' }}
      PROJECT_NAMESPACE: ${{ vars.PROJECT_NAMESPACE || 'orderbooks' }}
      # Comma-separated list of Deployment names to roll out.
      PROJECT_DEPLOYMENTS: ${{ vars.PROJECT_DEPLOYMENTS || 'orderbooks-collector' }}
      PROJECT_REGISTRY_SECRET_NAME: ${{ vars.PROJECT_REGISTRY_SECRET_NAME || 'orderbooks-registry-creds' }}
      REPO_CLONE_URL: ${{ github.server_url }}/${{ github.repository }}.git
    steps:
      # Installs git, kubectl and python3 only when one of them is missing.
      - name: Install tooling
        run: |
          if command -v git >/dev/null 2>&1 && command -v kubectl >/dev/null 2>&1 && command -v python3 >/dev/null 2>&1; then
            exit 0
          fi
          if command -v apk >/dev/null 2>&1; then
            apk add --no-cache git kubectl python3
            exit 0
          fi
          if command -v apt-get >/dev/null 2>&1; then
            apt-get update
            apt-get install -y git curl ca-certificates python3
            curl -fsSLo /usr/local/bin/kubectl "https://dl.k8s.io/release/$(curl -fsSL https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl"
            chmod +x /usr/local/bin/kubectl
            exit 0
          fi
          echo "missing git/kubectl/python3 and no supported package manager found" >&2
          exit 1

      # Creates a fresh temporary checkout directory and exports it to later steps.
      - name: Prepare workspace
        run: |
          workspace_root="${RUNNER_TEMP:-/tmp}"
          workspace_dir="$(mktemp -d "${workspace_root%/}/orderbooks-deploy-XXXXXX")"
          echo "WORKSPACE_DIR=$workspace_dir" >> "$GITHUB_ENV"
          echo "runner workspace: $workspace_dir"

      # The secret is passed through the environment instead of being
      # interpolated into the script text, and the decoded kubeconfig is
      # written with owner-only permissions.
      - name: Load kubeconfig
        env:
          KUBECONFIG_B64: ${{ secrets.KUBECONFIG_B64 }}
        run: |
          if [ -z "${KUBECONFIG_B64:-}" ]; then
            echo "KUBECONFIG_B64 secret is required" >&2
            exit 1
          fi
          mkdir -p "$HOME/.kube"
          umask 077
          printf '%s' "$KUBECONFIG_B64" | base64 -d > "$HOME/.kube/config"
          chmod 600 "$HOME/.kube/config"
          kubectl get ns

      # Shallow-clones the repository and pins the checkout to the triggering commit.
      - name: Checkout repo
        env:
          REPO_TOKEN: ${{ github.token }}
        run: |
          git -c credential.username=oauth2 -c http.extraHeader="Authorization: Bearer ${REPO_TOKEN}" clone --depth=1 "${REPO_CLONE_URL}" "$WORKSPACE_DIR"
          cd "$WORKSPACE_DIR"
          current_sha="$(git rev-parse HEAD)"
          if [ "$current_sha" != "$GITHUB_SHA" ]; then
            git -c credential.username=oauth2 -c http.extraHeader="Authorization: Bearer ${REPO_TOKEN}" fetch --depth=1 origin "${GITHUB_SHA}"
            git checkout --detach "${GITHUB_SHA}"
          else
            git checkout --detach "$current_sha"
          fi
          git rev-parse HEAD

      # Derives the image reference and the build Job name and exports both.
      - name: Resolve deployment settings
        run: |
          if [ -z "${REGISTRY_HOST:-}" ]; then
            echo "REGISTRY_HOST repo variable is required" >&2
            exit 1
          fi
          IMAGE="$REGISTRY_HOST/$PROJECT_NAME:$IMAGE_TAG"
          BUILD_JOB="image-build-$(printf '%s' "$GITHUB_SHA" | cut -c1-12)"
          {
            echo "IMAGE=$IMAGE"
            echo "BUILD_JOB=$BUILD_JOB"
          } >> "$GITHUB_ENV"

      - name: Ensure namespace exists
        run: |
          kubectl apply -f "$WORKSPACE_DIR/deploy/k8s/base/namespace.yaml"

      # Runs a one-shot Job: an init container clones the commit into an
      # emptyDir, then kaniko builds the Dockerfile and pushes ${IMAGE}.
      #
      # NOTE(review): the heredoc is expanded on the runner, so REPO_TOKEN is
      # rendered in plain text into the Job spec (env value and args) and stays
      # readable through the Kubernetes API until the Job is deleted. Consider
      # a short-lived Secret referenced via secretKeyRef.
      - name: Build and push image in-cluster
        env:
          REPO_TOKEN: ${{ github.token }}
        run: |
          kubectl -n "$PROJECT_NAMESPACE" delete job "$BUILD_JOB" --ignore-not-found=true
          cat <<EOF | kubectl apply -f -
          apiVersion: batch/v1
          kind: Job
          metadata:
            name: ${BUILD_JOB}
            namespace: ${PROJECT_NAMESPACE}
          spec:
            backoffLimit: 0
            ttlSecondsAfterFinished: 3600
            template:
              spec:
                restartPolicy: Never
                volumes:
                  - name: workspace
                    emptyDir: {}
                  - name: registry-creds
                    secret:
                      secretName: ${PROJECT_REGISTRY_SECRET_NAME}
                      items:
                        - key: .dockerconfigjson
                          path: config.json
                initContainers:
                  - name: checkout
                    image: alpine/git:2.47.2
                    env:
                      - name: REPO_TOKEN
                        value: "${REPO_TOKEN}"
                      - name: REPO_CLONE_URL
                        value: "${REPO_CLONE_URL}"
                      - name: GITHUB_SHA
                        value: "${GITHUB_SHA}"
                    command: ["/bin/sh", "-lc"]
                    args:
                      - >-
                        git -c credential.username=oauth2 -c http.extraHeader="Authorization: Bearer ${REPO_TOKEN}" clone --depth=1 "${REPO_CLONE_URL}" /workspace &&
                        cd /workspace &&
                        git -c credential.username=oauth2 -c http.extraHeader="Authorization: Bearer ${REPO_TOKEN}" fetch --depth=1 origin "${GITHUB_SHA}" &&
                        git checkout --detach "${GITHUB_SHA}"
                    volumeMounts:
                      - name: workspace
                        mountPath: /workspace
                containers:
                  - name: kaniko
                    image: gcr.io/kaniko-project/executor:v1.23.2-debug
                    args:
                      - --context=/workspace
                      - --dockerfile=/workspace/Dockerfile
                      - --destination=${IMAGE}
                      - --cache=false
                    volumeMounts:
                      - name: workspace
                        mountPath: /workspace
                      - name: registry-creds
                        mountPath: /kaniko/.docker
          EOF
          # Poll for either terminal condition. Waiting only for "Complete"
          # would block for the full timeout when the Job fails, and would
          # never print the build logs that explain the failure.
          deadline=$(( $(date +%s) + 1200 ))
          while :; do
            conditions="$(kubectl -n "$PROJECT_NAMESPACE" get job "$BUILD_JOB" -o jsonpath='{.status.conditions[?(@.status=="True")].type}' || true)"
            case " $conditions " in
              *" Complete "*)
                break
                ;;
              *" Failed "*)
                echo "build job $BUILD_JOB failed" >&2
                kubectl -n "$PROJECT_NAMESPACE" logs "job/$BUILD_JOB" --all-containers=true || true
                exit 1
                ;;
            esac
            if [ "$(date +%s)" -ge "$deadline" ]; then
              echo "timed out after 20m waiting for build job $BUILD_JOB" >&2
              kubectl -n "$PROJECT_NAMESPACE" logs "job/$BUILD_JOB" --all-containers=true || true
              exit 1
            fi
            sleep 5
          done
          kubectl -n "$PROJECT_NAMESPACE" logs "job/$BUILD_JOB"

      # Renders the kustomize base, swaps the bootstrap image placeholder for
      # the image built above, applies it, then waits for each deployment.
      - name: Apply release manifests and wait for rollout
        run: |
          kubectl kustomize "$WORKSPACE_DIR/deploy/k8s/base" \
            | IMAGE="$IMAGE" python3 -c 'import os, sys; sys.stdout.write(sys.stdin.read().replace("registry.doran.133011.xyz/orderbooks:bootstrap", os.environ["IMAGE"]))' \
            | kubectl apply -f -
          # The trailing newline from '%s\n' is required: without it `read`
          # returns non-zero on the last (or only) entry and the loop body is
          # skipped, so that deployment's rollout would never be awaited.
          printf '%s\n' "$PROJECT_DEPLOYMENTS" | tr ',' '\n' \
            | while IFS= read -r deployment; do
                # Tolerate "a, b" style lists; deployment names contain no whitespace.
                deployment="$(printf '%s' "$deployment" | tr -d '[:space:]')"
                [ -n "$deployment" ] || continue
                kubectl -n "$PROJECT_NAMESPACE" set image "deployment/$deployment" "*=$IMAGE" || exit 1
                kubectl -n "$PROJECT_NAMESPACE" rollout status "deployment/$deployment" --timeout=300s || exit 1
              done

43
.gitignore vendored Normal file
View file

@ -0,0 +1,43 @@
# Local runtime data and evidence stay local
data/
artifacts/
reports/
orchestration/
# Python/cache/build noise
__pycache__/
*.py[cod]
.pytest_cache/
.mypy_cache/
.ruff_cache/
*.egg-info/
build/
dist/
# Environments and local config
.venv/
.env
*.env
!.dockerignore
!.gitignore
# Kubernetes/rclone/secret material
kubeconfig*
*.kubeconfig
rclone.conf
**/rclone.conf
*.pem
*.key
*.p12
*.pfx
id_rsa*
id_ed25519*
*mnemonic*
*wallet*
*credential*
*secret*
# Editor/OS noise
.DS_Store
.idea/
.vscode/

91
AGENTS.md Normal file
View file

@ -0,0 +1,91 @@
# Agent Instructions
Project: Cross-Market Live Orderbook Archive
This repository exists to preserve live market microstructure data that is usually lost: order books, spreads, liquidity, depth, timestamps, request metadata, and enough raw context to later decide whether a trading idea was observable, fillable, and reproducible at the time.
The first market is Polymarket. Future markets may include NEAR-related venues and other prediction or crypto markets, but do not build generic multi-market infrastructure before the second market exists.
## Active Collaboration Model
This project uses a two-role workflow:
- `orchestrator`: coordinates checkpoints with the user, keeps scope narrow, records decisions, reviews evidence, states gates, and decides the next smallest step.
- `builder`: works in a separate session to implement the active checkpoint artifacts, run commands, collect evidence, and write manifests/reports.
The current primary chat session is the `orchestrator`. The orchestrator should not silently become the builder unless the user explicitly asks. The builder should treat `AGENTS.md`, `ROADMAP.md`, `docs/METHODOLOGY.md`, and the active checkpoint report as the durable source of instructions.
Hand-offs between orchestrator and builder must be written to disk under `orchestration/` or `reports/checkpoints/` when they contain decisions, scope changes, endpoint findings, or validation results. Chat-only instructions are not enough for project-critical state.
## Non-Negotiable Rules
1. Preserve raw data first. Raw API and websocket payloads are the source of truth. Derived datasets are secondary and must reference raw files.
2. No trading. Do not add order placement, signing, private-key handling, wallet logic, strategy execution, or bot behavior.
3. No secrets in the repo. Never commit API keys, rclone credentials, wallet material, cookies, or private endpoints.
4. Every checkpoint needs durable evidence on disk: code or docs, config or run instructions, manifest/report, and validation evidence.
5. Do not claim success without commands, outputs, files, checksums, or real collected data to support the claim.
6. Do not delete mistakes. If an artifact is wrong, misleading, partial, or deprecated, preserve it and label it with a reason and replacement.
7. Keep the scope narrow. No dashboard, database, ML, strategy, backtest, or generic framework until the roadmap gate allows it.
8. Public data only unless a later checkpoint explicitly documents why authenticated public-data access is required.
9. "Production-ready" is forbidden until the collector has completed a documented 24h soak test with acceptable quality.
## Expected Workflow
For each checkpoint:
1. Define the smallest useful checkpoint.
2. Build only what is needed for that checkpoint.
3. Validate with real commands and, when applicable, real public data.
4. Write a machine-readable manifest and a short markdown note.
5. State PASS, FAIL, or BLOCKED.
6. Identify the strongest fake-progress risk.
7. Recommend the next smallest step.
8. Stop only when a real user or orchestrator decision is needed.
## Repository Conventions
- `scripts/`: executable probes, discovery scripts, collectors, normalizers, and upload helpers.
- `config/`: example configuration only. Real secrets and machine-local config stay outside git.
- `docs/`: durable methodology, data contracts, operational runbooks, and endpoint notes.
- `orchestration/prompts/`: prompts and templates used by future agents.
- `data/probes/`: bounded endpoint probe outputs and probe notes.
- `data/discovery/`: market discovery outputs and manifests.
- `data/live_sample/`: short sample collector runs.
- `data/normalized_sample/`: derived sample outputs generated from raw samples.
- `data/manifests/`: machine-readable manifests for probes, collectors, normalization, uploads, and checkpoints.
- `reports/`: human-readable checkpoint, soak test, and incident reports.
- `systemd/`: VPS runtime units when added.
The initial Polymarket implementation should remain simple scripts until the collector works. Introduce `collectors/<market_name>/` only when adding a second market or when duplication proves painful.
## Artifact Status Labels
Every durable artifact should be treated as one of:
- `valid`: current and usable.
- `partial`: useful but incomplete.
- `deprecated`: superseded by a newer artifact.
- `invalid`: known to be wrong or misleading.
When marking an artifact `deprecated` or `invalid`, write a sibling markdown note or manifest entry with:
- original artifact path
- status
- reason
- replacement path, if any
- labeled_at_utc
- labeled_by
Do not remove the original artifact unless the user explicitly asks and there is a written reason.
## Adding New Market Connectors Later
Before adding a second market, Polymarket must have working discovery, raw order-book collection, Google Drive offload, and a 24h soak test.
When the gate is met:
1. Create `collectors/<market_name>/` for market-specific code.
2. Keep shared code minimal and concrete.
3. Reuse the same raw-first file layout and manifest format.
4. Document endpoint quirks, timestamp semantics, rate limits, and schema differences in `docs/`.
5. Avoid abstract base classes until at least two real collectors expose repeated code that is painful to maintain.

28
Dockerfile Normal file
View file

@ -0,0 +1,28 @@
# Runtime image for the orderbooks scripts; the default command starts the
# Polymarket collector loop.
FROM python:3.12-slim

# Python runtime flags plus the paths the scripts read from the environment.
ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    ORDERBOOKS_APP_DIR=/app \
    ORDERBOOKS_DATA_DIR=/var/lib/orderbooks \
    ORDERBOOKS_PYTHON=python3

# OS packages and a fixed-id, no-login service account (uid/gid 10001) whose
# home is the data directory.
RUN apt-get update \
    && apt-get install -y --no-install-recommends bash ca-certificates rclone \
    && rm -rf /var/lib/apt/lists/* \
    && groupadd --system --gid 10001 orderbooks \
    && useradd --system --uid 10001 --gid 10001 --home-dir /var/lib/orderbooks --shell /usr/sbin/nologin orderbooks

WORKDIR /app

# Only docs, example config and scripts are copied into the image.
COPY AGENTS.md ROADMAP.md ./
COPY config/ config/
COPY docs/ docs/
COPY scripts/ scripts/

# Make shell scripts executable, pre-create the data subdirectories, and hand
# both trees to the service account.
RUN chmod +x scripts/*.sh \
    && mkdir -p /var/lib/orderbooks/discovery /var/lib/orderbooks/raw_orderbooks /var/lib/orderbooks/manifests \
    && chown -R orderbooks:orderbooks /var/lib/orderbooks /app

# Run as the numeric uid/gid created above.
USER 10001:10001

CMD ["/bin/bash", "/app/scripts/run_polymarket_collector_loop.sh"]

212
ROADMAP.md Normal file
View file

@ -0,0 +1,212 @@
# Roadmap
Project: Cross-Market Live Orderbook Archive
Goal: build a reliable, minimal, always-on archive of live market microstructure data so future research agents can test whether strategies were actually observable, fillable, and reproducible in real time.
The roadmap is checkpoint-driven. Each checkpoint must leave durable artifacts, validation evidence, and an explicit gate result.
## Current Status
- Latest completed checkpoint: Checkpoint 7, Google Drive Offload
- Latest gate: PASS
- Next checkpoint: Checkpoint 8, 24h Soak Test Plan
- Initial market: Polymarket
- Future market work: gated until Polymarket is stable
## Checkpoint 1: Project Scaffold And Methodology
Goal: create the minimum repository structure and rules that keep future agents on track.
Artifacts:
- `AGENTS.md`
- `ROADMAP.md`
- `docs/METHODOLOGY.md`
- `docs/DATA_CONTRACT.md`
- `docs/OPERATIONS.md`
- `orchestration/prompts/`
Requirements:
- Define project goal.
- Define anti-fake-progress rules.
- Define raw-first storage policy.
- Define checkpoint reporting format.
- Define no-trading/no-private-key policy.
- Define how to label deprecated or misleading artifacts instead of deleting them.
- Define how new market connectors should be added later.
Pass condition: the repo contains durable project rules and the next checkpoint is specific enough to execute.
## Checkpoint 2: Polymarket Public Data Source Probe
Goal: determine exactly which public Polymarket endpoints can support live collection.
Questions:
- How to discover active Polymarket markets?
- How to filter BTC up/down markets?
- How to resolve conditionId and token IDs?
- How to fetch current order book for one token?
- Is there a batch order-book endpoint?
- Is there a market websocket for order-book updates?
- Is there a trade websocket or recent trades endpoint?
- What rate limits are documented or observed?
- What fields are returned?
- What timestamps exist?
Artifacts:
- `scripts/probe_polymarket_public_sources.py`
- `data/probes/polymarket_public_sources_probe_v1.json`
- `data/probes/polymarket_public_sources_probe_v1.md`
Pass condition: we know the exact endpoint set and can fetch at least one active market metadata record and one current order book.
## Checkpoint 3: Minimal BTC Market Discovery
Goal: build a small script that finds active BTC up/down Polymarket markets and resolves both outcome token IDs.
Artifacts:
- `scripts/discover_polymarket_btc_markets.py`
- `data/discovery/polymarket_btc_markets_latest.json`
- `data/discovery/polymarket_btc_markets_manifest.json`
- `data/discovery/polymarket_btc_markets.md`
Requirements:
- Public endpoints only.
- No trading.
- No API keys unless strictly needed for public data.
- Never store secrets in the repo.
- Preserve raw metadata responses.
- Write normalized market records with slug, question, conditionId, token IDs, outcomes, times, status, source, and `fetched_at_utc`.
Pass condition: the script reliably outputs currently active BTC markets with token IDs.
## Checkpoint 4: Minimal Orderbook Snapshot Collector
Goal: collect raw order-book snapshots for active BTC markets at a fixed interval.
Artifacts:
- `scripts/collect_polymarket_orderbooks.py`
- `config/polymarket_collector.example.yaml`
- `data/live_sample/...`
- `data/manifests/orderbook_collector_sample_manifest.json`
- `docs/POLYMARKET_COLLECTOR.md`
Requirements:
- Collect active BTC markets only.
- Fetch order books for both outcome tokens.
- Store raw API responses as gzip JSONL.
- Add local `collected_at_utc`, collector version, endpoint URL, and request params.
- Rotate files by hour or run.
- Include a manifest with timing, markets, request counts, status codes, rows, output files, and checksums.
- Handle graceful shutdown and rate limits.
- Do not add a database.
Pass condition: a 5-10 minute sample run creates valid compressed raw snapshots and a manifest.
## Checkpoint 5: Normalized Snapshot Extract
Goal: create a derived normalized dataset from raw snapshots while preserving raw files as source of truth.
Artifacts:
- `scripts/normalize_polymarket_orderbooks.py`
- `data/normalized_sample/...`
- `data/manifests/orderbook_normalization_sample_manifest.json`
- `docs/ORDERBOOK_SCHEMA.md`
Pass condition: a sample raw file can be normalized and basic sanity checks pass.
## Checkpoint 6: VPS Runtime Package
Goal: make the collector deployable on a small VPS.
Artifacts:
- `systemd/polymarket-orderbook-collector.service`
- `config/polymarket_collector.vps.example.yaml`
- `scripts/run_polymarket_collector_cycle.sh`
- `docs/VPS_DEPLOYMENT.md`
Uploader service and timer units are deferred to Checkpoint 7 with Google Drive
offload. Creating empty uploader units in Checkpoint 6 would be fake progress.
Pass condition: a user can follow docs on a VPS and run the collector.
## Checkpoint 7: Google Drive Offload
Goal: add periodic upload to Google Drive using `rclone`.
Artifacts:
- `scripts/upload_archive_rclone.sh`
- `config/rclone.example.md`
- `docs/GOOGLE_DRIVE_OFFLOAD.md`
- sample upload manifest format
Pass condition: a dry-run and a real small test upload succeed and are documented.
## Checkpoint 8: 24h Soak Test Plan
Goal: run the collector for a real 24h period and validate reliability.
Artifacts:
- `reports/soak_test_YYYY-MM-DD.md`
- `data/manifests/...`
Metrics:
- uptime
- markets tracked
- total snapshots
- missed interval estimate
- API errors
- rate limits
- file sizes
- compression ratio
- Google Drive upload status
- restart behavior
- disk usage
- data quality checks
Pass condition: a 24h run completes with acceptable data quality and documented issues.
## Checkpoint 9: Add Second Market Only After Polymarket Is Stable
Goal: prepare for NEAR or another market only after Polymarket collector reliability is proven.
Do not start this checkpoint until:
- Polymarket discovery works.
- Polymarket order-book collection works.
- Google Drive offload works.
- The 24h soak test is complete.
Architecture principles:
- Use `collectors/<market_name>/` only when adding the second market.
- Keep shared code minimal.
- Avoid abstract base classes until duplication is painful.
- Keep raw-first, normalized-second, manifest-always file format consistent across markets.
## Anti-Fake-Progress Gates
- No dashboard before 24h data reliability.
- No database before the file archive becomes painful.
- No strategy or backtest code in this project.
- No live trading.
- No generic multi-market abstraction before the second market exists.
- No claiming "production-ready" before a 24h soak test.
- No deleting bad artifacts; label them deprecated or invalid and write why.
## Next Smallest Step
Checkpoint 8 is next. It should run the collector for a real 24h period and record the Checkpoint 8 metrics (uptime, missed intervals, API errors, rate limits, upload status, restart behavior, disk usage, and data quality checks) in a soak test report.

View file

@ -0,0 +1,20 @@
# Example config for the bounded Checkpoint 4 Polymarket order-book sample.
# This file contains no secrets. The collector reads only public endpoints.

# Input and output locations (relative paths under data/).
discovery_path: data/discovery/polymarket_btc_markets_latest.json
output_dir: data/live_sample
manifest_path: data/manifests/orderbook_collector_sample_manifest.json

# Keep the default sample deliberately small to avoid unnecessary endpoint load.
market_limit: 2
interval_seconds: 30
duration_seconds: 300

# Do not start tracking markets too close to their end time. Default covers
# the 5-minute sample duration plus a 2-minute buffer.
market_end_safety_seconds: 420

# Public order-book endpoint and request handling.
clob_books_url: https://clob.polymarket.com/books
request_timeout_seconds: 15
max_retries: 2
backoff_seconds: 2

View file

@ -0,0 +1,17 @@
# Checkpoint 6 VPS example config for the raw Polymarket order-book collector.
# Copy to /etc/orderbooks/polymarket_collector.vps.yaml on a VPS and edit paths
# if the service uses a different data directory.

# Input and output locations under the service data directory.
discovery_path: /var/lib/orderbooks/discovery/polymarket_btc_markets_latest.json
output_dir: /var/lib/orderbooks/raw_orderbooks
manifest_path: /var/lib/orderbooks/manifests/polymarket_orderbook_collector_latest.json

# Collection bounds and cadence.
market_limit: 2
interval_seconds: 30
duration_seconds: 300
market_end_safety_seconds: 420

# Public order-book endpoint and request handling.
clob_books_url: https://clob.polymarket.com/books
request_timeout_seconds: 15
max_retries: 2
backoff_seconds: 2

76
config/rclone.example.md Normal file
View file

@ -0,0 +1,76 @@
# rclone Configuration Example
Status: valid
This file documents the expected `rclone` setup for Checkpoint 7. It is not an
`rclone.conf` file and must not be copied into the repository with private auth
material.
## Remote Name
The examples use this remote path:
```text
gdrive:orderbooks/polymarket
```
You may choose another remote name or folder. The uploader reads the destination
from:
```text
ORDERBOOKS_RCLONE_DEST
```
For the systemd service, set it in:
```text
/etc/orderbooks/orderbook-uploader.env
```
Example:
```text
ORDERBOOKS_RCLONE_DEST=gdrive:orderbooks/polymarket
```
Do not place private auth files, browser tokens, API keys, wallet material, or
session material in this repository.
## Configure Google Drive Outside The Repo
Install `rclone` on the VPS, then configure the remote as the service user or
with a root-managed config path that the service can read:
```sh
sudo apt-get install -y rclone
sudo -u orderbooks rclone config
sudo -u orderbooks rclone lsd gdrive:
```
If the service user uses the default rclone config path, keep that file outside
the repository under the service user's home/config directory.
## Uploader Environment File
Create:
```text
/etc/orderbooks/orderbook-uploader.env
```
Minimal example:
```text
ORDERBOOKS_RCLONE_DEST=gdrive:orderbooks/polymarket
```
Optional overrides:
```text
ORDERBOOKS_UPLOAD_DATA_DIR=/var/lib/orderbooks
ORDERBOOKS_UPLOAD_MIN_AGE_SECONDS=600
ORDERBOOKS_UPLOAD_RETENTION_DAYS=7
ORDERBOOKS_RCLONE_BIN=/usr/bin/rclone
```
The environment file belongs on the VPS. Do not commit a machine-local version.

View file

@ -0,0 +1,25 @@
# Collector configuration, mounted into the collector pod as
# /etc/orderbooks/polymarket_collector.yaml.
apiVersion: v1
kind: ConfigMap
metadata:
  name: orderbooks-collector-config
  namespace: orderbooks
  labels:
    app.kubernetes.io/name: orderbooks
    app.kubernetes.io/part-of: orderbooks
    app.kubernetes.io/component: collector
    app.kubernetes.io/managed-by: kustomize
data:
  # All paths point into the /var/lib/orderbooks data volume.
  polymarket_collector.yaml: |
    discovery_path: /var/lib/orderbooks/discovery/polymarket_btc_markets_latest.json
    output_dir: /var/lib/orderbooks/raw_orderbooks
    manifest_path: /var/lib/orderbooks/manifests/polymarket_orderbook_collector_latest.json
    market_limit: 2
    interval_seconds: 30
    duration_seconds: 300
    market_end_safety_seconds: 420
    clob_books_url: https://clob.polymarket.com/books
    request_timeout_seconds: 15
    max_retries: 2
    backoff_seconds: 2

View file

@ -0,0 +1,92 @@
# Periodic rclone offload: every 15 minutes runs upload_archive_rclone.sh
# with --execute against the shared orderbooks data volume.
apiVersion: batch/v1
kind: CronJob
metadata:
  name: orderbooks-uploader
  namespace: orderbooks
  labels:
    app.kubernetes.io/name: orderbooks
    app.kubernetes.io/part-of: orderbooks
    app.kubernetes.io/component: uploader
spec:
  schedule: "*/15 * * * *"
  # Never start a new upload while the previous one is still running.
  concurrencyPolicy: Forbid
  successfulJobsHistoryLimit: 3
  failedJobsHistoryLimit: 3
  jobTemplate:
    spec:
      # No retries within a run; the next scheduled run is the retry.
      backoffLimit: 0
      ttlSecondsAfterFinished: 86400
      template:
        metadata:
          labels:
            app.kubernetes.io/name: orderbooks
            app.kubernetes.io/part-of: orderbooks
            app.kubernetes.io/component: uploader
        spec:
          restartPolicy: Never
          imagePullSecrets:
            - name: orderbooks-registry-creds
          # Same uid/gid (10001) as the user created in the image.
          securityContext:
            runAsNonRoot: true
            runAsUser: 10001
            runAsGroup: 10001
            fsGroup: 10001
            fsGroupChangePolicy: OnRootMismatch
          containers:
            - name: uploader
              # Placeholder tag; the deploy workflow replaces it with the built image.
              image: registry.doran.133011.xyz/orderbooks:bootstrap
              imagePullPolicy: IfNotPresent
              command:
                - /bin/bash
                - /app/scripts/upload_archive_rclone.sh
                - --execute
              env:
                - name: ORDERBOOKS_DATA_DIR
                  value: /var/lib/orderbooks
                - name: ORDERBOOKS_UPLOAD_DATA_DIR
                  value: /var/lib/orderbooks
                - name: ORDERBOOKS_UPLOAD_RAW_DIR
                  value: /var/lib/orderbooks/raw_orderbooks
                - name: ORDERBOOKS_UPLOAD_SOURCE_MANIFEST_DIR
                  value: /var/lib/orderbooks/manifests
                - name: ORDERBOOKS_UPLOAD_MANIFEST_DIR
                  value: /var/lib/orderbooks/manifests
                - name: ORDERBOOKS_UPLOAD_MIN_AGE_SECONDS
                  value: "600"
                - name: ORDERBOOKS_UPLOAD_RETENTION_DAYS
                  value: "7"
                - name: ORDERBOOKS_RCLONE_BIN
                  value: /usr/bin/rclone
                - name: ORDERBOOKS_RCLONE_DEST
                  value: gdrive:orderbooks/polymarket
                - name: RCLONE_CONFIG
                  value: /etc/rclone/rclone.conf
              volumeMounts:
                - name: orderbooks-data
                  mountPath: /var/lib/orderbooks
                # rclone.conf comes from a Secret and is mounted read-only.
                - name: rclone-config
                  mountPath: /etc/rclone/rclone.conf
                  subPath: rclone.conf
                  readOnly: true
              resources:
                requests:
                  cpu: 50m
                  memory: 128Mi
                limits:
                  cpu: 500m
                  memory: 512Mi
              securityContext:
                allowPrivilegeEscalation: false
                capabilities:
                  drop:
                    - ALL
          volumes:
            - name: orderbooks-data
              persistentVolumeClaim:
                claimName: orderbooks-data
            - name: rclone-config
              secret:
                secretName: orderbooks-rclone-config
                items:
                  - key: rclone.conf
                    path: rclone.conf

View file

@ -0,0 +1,86 @@
# Single-replica collector that runs run_polymarket_collector_loop.sh and
# writes into the orderbooks data volume.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: orderbooks-collector
  namespace: orderbooks
  labels:
    app.kubernetes.io/name: orderbooks
    app.kubernetes.io/part-of: orderbooks
    app.kubernetes.io/component: collector
spec:
  replicas: 1
  # Recreate stops the old pod before the new one starts.
  strategy:
    type: Recreate
  selector:
    matchLabels:
      app.kubernetes.io/name: orderbooks
      app.kubernetes.io/component: collector
  template:
    metadata:
      labels:
        app.kubernetes.io/name: orderbooks
        app.kubernetes.io/part-of: orderbooks
        app.kubernetes.io/component: collector
    spec:
      terminationGracePeriodSeconds: 120
      imagePullSecrets:
        - name: orderbooks-registry-creds
      # Same uid/gid (10001) as the user created in the image.
      securityContext:
        runAsNonRoot: true
        runAsUser: 10001
        runAsGroup: 10001
        fsGroup: 10001
        fsGroupChangePolicy: OnRootMismatch
      containers:
        - name: collector
          # Placeholder tag; the deploy workflow replaces it with the built image.
          image: registry.doran.133011.xyz/orderbooks:bootstrap
          imagePullPolicy: IfNotPresent
          command:
            - /bin/bash
            - /app/scripts/run_polymarket_collector_loop.sh
          env:
            - name: ORDERBOOKS_APP_DIR
              value: /app
            - name: ORDERBOOKS_PYTHON
              value: python3
            - name: ORDERBOOKS_DATA_DIR
              value: /var/lib/orderbooks
            - name: ORDERBOOKS_COLLECTOR_CONFIG
              value: /etc/orderbooks/polymarket_collector.yaml
            - name: ORDERBOOKS_DISCOVERY_DIR
              value: /var/lib/orderbooks/discovery
            - name: ORDERBOOKS_OUTPUT_DIR
              value: /var/lib/orderbooks/raw_orderbooks
            - name: ORDERBOOKS_MANIFEST_DIR
              value: /var/lib/orderbooks/manifests
            - name: ORDERBOOKS_LOOP_SLEEP_SECONDS
              value: "15"
          volumeMounts:
            - name: orderbooks-data
              mountPath: /var/lib/orderbooks
            # Single file from the ConfigMap, mounted read-only.
            - name: collector-config
              mountPath: /etc/orderbooks/polymarket_collector.yaml
              subPath: polymarket_collector.yaml
              readOnly: true
          resources:
            requests:
              cpu: 50m
              memory: 128Mi
            limits:
              cpu: 500m
              memory: 512Mi
          securityContext:
            allowPrivilegeEscalation: false
            capabilities:
              drop:
                - ALL
      volumes:
        - name: orderbooks-data
          persistentVolumeClaim:
            claimName: orderbooks-data
        - name: collector-config
          configMap:
            name: orderbooks-collector-config
            items:
              - key: polymarket_collector.yaml
                path: polymarket_collector.yaml

View file

@ -0,0 +1,9 @@
# Kustomize base: every resource below is placed in the orderbooks namespace.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization

namespace: orderbooks

resources:
  - namespace.yaml
  - configmap.yaml
  - pvc.yaml
  - deployment-collector.yaml
  - cronjob-uploader.yaml

View file

@ -0,0 +1,7 @@
# Namespace that holds all orderbooks resources.
apiVersion: v1
kind: Namespace
metadata:
  name: orderbooks
  labels:
    app.kubernetes.io/name: orderbooks
    app.kubernetes.io/part-of: orderbooks

15
deploy/k8s/base/pvc.yaml Normal file
View file

@ -0,0 +1,15 @@
# Data volume claimed by both the collector Deployment and the uploader CronJob.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: orderbooks-data
  namespace: orderbooks
  labels:
    app.kubernetes.io/name: orderbooks
    app.kubernetes.io/part-of: orderbooks
spec:
  accessModes:
    - ReadWriteOnce
  storageClassName: local-path
  resources:
    requests:
      storage: 10Gi

168
docs/DATA_CONTRACT.md Normal file
View file

@ -0,0 +1,168 @@
# Data Contract
The archive is raw-first. Raw market data must be preserved before normalization, aggregation, upload, or analysis.
## Storage Principles
- Store the raw response payload exactly as received whenever practical.
- Add collector metadata beside the raw payload, not inside it.
- Use UTC timestamps in ISO 8601 format with a `Z` suffix.
- Use gzip JSONL for high-frequency snapshot data.
- Rotate live collection files by hour or run.
- Include checksums in manifests for all closed files.
- Keep normalized files derived and traceable back to raw files.
- Never store secrets, cookies, private keys, wallet material, or authenticated session state.
## Directory Layout
Initial expected layout:
```text
data/
  probes/
  discovery/
  live_sample/
  normalized_sample/
  manifests/
reports/
  checkpoints/
```
Future sustained collection layout:
```text
data/
  raw/
    polymarket/
      orderbooks/
        YYYY/
          MM/
            DD/
              HH/
                polymarket_orderbooks_YYYYMMDDTHHMMSSZ.jsonl.gz
  normalized/
    polymarket/
      orderbooks/
        YYYY/
          MM/
            DD/
              polymarket_orderbooks_normalized_YYYYMMDD.jsonl.gz
  manifests/
```
Do not create a database until compressed file archives are proven painful.
## Raw Orderbook Snapshot Envelope
Checkpoint 4 should store one JSON object per line using this envelope or a documented successor:
```json
{
"schema_name": "raw_orderbook_snapshot",
"schema_version": 1,
"collector": {
"name": "polymarket_orderbook_collector",
"version": "0.1.0"
},
"market": {
"market_name": "polymarket",
"market_slug": "example-slug",
"condition_id": "0x...",
"token_id": "123",
"outcome": "Yes"
},
"collection": {
"collected_at_utc": "2026-04-14T20:53:49Z",
"sequence": 1
},
"request": {
"method": "GET",
"url": "https://example.invalid/orderbook",
"params": {
"token_id": "123"
},
"status_code": 200,
"duration_ms": 123
},
"raw": {}
}
```
`raw` is the unmodified response payload. If the endpoint returns text or bytes, record encoding and store a lossless representation.
## Discovery Record Fields
Checkpoint 3 normalized market records should include:
- `market_name`
- `market_slug`
- `title` or `question`
- `condition_id`
- `tokens`
- `outcomes`
- `start_time_utc`, if available
- `end_time_utc`, if available
- `active`
- `closed`
- `endpoint_source`
- `fetched_at_utc`
- `raw_ref`
`tokens` should preserve the mapping between outcome labels and token IDs.
## Normalized Snapshot Fields
Checkpoint 5 normalized records should include:
- `market_name`
- `market_slug`
- `condition_id`
- `token_id`
- `outcome`
- `collected_at_utc`
- `best_bid`
- `best_ask`
- `spread`
- `midpoint`
- `bid_depth_total`
- `ask_depth_total`
- `bid_depth_within_1c`
- `ask_depth_within_1c`
- `bid_depth_within_2c`
- `ask_depth_within_2c`
- `bid_depth_within_5c`
- `ask_depth_within_5c`
- `raw_file`
- `raw_line_number`, when feasible
Normalized data is invalid if it cannot reference the raw source record.
## Manifest Requirements
Collection and transformation manifests should include:
- manifest schema name and version
- checkpoint or process name
- start and end timestamps
- market names and market IDs tracked
- input files
- output files
- request counts
- success and failure counts
- status-code counts
- row counts
- checksums for closed files
- command used
- config path or config digest
- warnings and known gaps
- gate status
Checksums should use SHA-256 unless a later report explains why another hash is used.
## Timestamp Policy
- `collected_at_utc`: local collector timestamp taken as close as possible to receipt of data.
- `fetched_at_utc`: timestamp for metadata or discovery fetches.
- Endpoint-provided timestamps must be preserved under their original field names in `raw`.
- If endpoint timestamp semantics are unclear, write the ambiguity into the probe report.

View file

@ -0,0 +1,294 @@
# Google Drive Offload
Status: valid
This document covers Checkpoint 7: offloading closed raw collector files and
manifests to Google Drive with `rclone`.
This checkpoint does not prove production readiness or 24/7 reliability. A real
small upload must be run with a configured remote, and the later 24h soak test
must still pass.
## Scope
Included:
- `scripts/upload_archive_rclone.sh`
- `systemd/polymarket-orderbook-uploader.service`
- `systemd/polymarket-orderbook-uploader.timer`
- dry-run mode by default
- real upload only with `--execute`
- rclone verification with `rclone check`
- per-run upload manifests
- optional local cleanup only after successful verification
Excluded:
- dashboards
- databases
- strategies or backtests
- trading, signing, order placement, or wallet logic
- hardcoded private auth material
## Install rclone
On Ubuntu or Debian:
```sh
sudo apt-get update
sudo apt-get install -y rclone
```
Confirm:
```sh
rclone version
```
## Configure A Google Drive Remote
Configure the remote outside this repository. For a service-user setup:
```sh
sudo -u orderbooks rclone config
sudo -u orderbooks rclone lsd gdrive:
```
The example remote path is:
```text
gdrive:orderbooks/polymarket
```
Any valid `rclone` destination may be used. The uploader reads it from:
```text
ORDERBOOKS_RCLONE_DEST
```
For systemd, create:
```text
/etc/orderbooks/orderbook-uploader.env
```
Example:
```text
ORDERBOOKS_RCLONE_DEST=gdrive:orderbooks/polymarket
```
Do not commit the machine-local rclone config or any private auth material.
## What Gets Uploaded
By default the script targets:
| Source | Default path |
| --- | --- |
| raw collector files | `/var/lib/orderbooks/raw_orderbooks` |
| collector manifests | `/var/lib/orderbooks/manifests` |
It does not target normalized sample files by default.
Files modified within the last 10 minutes are skipped so that files the collector
may still be writing are not uploaded. The minimum age is set by:
```text
ORDERBOOKS_UPLOAD_MIN_AGE_SECONDS=600
```
The script preserves each file's path relative to the data directory on the
remote. For example:
```text
/var/lib/orderbooks/raw_orderbooks/polymarket/orderbooks/<run_id>/file.jsonl.gz
```
uploads to:
```text
<remote>/raw_orderbooks/polymarket/orderbooks/<run_id>/file.jsonl.gz
```
## Dry Run
Dry-run is the default. It plans files, stages a temporary copy, invokes
`rclone copy --dry-run`, and writes an upload manifest.
Example for a VPS:
```sh
/opt/orderbooks/scripts/upload_archive_rclone.sh \
--data-dir /var/lib/orderbooks \
--dest "$ORDERBOOKS_RCLONE_DEST"
```
Example against the repository sample data:
```sh
scripts/upload_archive_rclone.sh \
--data-dir data \
--dest gdrive:orderbooks/polymarket/checkpoint7-test \
--manifest-path data/manifests/upload_archive_real_test_dry_run_manifest.json \
--min-age-seconds 0 \
--rclone-bin /usr/bin/rclone
```
Dry-run does not prove remote write access.
## Execute Upload
Run a real upload only after the remote is configured and the dry-run plan looks
right:
```sh
/opt/orderbooks/scripts/upload_archive_rclone.sh \
--execute \
--data-dir /var/lib/orderbooks \
--dest "$ORDERBOOKS_RCLONE_DEST"
```
The script runs:
```text
rclone copy <staged files> <remote> --checksum
rclone check <staged files> <remote> --one-way --checksum
```
The upload gate is `PASS` only when the copy succeeds and verification succeeds.
## Retention And Cleanup
Local files are kept by default, even after upload verification.
Cleanup requires an explicit flag:
```sh
/opt/orderbooks/scripts/upload_archive_rclone.sh \
--execute \
--cleanup-after-verify \
--retention-days 7 \
--data-dir /var/lib/orderbooks \
--dest "$ORDERBOOKS_RCLONE_DEST"
```
Cleanup deletes only files that were selected for upload, uploaded, verified, and
older than the retention window. The default retention window is 7 days.
## Upload Manifest
Each run writes a manifest such as:
```text
/var/lib/orderbooks/manifests/upload_archive_YYYYMMDDTHHMMSSZ.json
```
The manifest records:
- planned files
- attempted files
- dry-run files
- uploaded files
- verified files
- skipped open or recent files
- retained local files
- deleted local files
- SHA-256 checksums
- command mode
- start/end time
- rclone copy/check exit codes
- gate status
For this repository, the sample manifest path is:
```text
data/manifests/upload_archive_sample_manifest.json
```
The verified Checkpoint 7 real-test manifest is:
```text
data/manifests/upload_archive_real_test_manifest.json
```
## systemd Timer
Install the unit files:
```sh
sudo install -o root -g root -m 0644 /opt/orderbooks/systemd/polymarket-orderbook-uploader.service /etc/systemd/system/polymarket-orderbook-uploader.service
sudo install -o root -g root -m 0644 /opt/orderbooks/systemd/polymarket-orderbook-uploader.timer /etc/systemd/system/polymarket-orderbook-uploader.timer
sudo systemctl daemon-reload
```
Create the environment file:
```sh
sudo install -o root -g orderbooks -m 0640 /dev/null /etc/orderbooks/orderbook-uploader.env
sudo editor /etc/orderbooks/orderbook-uploader.env
```
At minimum, set:
```text
ORDERBOOKS_RCLONE_DEST=gdrive:orderbooks/polymarket
```
Enable the timer:
```sh
sudo systemctl enable --now polymarket-orderbook-uploader.timer
```
Run one upload immediately:
```sh
sudo systemctl start polymarket-orderbook-uploader.service
```
## Logs
Use the systemd journal:
```sh
sudo systemctl status polymarket-orderbook-uploader.service
sudo journalctl -u polymarket-orderbook-uploader.service -f
sudo systemctl list-timers polymarket-orderbook-uploader.timer
```
## Current Checkpoint 7 Result
Initial local validation was blocked because `rclone` was unavailable. The
manifest from that blocked run remains at:
```text
data/manifests/upload_archive_sample_manifest.json
```
After `rclone` was configured as `/usr/bin/rclone` with remote `gdrive:`, a dry
run and one tiny real upload were run against:
```text
gdrive:orderbooks/polymarket/checkpoint7-test
```
The real upload manifest records `rclone copy` exit code 0 and `rclone check`
exit code 0:
```text
data/manifests/upload_archive_real_test_manifest.json
```
Current gate:
```text
PASS
```
## What Remains Unproven
- Long-run upload reliability.
- Interaction between hourly uploads and a 24h collector soak test.
- Retention cleanup after verified upload.
- Production readiness.

View file

@ -0,0 +1,148 @@
# Kubernetes Deployment
Status: draft runtime package for Checkpoint 8G
This document describes the Kubernetes package for the Polymarket raw
order-book collector. It follows the shared Hetzner k3s cluster model from
`../nuri/unrip3`: application code, Dockerfile, manifests, and Forgejo workflow
live in this repository; platform services, the shared registry, and the shared
Forgejo runner remain platform-owned.
This package does not claim production readiness. Production readiness still
requires a real Kubernetes runtime smoke run with preserved evidence.
## Cluster Decisions
- Namespace: `orderbooks`
- Workstation kubeconfig for validation: `../nuri/unrip3/.state/hetzner/kubeconfig.yaml`
- Shared registry and shared Forgejo runner
- Existing rclone Secret: `orderbooks/orderbooks-rclone-config`
- Secret key mounted by the uploader: `rclone.conf`
Do not commit or print rclone config contents.
## Runtime Layout
The collector and uploader share one PVC:
```text
PVC: orderbooks-data
mount: /var/lib/orderbooks
raw files: /var/lib/orderbooks/raw_orderbooks
manifests: /var/lib/orderbooks/manifests
discovery: /var/lib/orderbooks/discovery
```
The collector uses one Deployment with one replica. The container runs
`/app/scripts/run_polymarket_collector_loop.sh`, which repeatedly executes the
existing bounded collector cycle and records loop failure/interruption manifests
instead of relying on Kubernetes crash loops for normal operation.
The uploader uses one CronJob. It runs the existing rclone uploader in execute
mode, mounts the same PVC, mounts `orderbooks-rclone-config` read-only at
`/etc/rclone/rclone.conf`, sets `RCLONE_CONFIG` to that file, and uploads only
closed/aged files.
## Bootstrap This App Repo
Run the orderbooks-specific bootstrap from this repository:
```sh
scripts/deploy/bootstrap_orderbooks_k8s.sh
```
The bootstrap loads platform defaults and resolved secrets from the local
platform state without printing secret values. It ensures namespace `orderbooks`,
creates or updates `orderbooks-registry-creds`, verifies the existing
`orderbooks-rclone-config` secret has key `rclone.conf`, creates or updates the
Forgejo repo `philipp/orderbooks`, and upserts the required Actions secret and
variables.
After bootstrap, push a clean source tree to Forgejo `main`. Do not push local
`data/`, `artifacts/`, `reports/`, `orchestration/`, kubeconfigs, rclone config,
`.env`, private keys, or other local evidence/secrets.
## Image Build And Deploy
The Forgejo workflow is `.forgejo/workflows/deploy.yml`. It follows the shared
runner pattern:
1. load `KUBECONFIG_B64` from Forgejo secrets;
2. clone this repo inside the runner;
3. create an in-cluster Kaniko Job;
4. build and push `REGISTRY_HOST/orderbooks:<git-sha>`;
5. apply `deploy/k8s/base` with the built image;
6. wait for `deployment/orderbooks-collector` rollout.
Required Forgejo repo secret:
```text
KUBECONFIG_B64
```
Required Forgejo repo variable:
```text
REGISTRY_HOST
```
Project defaults used by the workflow:
```text
PROJECT_NAME=orderbooks
PROJECT_NAMESPACE=orderbooks
PROJECT_DEPLOYMENTS=orderbooks-collector
PROJECT_REGISTRY_SECRET_NAME=orderbooks-registry-creds
```
The registry pull/build secret `orderbooks-registry-creds` must exist in the
`orderbooks` namespace before the workflow builds and deploys.
## Pre-Deploy Validation
From this repository:
```sh
bash -n scripts/run_polymarket_collector_loop.sh
bash -n scripts/k8s_runtime_smoke_check.sh
kubectl kustomize deploy/k8s/base
KUBECONFIG=../nuri/unrip3/.state/hetzner/kubeconfig.yaml kubectl apply -k deploy/k8s/base --dry-run=server
KUBECONFIG=../nuri/unrip3/.state/hetzner/kubeconfig.yaml kubectl -n orderbooks get secret orderbooks-rclone-config -o go-template='{{if index .data "rclone.conf"}}rclone_secret_key_present{{else}}rclone_secret_key_missing{{end}}{{"\n"}}'
```
The last command checks only whether the key exists. It must not print secret
data.
## Runtime Smoke Gate
After the image is built and the workload is actually deployed, run:
```sh
KUBECONFIG=../nuri/unrip3/.state/hetzner/kubeconfig.yaml scripts/k8s_runtime_smoke_check.sh \
  --namespace orderbooks \
  --deployment orderbooks-collector \
  --cronjob orderbooks-uploader \
  --raw-dir /var/lib/orderbooks/raw_orderbooks \
  --manifest-dir /var/lib/orderbooks/manifests \
  --wait-seconds 1800 \
  --upload-min-age-seconds 600
```
The smoke gate uses `kubectl`, not systemd. It writes local JSON evidence under
`data/manifests/k8s_runtime_smoke_<UTC_TIMESTAMP>.json` by default. It verifies:
- collector pod is running;
- latest collector manifest has `gate_status: PASS`, `rows_written > 0`, and
`failure_count: 0`;
- raw gzip JSONL parses and is under `/var/lib/orderbooks/raw_orderbooks`;
- deleting the collector pod does not corrupt the old raw file checksum or row
count;
- a later post-restart collector cycle writes valid rows;
- an uploader Job created from the CronJob completes;
- the latest upload manifest records a verified rclone upload with at least one
verified file.
A failed smoke run still writes JSON evidence and exits nonzero. Preserve failed
manifests, raw files, upload manifests, and pod logs for review.
## Not Included
- No trading, signing, wallets, private keys, or API keys.
- No dashboard, database, strategy, backtest, or second-market connector.
- No websocket rewrite.
- No rclone config contents in this repository.

104
docs/METHODOLOGY.md Normal file
View file

@ -0,0 +1,104 @@
# Methodology
This project uses checkpoint-driven compound engineering. The point is to preserve useful data and operational learning, not to accumulate scaffolding.
## Checkpoint Cycle
Every checkpoint follows the same loop:
1. Define the smallest useful checkpoint.
2. Build only what is required for that checkpoint.
3. Validate with real commands and real data when applicable.
4. Write durable artifacts: code or docs, config or run instructions, manifest/report, and validation evidence.
5. State the gate: `PASS`, `FAIL`, `BLOCKED`, or `PARTIAL`.
6. Identify the strongest fake-progress risk.
7. Recommend the next smallest step.
8. Stop and ask only when a real decision is needed.
## Gate States
- `PASS`: the checkpoint pass condition is met and evidence is on disk.
- `FAIL`: the checkpoint was attempted but did not meet its pass condition.
- `BLOCKED`: work cannot continue without a decision, credential, service, or unavailable dependency.
- `PARTIAL`: useful artifacts exist, but the checkpoint should not be treated as passed.
## Evidence Rules
- Evidence must be reproducible from files and commands, not just chat.
- If a command was used to validate behavior, record the command and summarize the result in a report or manifest.
- If data was collected, preserve raw data and include checksums.
- If synthetic or sample data is used, label it explicitly.
- If a claim depends on a public endpoint, record the endpoint, request parameters, response fields, status codes, timestamps, and fetch time.
- Do not claim reliability from a short sample run. Reliability requires the roadmap soak test.
## Machine-Readable Manifest Format
Checkpoint manifests should be JSON and stored under `data/manifests/`. Use this shape unless a later checkpoint documents a better schema:
```json
{
"checkpoint_id": 1,
"checkpoint_name": "Project Scaffold And Methodology",
"status": "PASS",
"started_at_utc": "2026-04-14T20:53:49Z",
"ended_at_utc": "2026-04-14T20:53:49Z",
"scope": "Durable project rules and roadmap only; no collector implementation.",
"artifacts": [
{
"path": "AGENTS.md",
"kind": "project_rules",
"status": "valid"
}
],
"validation": {
"commands": [
{
"command": "git status --short",
"result": "completed"
}
],
"summary": "Required files exist and contain checkpoint rules."
},
"decisions": [],
"assumptions": [],
"fake_progress_risk": "Most progress is documentation until public Polymarket endpoint behavior is proven.",
"next_step": "Run Checkpoint 2 public source probe."
}
```
## Markdown Checkpoint Report Format
Checkpoint reports should be stored under `reports/checkpoints/` and include:
- active checkpoint
- scope
- files created or changed
- validation commands and results
- project rules or operational lessons added
- gate status (`PASS`, `FAIL`, `BLOCKED`, or `PARTIAL`)
- strongest fake-progress risk
- next smallest step
## Deprecated Or Misleading Artifacts
Do not delete mistakes. Preserve the original artifact and label it.
Preferred labels:
- Add a manifest entry with `status: "deprecated"` or `status: "invalid"`.
- Add a sibling note named `<artifact>.deprecated.md` or `<artifact>.invalid.md` when a human explanation is useful.
- Include why the artifact is wrong, when it was labeled, who labeled it, and what replaces it.
If an artifact is dangerous because it contains secrets, stop and ask the user. Do not spread or copy the secret into reports.
## Anti-Fake-Progress Rules
- No dashboard before 24h data reliability.
- No database before plain compressed files become painful.
- No strategy, backtest, optimizer, or trading bot code.
- No private-key or signing code.
- No generic multi-market abstraction before a second market exists.
- No "production-ready" claim before a 24h soak test.
- No endpoint assumptions without probe evidence.
- No normalized dataset that cannot trace back to raw records.

93
docs/OPERATIONS.md Normal file
View file

@ -0,0 +1,93 @@
# Operations
This document defines operational rules before the collector exists. It should be updated with exact commands as checkpoints add scripts, services, and upload jobs.
## Current Operational Status
- Collector implementation: not started.
- Supported market: none yet; Polymarket is the first planned market.
- Deployment target: small VPS.
- Offload target: Google Drive through `rclone`.
- Reliability status: not production-ready until a documented 24h soak test passes.
## Safety Rules
- No trading.
- No order placement.
- No wallet signing.
- No private keys.
- No secrets in git.
- No dashboards, databases, ML, or strategy code before the roadmap gate allows them.
## Local Runtime Principles
Future scripts should:
- accept a configurable data directory
- write logs to a predictable location
- write raw gzip JSONL snapshots
- rotate files by hour or run
- close files cleanly on shutdown
- write manifests after runs
- avoid corrupting closed files on restart
- handle public endpoint errors and rate limits conservatively
## VPS Deployment Principles
Checkpoint 6 should document:
- Python version and virtualenv setup
- package installation
- environment variables
- systemd or Docker Compose runtime
- service user and file permissions
- data directory ownership
- log locations
- restart policy
- disk usage checks
- safe upgrade and rollback steps
## Google Drive Offload Principles
Checkpoint 7 should use `rclone` and must:
- avoid hardcoded credentials
- upload only closed or rotated files
- support dry-run mode
- verify upload success
- preserve local files until upload is verified
- maintain checksums
- keep the last N days locally
- write an upload manifest
## Incident And Bad-Data Handling
If data looks wrong:
1. Preserve the raw files.
2. Stop relying on the affected derived files.
3. Label the artifact `invalid` or `deprecated`.
4. Write a short note explaining the issue and replacement, if any.
5. Keep the learning in docs or reports.
Examples of bad-data conditions:
- endpoint returned a schema different from expected
- token/outcome mapping was wrong
- timestamps were misunderstood
- rate limits caused large gaps
- gzip file was not closed cleanly
- upload succeeded but checksum did not match
## Minimum Reliability Claim
A short sample run can prove that code writes files. It cannot prove 24/7 reliability.
The project may only claim production readiness after:
- discovery works
- raw order-book collection works
- offload works
- 24h soak test completes
- data quality and gap metrics are documented

102
docs/ORDERBOOK_SCHEMA.md Normal file
View file

@ -0,0 +1,102 @@
# Orderbook Snapshot Schema
Status: valid
This document covers the Checkpoint 5 normalized order-book sample. The raw
gzip JSONL files remain the source of truth. Normalized rows are derived records
for quick inspection and later quality checks.
## Normalized Snapshot
Schema name: `normalized_orderbook_snapshot`
Schema version: `1`
File format: gzip JSONL, one JSON object per line.
Sample location:
```text
data/normalized_sample/polymarket/orderbooks/<run_id>/polymarket_orderbooks_normalized_<run_id>.jsonl.gz
```
Every normalized row must reference exactly one raw gzip JSONL source row:
- `raw_file`: repository-relative path to the raw gzip JSONL file.
- `raw_line_number`: 1-based line number inside that raw gzip JSONL file.
Derived data is invalid if either lineage field is missing or points to a
missing raw file.
## Field Contract
| Field | Type | Meaning |
| --- | --- | --- |
| `schema_name` | string | Always `normalized_orderbook_snapshot`. |
| `schema_version` | number | Schema version, currently `1`. |
| `market_name` | string | Market source name from the raw envelope. |
| `market_slug` | string | Polymarket market slug from the raw envelope. |
| `condition_id` | string | Polymarket condition ID from the raw envelope. |
| `token_id` | string | Polymarket CLOB token ID from the raw envelope. |
| `outcome` | string | Outcome label associated with `token_id`. |
| `collected_at_utc` | string | Collector timestamp from the raw envelope. |
| `best_bid` | string or null | Maximum bid price, or null when no bids exist. |
| `best_ask` | string or null | Minimum ask price, or null when no asks exist. |
| `spread` | string or null | `best_ask - best_bid` when both sides exist. |
| `midpoint` | string or null | `(best_bid + best_ask) / 2` when both sides exist. |
| `bid_depth_total` | string | Sum of all bid sizes. |
| `ask_depth_total` | string | Sum of all ask sizes. |
| `bid_depth_within_1c` | string | Sum of bid sizes priced at or above `best_bid - 0.01`. |
| `ask_depth_within_1c` | string | Sum of ask sizes priced at or below `best_ask + 0.01`. |
| `bid_depth_within_2c` | string | Sum of bid sizes priced at or above `best_bid - 0.02`. |
| `ask_depth_within_2c` | string | Sum of ask sizes priced at or below `best_ask + 0.02`. |
| `bid_depth_within_5c` | string | Sum of bid sizes priced at or above `best_bid - 0.05`. |
| `ask_depth_within_5c` | string | Sum of ask sizes priced at or below `best_ask + 0.05`. |
| `raw_file` | string | Repository-relative raw gzip JSONL path. |
| `raw_line_number` | number | 1-based source line number in `raw_file`. |
## Numeric Encoding
Prices and sizes are parsed with Python `Decimal`. Derived numeric values are
emitted as exact decimal strings rather than JSON numbers. This keeps precision
visible and avoids binary floating-point rounding.
Missing price-derived values are emitted as `null`. Depth totals and depth bands
are emitted as decimal strings and use `"0"` when the relevant side is empty.
## Calculation Rules
- `best_bid`: maximum bid price.
- `best_ask`: minimum ask price.
- `spread`: `best_ask - best_bid` when both sides exist.
- `midpoint`: `(best_bid + best_ask) / 2` when both sides exist.
- `bid_depth_total`: sum of all bid sizes.
- `ask_depth_total`: sum of all ask sizes.
- `bid_depth_within_1c`: sum bid sizes with price greater than or equal to
`best_bid - 0.01`.
- `ask_depth_within_1c`: sum ask sizes with price less than or equal to
`best_ask + 0.01`.
- The same band rule is used for `0.02` and `0.05`.
## Sanity Rules
A normalized file should pass these checks:
- Output row count equals raw input row count unless skipped rows are recorded.
- Every row has `raw_file` and `raw_line_number`.
- Every referenced raw file exists.
- `spread` is non-negative whenever both sides exist.
- `midpoint` is between `best_bid` and `best_ask` whenever both sides exist.
- Depth totals and band depths are non-negative.
- At least one `Up` row and one `Down` row exist in the sample.
- The gzip JSONL file decompresses and every line parses as JSON.
- The manifest checksum matches the normalized output file.
## Current Known Gaps
- This schema covers a derived sample extract only.
- It does not define sustained daily normalized partitions.
- It does not include upload, daemon runtime, dashboards, databases, strategy
code, backtests, trading behavior, or wallet behavior.
- Long-run schema stability still depends on future collection and soak-test
evidence.

View file

@ -0,0 +1,149 @@
# Polymarket Collector
Artifact status: `valid`
## Scope
This document covers the Checkpoint 4 bounded raw order-book sample collector.
It does not describe a production service. It does not include normalization, upload, systemd, dashboards, databases, strategies, trading, wallet logic, private keys, API keys, or private endpoints.
## Inputs
The collector reads active BTC markets from:
```text
data/discovery/polymarket_btc_markets_latest.json
```
Checkpoint 3 writes normalized market records with `condition_id` and `tokens` preserving the `Up` and `Down` outcome-token mapping. The collector uses only those records and does not perform market discovery itself.
If the discovery file is stale or contains no usable active markets, run:
```sh
python3 scripts/discover_polymarket_btc_markets.py
```
## Endpoint
The sample uses the public CLOB batch order-book endpoint:
```text
POST https://clob.polymarket.com/books
```
Request body shape:
```json
[
{"token_id": "<up_token_id>"},
{"token_id": "<down_token_id>"}
]
```
No authentication is used.
## Running A Bounded Sample
Default sample command:
```sh
python3 scripts/collect_polymarket_orderbooks.py
```
The default config is:
```text
config/polymarket_collector.example.yaml
```
The example config is deliberately small:
- `market_limit: 2`
- `interval_seconds: 30`
- `duration_seconds: 300`
- `market_end_safety_seconds: 420`
This produces a 5-minute sample for at most 2 markets, fetching both `Up` and `Down` outcome tokens by batch request.
## Outputs
Raw gzip JSONL snapshots are written under:
```text
data/live_sample/polymarket/orderbooks/<run_id>/
```
The sample manifest is written to:
```text
data/manifests/orderbook_collector_sample_manifest.json
```
Files rotate by run for this checkpoint. Hourly rotation is intentionally left for a later sustained runtime checkpoint.
## Raw JSONL Envelope
Each gzip JSONL line is a raw-first envelope:
```json
{
"schema_name": "raw_orderbook_snapshot",
"schema_version": 1,
"collector": {
"name": "polymarket_orderbook_collector",
"version": "0.1.0"
},
"market": {
"market_name": "polymarket",
"market_slug": "example",
"condition_id": "0x...",
"token_id": "123",
"outcome": "Up",
"market_end_time_utc": "2026-04-14T22:00:00Z"
},
"collection": {
"collected_at_utc": "2026-04-14T21:00:00Z",
"sequence": 1,
"response_index": 0
},
"request": {
"method": "POST",
"url": "https://clob.polymarket.com/books",
"params": null,
"json_body": [{"token_id": "123"}],
"status_code": 200,
"duration_ms": 123,
"attempts": []
},
"raw": {}
}
```
The `raw` object is the unmodified order-book object returned by CLOB for that token.
## Rate-Limit Handling
The sample is conservative:
- Uses a small market cap by default.
- Uses a fixed interval between batch requests.
- Applies request timeout.
- Retries `429` and `5xx` responses with exponential backoff.
- Does not use concurrent requests.
## Shutdown
`SIGINT` and `SIGTERM` set a stop flag. The current request, if any, finishes or times out, the gzip file closes, and the manifest is written with a shutdown warning.
## Known Gaps
- This is a short run-rotated sample, not a daemon.
- It does not prove 24/7 reliability.
- It does not implement hourly rotation.
- It does not refresh discovery during a run.
- It does not normalize snapshots.
- It does not upload files.
- It does not use websockets.
The project must not claim production readiness until the later 24h soak test passes with documented quality metrics.

View file

@ -0,0 +1,54 @@
# Production Definition Of Done
Status: ACTIVE
Defined at UTC: 2026-04-17T09:12:02Z
This project is done for the first production milestone only when it is reliably
collecting Polymarket BTC order-book data on a small VPS with evidence on disk.
Packaging, docs, local samples, and local soak tests are useful evidence, but
not the finish line.
## Done Means
1. The collector runs on the VPS under systemd using `/opt/orderbooks` for code
and `/var/lib/orderbooks` for data.
2. Raw gzip JSONL order-book snapshots are written for active BTC up/down
markets, with manifests beside them.
3. The service survives a forced restart: after restart, a later collection
cycle writes valid raw rows without corrupting prior files.
4. Temporary network/API failure is handled as an operational failure, not data
loss: failures are visible in logs/manifests, and the next successful cycle
resumes writing new files.
5. Google Drive upload runs from the VPS through `rclone`, verifies success, and
leaves local files in place until upload is confirmed.
6. A final production report and machine-readable manifest record exact commands,
timestamps, files, checksums, restart result, upload result, and remaining
risks.
## Not Required For This Milestone
- No second market.
- No dashboard.
- No database.
- No strategy or backtest code.
- No websocket rewrite unless polling proves insufficient.
- No generic multi-market abstraction.
## Maximum Remaining Builder Turns
The remaining work is capped at three builder turns:
1. Accept deploy bundle and prepare the minimal VPS reliability gate.
2. Execute or guide the VPS cutover and collect runtime evidence.
3. Fix only blocking production issues found by the VPS gate, then write the
final pass/fail report.
If actual VPS access is unavailable, the gate must be `BLOCKED_NEEDS_VPS_ACCESS`,
not production ready.
## Current Evidence
- Deploy bundle gate: `DEPLOY_BUNDLE_READY`.
- Local 24h soak final manifest exists but remains `NEEDS_REVIEW`.
- Production readiness remains false until VPS runtime evidence exists.

341
docs/VPS_CUTOVER_RUNBOOK.md Normal file
View file

@ -0,0 +1,341 @@
# VPS Cutover Runbook
Status: valid
Checkpoint 8 status is `WAIVED_BY_USER`, not `PASS`. This runbook prepares a
VPS cutover for the existing Polymarket raw order-book collector only. It does
not claim production readiness, second-market support, dashboards, databases,
strategies, or trading.
## Scope
Included:
- VPS prerequisite checks.
- Repository copy/update steps.
- Public Polymarket collector service install.
- Google Drive offload timer install with rclone.
- Liveness, cycle health, and upload verification commands.
- Rollback and stop commands.
Excluded:
- Private API access.
- Wallets, keys, mnemonics, signing, order placement, or trading.
- Database, dashboard, strategy, or second-market work.
## Recommended VPS Layout
Use the existing package paths unless the VPS has a reason to differ:
```text
repository: /opt/orderbooks
python virtualenv: /opt/orderbooks/.venv
config: /etc/orderbooks/polymarket_collector.vps.yaml
collector env: /etc/orderbooks/polymarket-orderbook-collector.env
uploader env: /etc/orderbooks/orderbook-uploader.env
data root: /var/lib/orderbooks
raw files: /var/lib/orderbooks/raw_orderbooks
manifests: /var/lib/orderbooks/manifests
discovery: /var/lib/orderbooks/discovery
```
The `orderbooks` system user should own `/var/lib/orderbooks`. The repository
under `/opt/orderbooks` can be root-owned and world-readable.
## VPS Prerequisites
On Ubuntu or Debian:
```sh
sudo apt-get update
sudo apt-get install -y git python3 python3-venv rclone
sudo useradd --system --home /var/lib/orderbooks --shell /usr/sbin/nologin orderbooks || true
sudo mkdir -p /opt /etc/orderbooks /var/lib/orderbooks/discovery /var/lib/orderbooks/raw_orderbooks /var/lib/orderbooks/manifests /var/log/orderbooks
sudo chown -R orderbooks:orderbooks /var/lib/orderbooks /var/log/orderbooks
```
No API keys, private keys, mnemonics, wallets, or trading credentials are
required by this project. rclone credentials are the only machine-local
credential material expected for Google Drive offload, and they must stay
outside the repository.
## Copy Or Update The Repository
First install:
```sh
cd /opt
sudo git clone <repo-url> orderbooks
```
Update an existing checkout:
```sh
cd /opt/orderbooks
sudo git fetch --all --prune
sudo git pull --ff-only
```
Prepare repository permissions and the Python virtualenv:
```sh
cd /opt/orderbooks
sudo chmod +x scripts/run_polymarket_collector_cycle.sh scripts/upload_archive_rclone.sh scripts/vps_preflight_check.sh scripts/vps_runtime_smoke_check.sh
sudo python3 -m venv .venv
sudo .venv/bin/python -m pip install --upgrade pip
sudo chown -R root:root /opt/orderbooks
sudo chmod -R a+rX /opt/orderbooks
```
The current collector scripts use the Python standard library.
## Configure Public Collector Runtime
Install the example config, then review it:
```sh
sudo install -o root -g root -m 0644 /opt/orderbooks/config/polymarket_collector.vps.example.yaml /etc/orderbooks/polymarket_collector.vps.yaml
sudo editor /etc/orderbooks/polymarket_collector.vps.yaml
```
Optional collector env overrides:
```sh
sudo install -o root -g orderbooks -m 0640 /dev/null /etc/orderbooks/polymarket-orderbook-collector.env
sudo editor /etc/orderbooks/polymarket-orderbook-collector.env
```
Example values:
```text
ORDERBOOKS_DATA_DIR=/var/lib/orderbooks
ORDERBOOKS_OUTPUT_DIR=/var/lib/orderbooks/raw_orderbooks
ORDERBOOKS_DISCOVERY_MAX_PAGES=3
```
## Configure Rclone
Configure rclone as the `orderbooks` user. Do not print or commit
`rclone.conf`.
```sh
sudo -u orderbooks rclone config
sudo -u orderbooks rclone listremotes
sudo -u orderbooks rclone lsf gdrive: --max-depth 1
```
Create the uploader env file:
```sh
sudo install -o root -g orderbooks -m 0640 /dev/null /etc/orderbooks/orderbook-uploader.env
sudo editor /etc/orderbooks/orderbook-uploader.env
```
Example:
```text
ORDERBOOKS_RCLONE_DEST=gdrive:orderbooks/polymarket
ORDERBOOKS_RCLONE_BIN=/usr/bin/rclone
ORDERBOOKS_UPLOAD_MIN_AGE_SECONDS=600
```
The uploader verifies uploads with `rclone check`. Dry runs do not prove remote
write access.
## Run VPS Preflight
Run the preflight before installing or starting services:
```sh
cd /opt/orderbooks
sudo -u orderbooks /opt/orderbooks/scripts/vps_preflight_check.sh \
--app-dir /opt/orderbooks \
--python-bin /opt/orderbooks/.venv/bin/python \
--rclone-bin /usr/bin/rclone \
--rclone-remote gdrive:orderbooks/polymarket \
--data-dir /var/lib/orderbooks \
--manifest-dir /var/lib/orderbooks/manifests \
--log-dir /var/log/orderbooks \
--min-free-gib 5
```
The preflight does not print rclone configuration. It checks repository files,
Python compilation, shell syntax, systemd unit parsing when available, rclone
availability, optional remote readability, target directory writability, disk
space, and the absence of required project secrets.
## Install Systemd Units
Install collector and uploader units:
```sh
sudo install -o root -g root -m 0644 /opt/orderbooks/systemd/polymarket-orderbook-collector.service /etc/systemd/system/polymarket-orderbook-collector.service
sudo install -o root -g root -m 0644 /opt/orderbooks/systemd/polymarket-orderbook-uploader.service /etc/systemd/system/polymarket-orderbook-uploader.service
sudo install -o root -g root -m 0644 /opt/orderbooks/systemd/polymarket-orderbook-uploader.timer /etc/systemd/system/polymarket-orderbook-uploader.timer
sudo systemctl daemon-reload
sudo systemd-analyze verify /etc/systemd/system/polymarket-orderbook-collector.service /etc/systemd/system/polymarket-orderbook-uploader.service /etc/systemd/system/polymarket-orderbook-uploader.timer
```
Enable and start:
```sh
sudo systemctl enable --now polymarket-orderbook-collector.service
sudo systemctl enable --now polymarket-orderbook-uploader.timer
```
Run one uploader cycle immediately after the collector has produced closed raw
files:
```sh
sudo systemctl start polymarket-orderbook-uploader.service
```
Run the minimal runtime reliability smoke gate after both units are installed,
rclone is configured, and at least one closed raw file is older than the
uploader minimum age (default: 600 seconds):
```sh
sudo /opt/orderbooks/scripts/vps_runtime_smoke_check.sh \
--app-dir /opt/orderbooks \
--data-dir /var/lib/orderbooks \
--raw-dir /var/lib/orderbooks/raw_orderbooks \
--manifest-dir /var/lib/orderbooks/manifests \
--collector-service polymarket-orderbook-collector.service \
--uploader-service polymarket-orderbook-uploader.service \
--wait-seconds 900
```
This command is the minimal production reliability gate. It records a JSON
evidence manifest under `/var/lib/orderbooks/manifests/`, verifies a valid
collector cycle, forces one collector service restart, verifies the prior raw
gzip file still parses with the same checksum, waits for a later valid cycle,
starts the uploader, and records upload success or failure evidence. Preserve
failed smoke manifests and journal logs for review.
## Check Liveness
Collector service:
```sh
sudo systemctl status polymarket-orderbook-collector.service
sudo journalctl -u polymarket-orderbook-collector.service --since "30 minutes ago"
```
Uploader timer and service:
```sh
sudo systemctl list-timers polymarket-orderbook-uploader.timer
sudo systemctl status polymarket-orderbook-uploader.service
sudo journalctl -u polymarket-orderbook-uploader.service --since "2 hours ago"
```
Recent artifacts:
```sh
find /var/lib/orderbooks/raw_orderbooks -type f -name '*.jsonl.gz' -printf '%TY-%Tm-%TdT%TH:%TM:%TS %s %p\n' | sort | tail
find /var/lib/orderbooks/manifests -type f -name '*.json' -printf '%TY-%Tm-%TdT%TH:%TM:%TS %s %p\n' | sort | tail
```
## Check Latest Cycle Health
Inspect the newest collector manifest:
```sh
latest_collector="$(find /var/lib/orderbooks/manifests -type f -name 'polymarket_orderbook_collector_*.json' | sort | tail -n 1)"
python3 -m json.tool "$latest_collector" | sed -n '1,180p'
```
Minimum healthy signs:
```text
gate_status: PASS
rows_written: greater than 0
failure_count: 0
failures: []
```
Verify the latest raw gzip parses and row count matches its manifest:
```sh
python3 - "$latest_collector" <<'PY'
import gzip
import json
import sys
from pathlib import Path

manifest = json.loads(Path(sys.argv[1]).read_text())
for item in manifest.get("output_files", []):
    path = Path(item["path"])
    rows = 0
    with gzip.open(path, "rt", encoding="utf-8") as handle:
        for line in handle:
            if line.strip():
                json.loads(line)
                rows += 1
    print({"path": str(path), "rows": rows, "manifest_rows": item.get("rows"), "matches": rows == item.get("rows")})
PY
```
## Verify Uploads
Inspect the newest upload manifest:
```sh
latest_upload="$(find /var/lib/orderbooks/manifests -type f -name 'upload_archive_*.json' | sort | tail -n 1)"
python3 -m json.tool "$latest_upload" | sed -n '1,220p'
```
Minimum healthy signs:
```text
operation_status: UPLOAD_VERIFIED
gate_status: PASS
rclone.copy_exit_code: 0
rclone.check_exit_code: 0
counts.uploaded equals counts.verified
```
Manual remote spot-check without printing config:
```sh
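# The variable lives only in /etc/orderbooks/orderbook-uploader.env; set it in
# this shell first (use the same value as in that file), otherwise rclone is
# given an empty destination.
ORDERBOOKS_RCLONE_DEST=gdrive:orderbooks/polymarket
sudo -u orderbooks rclone lsf "$ORDERBOOKS_RCLONE_DEST" --max-depth 2 | head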
```
## Rollback Or Stop
Stop uploader timer first:
```sh
sudo systemctl disable --now polymarket-orderbook-uploader.timer
sudo systemctl stop polymarket-orderbook-uploader.service
```
Stop collector:
```sh
sudo systemctl stop polymarket-orderbook-collector.service
```
Disable collector if needed:
```sh
sudo systemctl disable polymarket-orderbook-collector.service
```
Preserve `/var/lib/orderbooks` and `/var/lib/orderbooks/manifests` for evidence.
If an artifact is wrong, label it as invalid or deprecated in a sibling note
rather than deleting it.
## Still Not Production Proven
Because the local 24h soak wait was waived by the user, the following remain
unproven:
- A completed 24h collector run with reviewed final metrics.
- 24h interaction between collector rotation and uploader timer.
- VPS-specific long-run disk, network, rclone, and systemd behavior.
- Retention cleanup behavior under verified upload load.
Treat this as cutover preparation. The VPS is not deployed until the commands
are run on the VPS and evidence is written.

298
docs/VPS_DEPLOYMENT.md Normal file
View file

@ -0,0 +1,298 @@
# VPS Deployment
Status: valid
This document covers the Checkpoint 6 systemd runtime package for the raw
Polymarket order-book collector.
It does not claim production readiness or 24/7 reliability. That remains gated
on the later 24h soak test.
## Scope
Included:
- systemd service for the raw collector cycle
- Python virtualenv setup
- service user and directory permissions
- configurable data directory
- discovery refresh before each collector cycle
- journal-based logs
- safe restart model for finite collector runs
Excluded:
- Google Drive offload
- `rclone`
- uploader scripts, services, or timers
- normalization changes
- dashboards
- databases
- strategies or backtests
- trading, order placement, signing, or wallet logic
Uploader service and timer units are intentionally deferred to Checkpoint 7.
## Runtime Model
The systemd service runs:
```text
/opt/orderbooks/scripts/run_polymarket_collector_cycle.sh
```
Each cycle:
1. Refreshes BTC market discovery into the configured data directory.
2. Runs `scripts/collect_polymarket_orderbooks.py` once.
3. Writes run-rotated raw gzip JSONL files.
4. Writes a per-cycle collector manifest.
5. Exits after the configured finite duration.
The unit uses `Restart=always`, so systemd starts the next cycle after the prior
cycle exits or fails.
The example config uses a 300 second collection cycle. This is deliberately
short because current BTC up/down markets are short-lived and the collector
refreshes discovery only before a cycle starts. Do not increase the cycle beyond
the practical market horizon unless the collector later learns to refresh market
selection during a run.
## Paths
Default VPS paths:
| Purpose | Path |
| --- | --- |
| Application checkout | `/opt/orderbooks` |
| Python virtualenv | `/opt/orderbooks/.venv` |
| Service config | `/etc/orderbooks/polymarket_collector.vps.yaml` |
| Optional env override file | `/etc/orderbooks/polymarket-orderbook-collector.env` |
| Data directory | `/var/lib/orderbooks` |
| Discovery artifacts | `/var/lib/orderbooks/discovery` |
| Raw order-book output base | `/var/lib/orderbooks/raw_orderbooks` |
| Per-cycle manifests | `/var/lib/orderbooks/manifests` |
Adjust these paths if the repository is installed somewhere other than
`/opt/orderbooks`.
## Environment Variables
The service defines safe defaults and can load overrides from:
```text
/etc/orderbooks/polymarket-orderbook-collector.env
```
Supported variables:
| Variable | Default | Meaning |
| --- | --- | --- |
| `ORDERBOOKS_APP_DIR` | `/opt/orderbooks` | Repository checkout path. |
| `ORDERBOOKS_DATA_DIR` | `/var/lib/orderbooks` | Base directory for data files. |
| `ORDERBOOKS_PYTHON` | `/opt/orderbooks/.venv/bin/python` | Python interpreter. |
| `ORDERBOOKS_COLLECTOR_CONFIG` | `/etc/orderbooks/polymarket_collector.vps.yaml` | Collector config path. |
| `ORDERBOOKS_DISCOVERY_DIR` | `$ORDERBOOKS_DATA_DIR/discovery` | Discovery artifact directory. |
| `ORDERBOOKS_OUTPUT_DIR` | `$ORDERBOOKS_DATA_DIR/raw_orderbooks` | Collector output base directory. |
| `ORDERBOOKS_MANIFEST_DIR` | `$ORDERBOOKS_DATA_DIR/manifests` | Per-cycle manifest directory. |
| `ORDERBOOKS_DISCOVERY_LIMIT` | `100` | Maximum number of Gamma events requested per discovery page. |
| `ORDERBOOKS_DISCOVERY_MAX_PAGES` | `3` | Discovery page cap per cycle. |
| `ORDERBOOKS_DISCOVERY_TIMEOUT` | `15` | Discovery request timeout in seconds. |
Example override file:
```text
ORDERBOOKS_DATA_DIR=/var/lib/orderbooks
ORDERBOOKS_DISCOVERY_MAX_PAGES=3
```
No API keys are required for this checkpoint.
## Install On Ubuntu Or Debian
Run package and account setup as root or with `sudo`:
```sh
sudo apt-get update
sudo apt-get install -y git python3 python3-venv
sudo useradd --system --home /var/lib/orderbooks --shell /usr/sbin/nologin orderbooks
sudo mkdir -p /opt /etc/orderbooks /var/lib/orderbooks/discovery /var/lib/orderbooks/raw_orderbooks /var/lib/orderbooks/manifests
```
Install or update the repository under `/opt/orderbooks`. One option is:
```sh
cd /opt
sudo git clone <repo-url> orderbooks
```
If the checkout already exists:
```sh
cd /opt/orderbooks
sudo git pull --ff-only
```
Prepare permissions:
```sh
sudo chown -R root:root /opt/orderbooks
sudo chmod -R a+rX /opt/orderbooks
sudo chmod +x /opt/orderbooks/scripts/run_polymarket_collector_cycle.sh
sudo chown -R orderbooks:orderbooks /var/lib/orderbooks
```
Create the virtualenv:
```sh
cd /opt/orderbooks
sudo python3 -m venv .venv
sudo .venv/bin/python -m pip install --upgrade pip
sudo chown -R root:root .venv
sudo chmod -R a+rX .venv
```
The current Checkpoint 6 scripts use only the Python standard library.
Install the VPS config and service unit:
```sh
sudo install -o root -g root -m 0644 /opt/orderbooks/config/polymarket_collector.vps.example.yaml /etc/orderbooks/polymarket_collector.vps.yaml
sudo install -o root -g root -m 0644 /opt/orderbooks/systemd/polymarket-orderbook-collector.service /etc/systemd/system/polymarket-orderbook-collector.service
```
Review `/etc/orderbooks/polymarket_collector.vps.yaml` before starting the
service. The example writes under `/var/lib/orderbooks`.
Enable and start:
```sh
sudo systemctl daemon-reload
sudo systemctl enable --now polymarket-orderbook-collector.service
```
## Logs And Status
Use the systemd journal:
```sh
sudo systemctl status polymarket-orderbook-collector.service
sudo journalctl -u polymarket-orderbook-collector.service -f
```
Recent logs without following:
```sh
sudo journalctl -u polymarket-orderbook-collector.service --since "1 hour ago"
```
## Output Files
Raw gzip JSONL files are written under:
```text
/var/lib/orderbooks/raw_orderbooks/polymarket/orderbooks/<run_id>/
```
Per-cycle manifests are written under:
```text
/var/lib/orderbooks/manifests/polymarket_orderbook_collector_<cycle_id>.json
```
Discovery artifacts are refreshed under:
```text
/var/lib/orderbooks/discovery/
```
## Restart And Stop Behavior
The unit uses:
```text
Restart=always
RestartSec=30s
TimeoutStopSec=90s
KillSignal=SIGTERM
KillMode=control-group
```
The collector handles `SIGTERM` by finishing or timing out the current request,
closing the gzip output, and writing the manifest. Every cycle writes to a new
run directory, so closed files are not reopened by the next cycle.
Stop the service with:
```sh
sudo systemctl stop polymarket-orderbook-collector.service
```
Start it again with:
```sh
sudo systemctl start polymarket-orderbook-collector.service
```
## Local Validation Without Starting The Service
These checks do not require root. Run them from the repository root so that the
relative `scripts/` and `config/` paths resolve:
```sh
python3 -m py_compile scripts/discover_polymarket_btc_markets.py scripts/collect_polymarket_orderbooks.py
bash -n scripts/run_polymarket_collector_cycle.sh
python3 - <<'PY'
from pathlib import Path
from scripts.collect_polymarket_orderbooks import load_flat_yaml
cfg = load_flat_yaml(Path('config/polymarket_collector.vps.example.yaml'))
required = {
'discovery_path',
'output_dir',
'manifest_path',
'market_limit',
'interval_seconds',
'duration_seconds',
}
missing = sorted(required - set(cfg))
assert not missing, missing
assert cfg['duration_seconds'] > 0
print('config parse ok')
PY
```
If systemd tools are available locally:
```sh
systemd-analyze verify systemd/polymarket-orderbook-collector.service
```
The local machine may not have `/opt/orderbooks` or the `orderbooks` service
user. Treat missing VPS path or user messages as deployment-environment warnings,
not collector syntax failures.
## Safe Upgrade
Stop the service, update files, rerun validation, then start the service:
```sh
sudo systemctl stop polymarket-orderbook-collector.service
cd /opt/orderbooks
sudo git pull --ff-only
sudo .venv/bin/python -m py_compile scripts/discover_polymarket_btc_markets.py scripts/collect_polymarket_orderbooks.py
sudo systemctl daemon-reload
sudo systemctl start polymarket-orderbook-collector.service
```
Do not remove existing data files during an upgrade. If a bad artifact is found,
preserve it and label it invalid or deprecated with a replacement path when one
exists.
## Current Limits
- This package runs the existing raw collector; it does not add a daemon inside
Python.
- The systemd loop is a restart model around finite collector cycles.
- It does not upload files.
- It does not prove long-run reliability.
- Production readiness remains blocked until discovery, raw collection, offload,
and a documented 24h soak test all pass.

View file

@ -0,0 +1,366 @@
#!/usr/bin/env bash
# Build a VPS deploy tarball plus a JSON manifest from the current working tree.
# Abort on command failure, on use of an unset variable, and on pipeline errors.
set -o errexit -o nounset -o pipefail

# Defaults; each one can be overridden by an environment variable and, later,
# by the matching command-line option.
APP_DIR="${ORDERBOOKS_APP_DIR:-${PWD}}"
OUTPUT_DIR="${ORDERBOOKS_VPS_BUNDLE_OUTPUT_DIR:-artifacts/vps}"
TIMESTAMP="${ORDERBOOKS_VPS_BUNDLE_TIMESTAMP:-$(date -u '+%Y%m%dT%H%M%SZ')}"

# Artifact names derived from the output directory (trailing slash dropped)
# and the UTC timestamp.
BUNDLE_BASENAME="orderbooks_vps_deploy_${TIMESTAMP}"
TARBALL="${OUTPUT_DIR%/}/${BUNDLE_BASENAME}.tar.gz"
MANIFEST="${OUTPUT_DIR%/}/${BUNDLE_BASENAME}_manifest.json"
# Print the command-line help text on stdout. Callers redirect it to stderr
# when reporting a usage error.
usage() {
  cat <<'USAGE_TEXT'
Usage: scripts/build_vps_deploy_bundle.sh [options]
Build a deployable VPS bundle from the current working tree. The bundle is
intended to be copied to a VPS and unpacked under /opt/orderbooks.
Options:
--app-dir DIR Source working tree. Default: ORDERBOOKS_APP_DIR or current directory.
--output-dir DIR Bundle output directory. Default: artifacts/vps.
--timestamp TS Override UTC timestamp used in artifact names.
--help Show this help.
The bundle uses a narrow allowlist and excludes live data, caches, git metadata,
virtualenvs, rclone config, private keys, wallets, mnemonics, and generated
artifacts. It does not print secrets and does not write Python bytecode.
USAGE_TEXT
}
# Parse command-line options.
#
# Every value-taking option is first checked for a following argument, so a
# trailing "--output-dir" yields a usage error (exit 2) instead of the
# "unbound variable" abort that `set -u` would otherwise raise on "$2".
while [[ $# -gt 0 ]]; do
  case "$1" in
    --app-dir | --output-dir | --timestamp)
      if [[ $# -lt 2 ]]; then
        echo "Missing value for option: $1" >&2
        usage >&2
        exit 2
      fi
      ;;
  esac
  case "$1" in
    --app-dir)
      APP_DIR="$2"
      shift 2
      ;;
    --output-dir)
      OUTPUT_DIR="$2"
      shift 2
      ;;
    --timestamp)
      TIMESTAMP="$2"
      shift 2
      ;;
    --help)
      usage
      exit 0
      ;;
    *)
      echo "Unknown argument: $1" >&2
      usage >&2
      exit 2
      ;;
  esac
done

# Derive the artifact names once, after every override is known, so the result
# does not depend on the order in which the options were given.
BUNDLE_BASENAME="orderbooks_vps_deploy_${TIMESTAMP}"
TARBALL="${OUTPUT_DIR%/}/${BUNDLE_BASENAME}.tar.gz"
MANIFEST="${OUTPUT_DIR%/}/${BUNDLE_BASENAME}_manifest.json"
# Normalise the source directory and make sure it exists before entering it.
APP_DIR="${APP_DIR%/}"
if [[ ! -d "${APP_DIR}" ]]; then
  echo "Source app directory does not exist: ${APP_DIR}" >&2
  exit 1
fi

# Enter the source tree first: the tar file list and a relative OUTPUT_DIR are
# both resolved against it. Creating the output directory before the `cd` put
# it under the caller's directory while TARBALL/MANIFEST were written relative
# to APP_DIR, so `--app-dir` with a relative output directory failed to write.
cd "${APP_DIR}"
mkdir -p "${OUTPUT_DIR}"

# Never clobber a previous bundle; a new timestamp is required instead.
if [[ -e "${TARBALL}" || -e "${MANIFEST}" ]]; then
  echo "Refusing to overwrite existing bundle artifact: ${TARBALL} or ${MANIFEST}" >&2
  exit 1
fi

# Temporary NUL-separated file list handed to tar; removed on any exit.
FILELIST="$(mktemp)"
trap 'rm -f "${FILELIST}"' EXIT
# Stage 1: select the bundle contents and write the initial manifest.
# The Python program is read from stdin; argv carries the file-list path, the
# manifest path, the tarball path and the timestamp, in that order.
PYTHONDONTWRITEBYTECODE=1 python3 - "${FILELIST}" "${MANIFEST}" "${TARBALL}" "${TIMESTAMP}" <<'PY_BUNDLE_SELECT'
import datetime as dt
# NOTE(review): fnmatch and os are imported but not referenced anywhere in this
# heredoc.
import fnmatch
import hashlib
import json
import os
import sys
from pathlib import Path
# Positional arguments supplied by the shell wrapper.
filelist_path = Path(sys.argv[1])
manifest_path = Path(sys.argv[2])
tarball_path = Path(sys.argv[3])
timestamp = sys.argv[4]
# The wrapper has already changed into the source tree, so cwd is the root.
root = Path.cwd()
# Allowlist, part 1: individual top-level files.
allowed_files = [
Path("AGENTS.md"),
Path("ROADMAP.md"),
]
# Allowlist, part 2: directories that are walked recursively.
allowed_dirs = [
Path("config"),
Path("docs"),
Path("scripts"),
Path("systemd"),
Path("reports/checkpoints"),
]
# Allowlist, part 3: glob patterns evaluated against the source root.
allowed_globs = [
"data/manifests/checkpoint_*.json",
]
# Informational only: this list is copied into the manifest and is not matched
# against any path in this program; enforcement is done by is_forbidden().
excluded_patterns = [
".git/",
".venv/",
"artifacts/",
"data/soak_test/",
"data/live_sample/",
"data/normalized_sample/",
"**/__pycache__/",
"**/*.pyc",
"**/*.pyo",
"**/.pytest_cache/",
"**/.mypy_cache/",
"**/.ruff_cache/",
"**/rclone.conf",
"**/.env",
"**/*.pem",
"**/*.key",
"**/*.p12",
"**/*.pfx",
"**/id_rsa*",
"**/id_ed25519*",
"**/*mnemonic*",
"**/*wallet*",
"**/*credential*",
"**/*secret*",
]
# Files that must be present in the selection; the build aborts if any of them
# is missing.
required_files = [
"AGENTS.md",
"ROADMAP.md",
"config/polymarket_collector.vps.example.yaml",
"config/rclone.example.md",
"docs/VPS_CUTOVER_RUNBOOK.md",
"docs/VPS_DEPLOYMENT.md",
"docs/GOOGLE_DRIVE_OFFLOAD.md",
"scripts/build_vps_deploy_bundle.sh",
"scripts/vps_preflight_check.sh",
"scripts/vps_runtime_smoke_check.sh",
"scripts/run_polymarket_collector_cycle.sh",
"scripts/upload_archive_rclone.sh",
"scripts/discover_polymarket_btc_markets.py",
"scripts/collect_polymarket_orderbooks.py",
"scripts/normalize_polymarket_orderbooks.py",
"systemd/polymarket-orderbook-collector.service",
"systemd/polymarket-orderbook-uploader.service",
"systemd/polymarket-orderbook-uploader.timer",
]
# Substrings looked for in the slash-wrapped relative path by is_forbidden().
forbidden_path_fragments = [
"/.git/",
"/.venv/",
"/__pycache__/",
"/data/soak_test/",
"/data/live_sample/",
"/data/normalized_sample/",
"/artifacts/",
]
# Lower-cased file names rejected on exact match.
forbidden_names = {
"rclone.conf",
".env",
"id_rsa",
"id_ed25519",
}
# Lower-cased file suffixes rejected on exact match.
forbidden_suffixes = {
".pyc",
".pyo",
".pem",
".key",
".p12",
".pfx",
}
# Substrings that mark a lower-cased file name as potentially secret.
secretish_name_tokens = [
"mnemonic",
"wallet",
"credential",
"secret",
]
def as_posix(path: Path) -> str:
return path.as_posix()
def is_forbidden(path: Path) -> tuple[bool, str | None]:
rel = as_posix(path)
wrapped = f"/{rel}/" if path.is_dir() else f"/{rel}"
if path.is_absolute() or ".." in path.parts:
return True, "absolute_or_parent_path"
for fragment in forbidden_path_fragments:
if fragment in wrapped:
return True, f"forbidden_fragment:{fragment}"
if any(part in {".git", ".venv", "__pycache__", ".pytest_cache", ".mypy_cache", ".ruff_cache"} for part in path.parts):
return True, "forbidden_cache_or_metadata_dir"
lower_name = path.name.lower()
if lower_name in forbidden_names:
return True, f"forbidden_name:{path.name}"
if path.suffix.lower() in forbidden_suffixes:
return True, f"forbidden_suffix:{path.suffix}"
if any(token in lower_name for token in secretish_name_tokens):
return True, f"secretish_name:{path.name}"
if rel.startswith(("data/soak_test/", "data/live_sample/", "data/normalized_sample/", "artifacts/")):
return True, "forbidden_prefix"
return False, None
def iter_allowed_files():
    """Yield every allowlisted regular file once, relative to the source root.

    Sources are visited in order: ``allowed_files``, then each directory in
    ``allowed_dirs`` (recursively, sorted), then each pattern in
    ``allowed_globs`` (sorted). Missing files and directories are skipped.
    """
    seen = set()
    for path in allowed_files:
        if path.is_file() and path not in seen:
            seen.add(path)
            yield path
    for directory in allowed_dirs:
        if not directory.exists():
            continue
        for path in sorted(directory.rglob("*")):
            if path.is_file() and path not in seen:
                seen.add(path)
                yield path
    for pattern in allowed_globs:
        for match in sorted(root.glob(pattern)):
            # root is absolute, so root.glob() yields absolute paths. Those
            # were always rejected downstream as "absolute_or_parent_path" and
            # could never equal the relative entries in `seen`; convert them
            # back to root-relative form like every other source.
            path = match.relative_to(root)
            if match.is_file() and path not in seen:
                seen.add(path)
                yield path
def sha256_file(path: Path) -> str:
    """Return the hex SHA-256 digest of the file at ``path``.

    The file is read in 1 MiB blocks so large files are never fully loaded
    into memory.
    """
    hasher = hashlib.sha256()
    with path.open("rb") as stream:
        while True:
            block = stream.read(1024 * 1024)
            if not block:
                break
            hasher.update(block)
    return hasher.hexdigest()
# Split the allowlisted candidates into bundled files (recorded with size and
# digest) and rejected ones (recorded with the reason from is_forbidden).
included = []
excluded = []
for path in iter_allowed_files():
    forbidden, reason = is_forbidden(path)
    if forbidden:
        excluded.append({"path": as_posix(path), "reason": reason})
        continue
    stat = path.stat()
    included.append({
        "path": as_posix(path),
        "bytes": stat.st_size,
        "sha256": sha256_file(path),
    })

# Abort before anything is written if the selection is incomplete or empty.
included_paths = sorted(item["path"] for item in included)
missing_required = sorted(path for path in required_files if path not in included_paths)
if missing_required:
    raise SystemExit(f"missing required bundle files: {missing_required}")
if not included:
    raise SystemExit("bundle file list is empty")

# NUL-terminated names, read by the wrapper's `tar --null --files-from`.
filelist_path.write_bytes(b"".join(path.encode("utf-8") + b"\0" for path in included_paths))

# dt.timezone.utc rather than the dt.UTC alias: the alias only exists on
# Python 3.11+, while the rest of this program runs on 3.10.
created_at = dt.datetime.now(dt.timezone.utc).replace(microsecond=0).isoformat().replace("+00:00", "Z")

# Initial manifest. The validation stage that runs after tar adds the tarball
# size, digest, contents and the final gate status.
manifest = {
    "schema_name": "vps_deploy_bundle_manifest",
    "schema_version": 1,
    "created_at_utc": created_at,
    "timestamp": timestamp,
    "tarball_path": as_posix(tarball_path),
    "manifest_path": as_posix(manifest_path),
    "source_root": str(root),
    "bundle_intent": "Copy to a VPS and unpack under /opt/orderbooks; VPS execution remains pending.",
    "production_ready": False,
    "vps_deployed": False,
    "included_roots": [str(path) for path in allowed_files + allowed_dirs] + allowed_globs,
    "excluded_patterns": excluded_patterns,
    "required_files": required_files,
    "included_file_count": len(included),
    "included_files": included,
    "excluded_selected_files": excluded,
    "missing_required_files": missing_required,
    "validation": {
        "required_files_present_before_tar": not missing_required,
        "forbidden_paths_absent_before_tar": True,
        "tarball_validation_completed": False,
    },
}
manifest_path.write_text(json.dumps(manifest, indent=2, sort_keys=True) + "\n", encoding="utf-8")
PY_BUNDLE_SELECT
# Stage 2: build the tarball from the NUL-separated file list written by the
# selection stage. Ownership is recorded as numeric uid/gid 0.
tar --create --gzip --file "${TARBALL}" --null --files-from "${FILELIST}" --owner=0 --group=0 --numeric-owner
# Stage 3: re-open the tarball and validate its contents against the manifest.
# argv carries the tarball path and the manifest path.
PYTHONDONTWRITEBYTECODE=1 python3 - "${TARBALL}" "${MANIFEST}" <<'PY_BUNDLE_VALIDATE'
import hashlib
import json
import sys
import tarfile
from pathlib import Path
tarball_path = Path(sys.argv[1])
manifest_path = Path(sys.argv[2])
# The manifest written by the selection stage is the source of truth for the
# required file list.
manifest = json.loads(manifest_path.read_text(encoding="utf-8"))
required_files = set(manifest["required_files"])
def sha256_file(path: Path) -> str:
    """Hash ``path`` with SHA-256, streaming it in 1 MiB pieces, and return the hex digest."""
    sha = hashlib.sha256()
    with path.open("rb") as source:
        while piece := source.read(1024 * 1024):
            sha.update(piece)
    return sha.hexdigest()
def forbidden_reason(name: str) -> str | None:
parts = name.split("/")
lower_name = parts[-1].lower()
if name.startswith("/") or any(part == ".." for part in parts):
return "absolute_or_parent_path"
if parts[0] in {".git", ".venv", "artifacts"}:
return f"forbidden_top_level:{parts[0]}"
if len(parts) >= 2 and parts[0] == "data" and parts[1] in {"soak_test", "live_sample", "normalized_sample"}:
return f"forbidden_data_dir:data/{parts[1]}"
if any(part in {".git", ".venv", "__pycache__", ".pytest_cache", ".mypy_cache", ".ruff_cache"} for part in parts):
return "forbidden_cache_or_metadata_dir"
if lower_name in {"rclone.conf", ".env", "id_rsa", "id_ed25519"}:
return f"forbidden_name:{lower_name}"
if any(lower_name.endswith(suffix) for suffix in (".pyc", ".pyo", ".pem", ".key", ".p12", ".pfx")):
return "forbidden_suffix"
if any(token in lower_name for token in ("mnemonic", "wallet", "credential", "secret")):
return "secretish_name"
return None
# Collect the names of all regular-file members of the finished tarball.
with tarfile.open(tarball_path, "r:gz") as archive:
    names = sorted(entry.name for entry in archive.getmembers() if entry.isfile())

# Check every member against the forbidden rules and the required list.
forbidden = []
for name in names:
    reason = forbidden_reason(name)
    if reason:
        forbidden.append({"path": name, "reason": reason})
missing_required = sorted(required_files - set(names))


def write_manifest() -> None:
    """Persist the updated manifest in the same format as the selection stage."""
    manifest_path.write_text(json.dumps(manifest, indent=2, sort_keys=True) + "\n", encoding="utf-8")


# The validation outcome is recorded in the manifest on success and on failure.
manifest["validation"].update({
    "tarball_validation_completed": True,
    "forbidden_paths_absent_in_tarball": not forbidden,
    "required_files_present_in_tarball": not missing_required,
    "forbidden_paths_in_tarball": forbidden,
    "missing_required_files_in_tarball": missing_required,
})

if forbidden or missing_required:
    # Failure: keep the evidence in the manifest, then abort the build.
    write_manifest()
    raise SystemExit(f"bundle validation failed forbidden={forbidden} missing_required={missing_required}")

# Success: add tarball size, digest and contents, and mark the gate as passed.
manifest["tarball_bytes"] = tarball_path.stat().st_size
manifest["tarball_sha256"] = sha256_file(tarball_path)
manifest["tarball_content_count"] = len(names)
manifest["tarball_contents"] = names
manifest["gate_status"] = "PASS"
write_manifest()
PY_BUNDLE_VALIDATE
# Stage 4: print a KEY=VALUE summary of the finished bundle on stdout.
printf 'BUNDLE_TARBALL=%s\n' "${TARBALL}"
printf 'BUNDLE_MANIFEST=%s\n' "${MANIFEST}"
# PYTHONDONTWRITEBYTECODE=1 matches the two earlier python3 invocations and the
# usage text's promise that the script does not write Python bytecode.
PYTHONDONTWRITEBYTECODE=1 python3 - "${MANIFEST}" <<'PY_PRINT'
import json
import sys
from pathlib import Path

# Digest, size and file count were recorded by the validation stage.
m = json.loads(Path(sys.argv[1]).read_text(encoding="utf-8"))
print(f"BUNDLE_SHA256={m['tarball_sha256']}")
print(f"BUNDLE_BYTES={m['tarball_bytes']}")
print(f"BUNDLE_FILE_COUNT={m['tarball_content_count']}")
PY_PRINT

View file

@ -0,0 +1,668 @@
#!/usr/bin/env python3
"""Minimal raw Polymarket order-book snapshot sample collector.
Checkpoint 4 scope: finite sample run only. This script reads the BTC discovery
artifact, fetches public CLOB batch order books for a small market set, writes
raw gzip JSONL envelopes, and closes with a manifest. It is not a daemon and it
does not trade.
"""
from __future__ import annotations
import argparse
import datetime as dt
import gzip
import hashlib
import json
import signal
import sys
import time
import urllib.error
import urllib.request
from pathlib import Path
from typing import Any
# Identity and schema labels for this collector.
COLLECTOR_NAME = "polymarket_orderbook_collector"
COLLECTOR_VERSION = "0.1.0"
SCHEMA_NAME = "raw_orderbook_snapshot"
SCHEMA_VERSION = 1
# Public CLOB batch order-book endpoint.
CLOB_BOOKS_URL = "https://clob.polymarket.com/books"
# Default relative paths for the config, discovery input, output and manifest.
DEFAULT_CONFIG_PATH = Path("config/polymarket_collector.example.yaml")
DEFAULT_DISCOVERY_PATH = Path("data/discovery/polymarket_btc_markets_latest.json")
DEFAULT_OUTPUT_DIR = Path("data/live_sample")
DEFAULT_MANIFEST_PATH = Path("data/manifests/orderbook_collector_sample_manifest.json")
# Lower-case names of the response headers that filter_headers() keeps; every
# other header is dropped.
SAFE_RESPONSE_HEADERS = {
"cache-control",
"cf-cache-status",
"cf-ray",
"content-length",
"content-type",
"date",
"retry-after",
"server",
"x-ratelimit-limit",
"x-ratelimit-remaining",
"x-ratelimit-reset",
"ratelimit-limit",
"ratelimit-remaining",
"ratelimit-reset",
}
# Module-level stop state, written by the signal handler below.
STOP_REQUESTED = False
STOP_SIGNAL: str | None = None


def handle_stop(signum: int, _frame: Any) -> None:
    """Signal handler: record that a stop was requested and by which signal.

    Args:
        signum: Number of the delivered signal.
        _frame: Interrupted stack frame; unused.
    """
    global STOP_REQUESTED, STOP_SIGNAL
    # Raise the flag first so it is set even if the name lookup below fails.
    STOP_REQUESTED = True
    STOP_SIGNAL = signal.Signals(signum).name
def utc_now() -> dt.datetime:
    """Return the current time as a timezone-aware UTC datetime.

    Uses ``dt.timezone.utc`` rather than the ``dt.UTC`` alias, which only
    exists on Python 3.11+, so the collector also runs on the Python 3.10 that
    some supported distributions ship.
    """
    return dt.datetime.now(dt.timezone.utc)
def iso_z(value: dt.datetime | None = None) -> str:
    """Format a datetime as second-precision ISO 8601 UTC with a ``Z`` suffix.

    Args:
        value: Datetime to format; the current UTC time is used when omitted.

    Returns:
        A string such as ``2026-04-18T09:23:28Z``.
    """
    value = value or utc_now()
    # dt.timezone.utc instead of dt.UTC: the alias only exists on Python 3.11+.
    return value.astimezone(dt.timezone.utc).replace(microsecond=0).isoformat().replace("+00:00", "Z")
def compact_timestamp(value: dt.datetime | None = None) -> str:
    """Format a datetime as a compact UTC stamp, e.g. ``20260418T092328Z``.

    Args:
        value: Datetime to format; the current UTC time is used when omitted.
    """
    value = value or utc_now()
    # dt.timezone.utc instead of dt.UTC: the alias only exists on Python 3.11+.
    return value.astimezone(dt.timezone.utc).strftime("%Y%m%dT%H%M%SZ")
def parse_iso(value: Any) -> dt.datetime | None:
    """Parse an ISO 8601 timestamp string into an aware UTC datetime.

    Args:
        value: Candidate value; anything that is not a non-blank string is
            rejected.

    Returns:
        The parsed instant converted to UTC, or ``None`` when ``value`` is not
        a string, is blank, or cannot be parsed. A timestamp without an offset
        is taken to be UTC.
    """
    if not isinstance(value, str) or not value.strip():
        return None
    text = value.strip()
    # fromisoformat() only accepts a trailing "Z" on Python 3.11+.
    if text.endswith("Z"):
        text = text[:-1] + "+00:00"
    try:
        parsed = dt.datetime.fromisoformat(text)
    except ValueError:
        return None
    # dt.timezone.utc instead of dt.UTC: the alias only exists on Python 3.11+.
    if parsed.tzinfo is None:
        parsed = parsed.replace(tzinfo=dt.timezone.utc)
    return parsed.astimezone(dt.timezone.utc)
def sha256_file(path: Path) -> str:
    """Compute the hex SHA-256 digest of a file, reading at most 1 MiB at a time."""
    checksum = hashlib.sha256()
    scratch = bytearray(1024 * 1024)
    window = memoryview(scratch)
    with path.open("rb") as source:
        # readinto() returns 0 at end of file, which ends the loop.
        while filled := source.readinto(scratch):
            checksum.update(window[:filled])
    return checksum.hexdigest()
def parse_scalar(value: str) -> Any:
    """Convert one raw config value into a Python scalar.

    Resolution order: empty string, quoted string (quotes removed), boolean
    (``true``/``false``), null (``null``/``none``), int, float, and finally the
    stripped text itself. Keywords are matched case-insensitively.
    """
    value = value.strip()
    if not value:
        return ""
    # Require at least two characters so a lone quote is not mistaken for an
    # empty quoted string.
    if len(value) >= 2 and value[0] in {"'", '"'} and value[-1] == value[0]:
        return value[1:-1]
    lower = value.lower()
    if lower in {"true", "false"}:
        return lower == "true"
    if lower in {"null", "none"}:
        return None
    # Try the numeric types from most to least specific.
    for caster in (int, float):
        try:
            return caster(value)
        except ValueError:
            continue
    return value
def _strip_yaml_comment(raw_line: str) -> str:
    """Drop a trailing ``#`` comment while keeping ``#`` inside values.

    A ``#`` starts a comment only at the beginning of the line or after
    whitespace, and never inside a quoted span. A quote opens a span only when
    it is itself at the start of the line or preceded by whitespace, so
    apostrophes inside plain words are ignored.
    """
    open_quote: str | None = None
    for index, char in enumerate(raw_line):
        at_boundary = index == 0 or raw_line[index - 1].isspace()
        if open_quote is not None:
            if char == open_quote:
                open_quote = None
        elif char in {"'", '"'} and at_boundary:
            open_quote = char
        elif char == "#" and at_boundary:
            return raw_line[:index]
    return raw_line


def load_flat_yaml(path: Path) -> dict[str, Any]:
    """Parse the flat YAML subset used by the example config.

    Only top-level ``key: value`` lines are supported; blank lines and comments
    are skipped. A missing file yields an empty mapping.

    Raises:
        ValueError: if a non-empty line has no ``:`` or has an empty key.
    """
    config: dict[str, Any] = {}
    if not path.exists():
        return config
    for line_number, raw_line in enumerate(path.read_text(encoding="utf-8").splitlines(), 1):
        line = _strip_yaml_comment(raw_line).strip()
        if not line:
            continue
        if ":" not in line:
            raise ValueError(f"Unsupported config line {line_number}: {raw_line}")
        key, value = line.split(":", 1)
        key = key.strip()
        if not key:
            raise ValueError(f"Missing config key on line {line_number}")
        config[key] = parse_scalar(value)
    return config
def config_digest(path: Path | None) -> str | None:
if path is None or not path.exists():
return None
return sha256_file(path)
def filter_headers(headers: Any) -> dict[str, str]:
    """Keep only the response headers whose lowercase name is in SAFE_RESPONSE_HEADERS.

    Header names keep their original casing in the result. ``None`` (no headers
    available) yields an empty mapping instead of raising.
    """
    if headers is None:
        return {}
    return {
        key: value
        for key, value in dict(headers).items()
        if key.lower() in SAFE_RESPONSE_HEADERS
    }
def http_post_json(
    *,
    url: str,
    json_body: Any,
    timeout_seconds: float,
    max_retries: int,
    backoff_seconds: float,
) -> dict[str, Any]:
    """POST a JSON body and return a record of the request, response and attempts.

    Up to ``max_retries + 1`` attempts are made. Only HTTP 429 and 5xx responses
    are retried; transport errors without a status code are not. The wait
    between attempts is ``backoff_seconds * 2**attempt``, raised to the
    server's numeric Retry-After value when that is larger. Retrying stops
    early once a shutdown has been requested.

    Returns:
        A dict with ``request``, ``response`` (status, filtered headers, parsed
        JSON or a 1000-character text preview), ``attempts``, the summed
        ``duration_ms`` and ``ok``. ``ok`` is True only when the final attempt
        finished without a transport error, with a 2xx status and no JSON
        decoding error.
    """
    body_bytes = json.dumps(json_body, separators=(",", ":")).encode("utf-8")
    attempts: list[dict[str, Any]] = []
    final_json: Any | None = None
    final_text_preview: str | None = None
    final_json_error: str | None = None
    final_status_code: int | None = None
    final_headers: dict[str, str] = {}
    final_error: str | None = None
    for attempt_index in range(max_retries + 1):
        started_at = iso_z()
        started_monotonic = time.monotonic()
        status_code: int | None = None
        response_headers: dict[str, str] = {}
        response_text = ""
        error: str | None = None
        try:
            request = urllib.request.Request(
                url,
                data=body_bytes,
                headers={
                    "Accept": "application/json",
                    "Content-Type": "application/json",
                    "User-Agent": "orderbooks-checkpoint-4-sample/0.1.0",
                },
                method="POST",
            )
            with urllib.request.urlopen(request, timeout=timeout_seconds) as response:
                status_code = response.status
                response_headers = filter_headers(response.headers)
                response_text = response.read().decode("utf-8", errors="replace")
        except urllib.error.HTTPError as exc:
            # Non-2xx responses still carry a status, headers and a body worth recording.
            status_code = exc.code
            response_headers = filter_headers(exc.headers)
            response_text = exc.read().decode("utf-8", errors="replace")
            error = f"HTTPError: {exc}"
        except Exception as exc:  # noqa: BLE001 - preserve request failure evidence
            error = f"{type(exc).__name__}: {exc}"
        duration_ms = round((time.monotonic() - started_monotonic) * 1000, 3)
        parsed_json = None
        json_error = None
        if response_text:
            try:
                parsed_json = json.loads(response_text)
            except json.JSONDecodeError as exc:
                json_error = str(exc)
        attempts.append(
            {
                "attempt": attempt_index + 1,
                "started_at_utc": started_at,
                "ended_at_utc": iso_z(),
                "duration_ms": duration_ms,
                "status_code": status_code,
                "headers": response_headers,
                "error": error,
                "json_error": json_error,
            }
        )
        # The most recent attempt always defines the reported outcome.
        final_json = parsed_json
        final_json_error = json_error
        final_text_preview = response_text[:1000] if parsed_json is None else None
        final_status_code = status_code
        final_headers = response_headers
        final_error = error
        retryable = status_code == 429 or (status_code is not None and 500 <= status_code <= 599)
        if error is None and status_code is not None and 200 <= status_code < 300:
            break
        if not retryable or attempt_index >= max_retries or STOP_REQUESTED:
            break
        # Header names keep the server's casing, so match Retry-After case-insensitively.
        retry_after = next(
            (value for key, value in response_headers.items() if key.lower() == "retry-after"),
            None,
        )
        sleep_seconds = backoff_seconds * (2**attempt_index)
        if retry_after:
            try:
                sleep_seconds = max(sleep_seconds, float(retry_after))
            except ValueError:
                # Non-numeric Retry-After (e.g. an HTTP date): keep the computed backoff.
                pass
        time.sleep(sleep_seconds)
    return {
        "request": {
            "method": "POST",
            "url": url,
            "json_body": json_body,
        },
        "response": {
            "status_code": final_status_code,
            "headers": final_headers,
            "json": final_json,
            "json_error": final_json_error,
            "text_preview": final_text_preview,
        },
        "attempts": attempts,
        "duration_ms": round(sum(attempt["duration_ms"] for attempt in attempts), 3),
        # A transport error after a 2xx status line (e.g. a failed body read)
        # must not be reported as success.
        "ok": (
            final_error is None
            and final_status_code is not None
            and 200 <= final_status_code < 300
            and final_json_error is None
        ),
    }
def load_discovery(path: Path) -> dict[str, Any]:
    """Read the discovery JSON document from disk (UTF-8) and return the parsed value."""
    raw_text = path.read_text(encoding="utf-8")
    document = json.loads(raw_text)
    return document
def market_is_usable(market: dict[str, Any], now: dt.datetime, safety_seconds: int) -> tuple[bool, list[str]]:
    """Decide whether a normalized market record can be collected.

    Args:
        market: normalized market record from the discovery file.
        now: reference time for the expiry check.
        safety_seconds: minimum seconds that must remain before the market ends.

    Returns:
        ``(usable, reasons)``, where ``reasons`` lists every failed check and is
        empty exactly when the market is usable.
    """
    reasons: list[str] = []
    # Flags must be exactly True/False; missing or truthy-but-not-bool values fail.
    if market.get("active") is not True:
        reasons.append("not_active")
    if market.get("closed") is not False:
        reasons.append("closed")
    if market.get("accepting_orders") is not True:
        reasons.append("not_accepting_orders")
    if market.get("enable_order_book") is not True:
        reasons.append("order_book_not_enabled")
    end_time = parse_iso(market.get("end_time_utc"))
    if end_time is None:
        reasons.append("missing_end_time")
    elif end_time <= now + dt.timedelta(seconds=safety_seconds):
        reasons.append("too_close_to_end_or_expired")
    tokens = market.get("tokens")
    if not isinstance(tokens, list) or len(tokens) < 2:
        reasons.append("missing_two_tokens")
    else:
        # Validate the same two leading entries that flatten_tokens consumes, so
        # a non-dict entry in either slot rejects the market instead of being
        # skipped over here and crashing later.
        leading = tokens[:2]
        if not all(isinstance(token, dict) for token in leading):
            reasons.append("bad_up_down_token_mapping")
        else:
            outcomes = [token.get("outcome") for token in leading]
            token_ids = [token.get("token_id") for token in leading]
            if outcomes != ["Up", "Down"] or not all(token_ids):
                reasons.append("bad_up_down_token_mapping")
    return not reasons, reasons
def select_markets(
    discovery: dict[str, Any],
    *,
    market_limit: int,
    market_end_safety_seconds: int,
) -> tuple[list[dict[str, Any]], dict[str, int]]:
    """Pick the first ``market_limit`` usable markets from the discovery document.

    Returns the selected markets together with a key-sorted tally of rejection
    reasons for the candidates inspected before the limit was reached.
    """
    reference_time = utc_now()
    accepted: list[dict[str, Any]] = []
    tally: dict[str, int] = {}

    def bump(reason: str) -> None:
        tally[reason] = tally.get(reason, 0) + 1

    for candidate in discovery.get("normalized_markets") or []:
        if not isinstance(candidate, dict):
            bump("not_object")
            continue
        usable, reasons = market_is_usable(candidate, reference_time, market_end_safety_seconds)
        if usable:
            accepted.append(candidate)
            if len(accepted) >= market_limit:
                break
        else:
            for reason in reasons:
                bump(reason)
    return accepted, dict(sorted(tally.items()))
def flatten_tokens(markets: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Expand each market into one record per outcome token (first two tokens only).

    Every record carries the parent market's identifying fields and the token id
    coerced to a string.
    """
    return [
        {
            "market_name": market.get("market_name"),
            "market_slug": market.get("market_slug"),
            "condition_id": market.get("condition_id"),
            "token_id": str(token.get("token_id")),
            "outcome": token.get("outcome"),
            "market_end_time_utc": market.get("end_time_utc"),
        }
        for market in markets
        for token in market.get("tokens", [])[:2]
    ]
def build_snapshot_envelope(
    *,
    raw_book: dict[str, Any],
    token_meta: dict[str, Any],
    collected_at_utc: str,
    sequence: int,
    request_record: dict[str, Any],
    response_index: int,
) -> dict[str, Any]:
    """Wrap one raw order book in the snapshot envelope written to the JSONL file.

    The envelope combines schema and collector identity, the token's market
    metadata, collection bookkeeping, a summary of the HTTP exchange and the
    untouched raw book.
    """
    market_fields = (
        "market_name",
        "market_slug",
        "condition_id",
        "token_id",
        "outcome",
        "market_end_time_utc",
    )
    market_section = {field: token_meta.get(field) for field in market_fields}
    collection_section = {
        "collected_at_utc": collected_at_utc,
        "sequence": sequence,
        "response_index": response_index,
    }
    sent = request_record["request"]
    request_section = {
        "method": sent["method"],
        "url": sent["url"],
        "params": None,
        "json_body": sent["json_body"],
        "status_code": request_record["response"]["status_code"],
        "duration_ms": request_record["duration_ms"],
        "attempts": request_record["attempts"],
    }
    envelope: dict[str, Any] = {
        "schema_name": SCHEMA_NAME,
        "schema_version": SCHEMA_VERSION,
        "collector": {"name": COLLECTOR_NAME, "version": COLLECTOR_VERSION},
        "market": market_section,
        "collection": collection_section,
        "request": request_section,
        "raw": raw_book,
    }
    return envelope
def summarize_output_file(path: Path, rows_written: int) -> dict[str, Any]:
    """Describe an output file for the manifest: path, status, size, row count, digest.

    The status is ``"valid"`` only for an existing non-empty file; anything else
    is reported as ``"missing"``.
    """
    present = path.exists()
    size = path.stat().st_size if present else 0
    status = "valid" if present and size > 0 else "missing"
    digest = sha256_file(path) if present else None
    return {
        "path": path.as_posix(),
        "status": status,
        "bytes": size,
        "rows": rows_written,
        "sha256": digest,
    }
def write_manifest(path: Path, manifest: dict[str, Any]) -> None:
    """Write the manifest as indented, key-sorted JSON, creating parent directories."""
    parent_dir = path.parent
    parent_dir.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(manifest, indent=2, sort_keys=True)
    path.write_text(f"{serialized}\n", encoding="utf-8")
def config_value(config: dict[str, Any], args: argparse.Namespace, key: str, default: Any) -> Any:
    """Resolve one setting with precedence CLI > config file > default.

    A value of ``None`` at either level means "not provided", so an explicit
    ``null`` in the config file falls through to ``default`` rather than
    reaching the callers' ``int()`` / ``float()`` / ``str()`` conversions. A CLI
    attribute that is absent from ``args`` is treated the same as unset.
    """
    cli_value = getattr(args, key, None)
    if cli_value is not None:
        return cli_value
    file_value = config.get(key)
    return default if file_value is None else file_value
def build_runtime_config(args: argparse.Namespace) -> dict[str, Any]:
    """Merge CLI arguments, the flat config file and defaults into the runtime settings.

    Each setting is resolved through config_value and coerced to its expected
    type. The config path, its SHA-256 and the parsed file content are kept
    alongside so the manifest can record them.

    Raises:
        ValueError: if a numeric setting is outside its usable range.
    """
    config_path = args.config
    file_config = load_flat_yaml(config_path) if config_path else {}

    def pick(key: str, default: Any) -> Any:
        return config_value(file_config, args, key, default)

    runtime = {
        "discovery_path": Path(pick("discovery_path", DEFAULT_DISCOVERY_PATH)),
        "output_dir": Path(pick("output_dir", DEFAULT_OUTPUT_DIR)),
        "manifest_path": Path(pick("manifest_path", DEFAULT_MANIFEST_PATH)),
        "market_limit": int(pick("market_limit", 2)),
        "interval_seconds": float(pick("interval_seconds", 30.0)),
        "duration_seconds": float(pick("duration_seconds", 300.0)),
        "request_timeout_seconds": float(pick("request_timeout_seconds", 15.0)),
        "max_retries": int(pick("max_retries", 2)),
        "backoff_seconds": float(pick("backoff_seconds", 2.0)),
        "market_end_safety_seconds": int(pick("market_end_safety_seconds", 420)),
        "clob_books_url": str(pick("clob_books_url", CLOB_BOOKS_URL)),
        "config_path": config_path,
        "config_sha256": config_digest(config_path),
        "config_snapshot": file_config,
    }
    if runtime["market_limit"] < 1:
        raise ValueError("market_limit must be >= 1")
    if runtime["interval_seconds"] <= 0:
        raise ValueError("interval_seconds must be > 0")
    if runtime["duration_seconds"] <= 0:
        raise ValueError("duration_seconds must be > 0")
    if runtime["request_timeout_seconds"] <= 0:
        raise ValueError("request_timeout_seconds must be > 0")
    # A negative retry count would make http_post_json perform zero attempts.
    if runtime["max_retries"] < 0:
        raise ValueError("max_retries must be >= 0")
    # time.sleep rejects negative durations, so refuse a negative backoff up front.
    if runtime["backoff_seconds"] < 0:
        raise ValueError("backoff_seconds must be >= 0")
    return runtime
def run_collection(runtime: dict[str, Any], command: str) -> tuple[dict[str, Any], Path]:
    """Poll the order-book endpoint for a bounded time and write gzip JSONL snapshots.

    Steps:
      1. Install SIGINT/SIGTERM handlers so the loop can stop gracefully.
      2. Select usable markets from the discovery file and flatten their tokens.
      3. Until the deadline or a stop request, POST all token ids in one
         request per interval and write one envelope row per returned book.
      4. Build the run manifest, including the gate status.

    Args:
        runtime: settings produced by build_runtime_config.
        command: the command line, recorded verbatim in the manifest.

    Returns:
        ``(manifest, output_file)``. The manifest is returned, not written here.
    """
    signal.signal(signal.SIGINT, handle_stop)
    signal.signal(signal.SIGTERM, handle_stop)
    started = utc_now()
    started_at_utc = iso_z(started)
    discovery_path: Path = runtime["discovery_path"]
    discovery = load_discovery(discovery_path)
    selected_markets, rejection_counts = select_markets(
        discovery,
        market_limit=runtime["market_limit"],
        market_end_safety_seconds=runtime["market_end_safety_seconds"],
    )
    warnings: list[str] = []
    failures: list[dict[str, Any]] = []
    if not selected_markets:
        warnings.append("No usable active BTC markets found in discovery input.")
    tokens = flatten_tokens(selected_markets)
    run_id = compact_timestamp(started)
    output_dir = runtime["output_dir"] / "polymarket" / "orderbooks" / run_id
    output_dir.mkdir(parents=True, exist_ok=True)
    output_file = output_dir / f"polymarket_orderbooks_{run_id}.jsonl.gz"
    request_count = 0
    success_count = 0
    failure_count = 0
    status_code_counts: dict[str, int] = {}
    rows_written = 0
    sequence = 0
    token_row_counts = {token["token_id"]: 0 for token in tokens}
    deadline = time.monotonic() + runtime["duration_seconds"]
    token_by_id = {token["token_id"]: token for token in tokens}
    # The same batch body (every tracked token id) is sent on each poll.
    request_body = [{"token_id": token["token_id"]} for token in tokens]
    with gzip.open(output_file, "wt", encoding="utf-8") as handle:
        while tokens and not STOP_REQUESTED and time.monotonic() < deadline:
            loop_started = time.monotonic()
            collected_at_utc = iso_z()
            request_count += 1
            request_record = http_post_json(
                url=runtime["clob_books_url"],
                json_body=request_body,
                timeout_seconds=runtime["request_timeout_seconds"],
                max_retries=runtime["max_retries"],
                backoff_seconds=runtime["backoff_seconds"],
            )
            status_code = request_record["response"]["status_code"]
            status_key = str(status_code)
            status_code_counts[status_key] = status_code_counts.get(status_key, 0) + 1
            if request_record["ok"] and isinstance(request_record["response"]["json"], list):
                success_count += 1
                for response_index, raw_book in enumerate(request_record["response"]["json"]):
                    if not isinstance(raw_book, dict):
                        failure_count += 1
                        failures.append(
                            {
                                "collected_at_utc": collected_at_utc,
                                "reason": "book_response_item_not_object",
                                "response_index": response_index,
                            }
                        )
                        continue
                    asset_id = str(raw_book.get("asset_id") or "")
                    token_meta = token_by_id.get(asset_id)
                    if token_meta is None:
                        failure_count += 1
                        failures.append(
                            {
                                "collected_at_utc": collected_at_utc,
                                "reason": "unknown_asset_id_in_book_response",
                                "asset_id": asset_id,
                            }
                        )
                        continue
                    sequence += 1
                    envelope = build_snapshot_envelope(
                        raw_book=raw_book,
                        token_meta=token_meta,
                        collected_at_utc=collected_at_utc,
                        sequence=sequence,
                        request_record=request_record,
                        response_index=response_index,
                    )
                    handle.write(json.dumps(envelope, separators=(",", ":"), sort_keys=True) + "\n")
                    rows_written += 1
                    token_row_counts[asset_id] = token_row_counts.get(asset_id, 0) + 1
                # Push the batch out of the text buffer after each successful poll.
                handle.flush()
            else:
                failure_count += 1
                failures.append(
                    {
                        "collected_at_utc": collected_at_utc,
                        "reason": "request_failed_or_non_json_list",
                        "status_code": status_code,
                        "attempts": request_record["attempts"],
                        "json_error": request_record["response"]["json_error"],
                        "text_preview": request_record["response"]["text_preview"],
                    }
                )
            # Wait out the rest of the interval in slices of at most one second
            # so a stop request or the deadline is noticed promptly.
            remaining_interval = runtime["interval_seconds"] - (time.monotonic() - loop_started)
            while remaining_interval > 0 and not STOP_REQUESTED and time.monotonic() < deadline:
                sleep_for = min(remaining_interval, deadline - time.monotonic(), 1.0)
                if sleep_for <= 0:
                    break
                time.sleep(sleep_for)
                remaining_interval = runtime["interval_seconds"] - (time.monotonic() - loop_started)
    ended = utc_now()
    ended_at_utc = iso_z(ended)
    duration_seconds_actual = round((ended - started).total_seconds(), 3)
    if STOP_REQUESTED:
        warnings.append(f"Graceful shutdown requested by {STOP_SIGNAL}.")
    if runtime["duration_seconds"] < 300:
        warnings.append("Configured run duration was shorter than the roadmap 5-minute sample target.")
    output_summary = summarize_output_file(output_file, rows_written)
    # PASS needs at least one row for every tracked token. Having no tokens (or
    # no requests and no tokens) is BLOCKED rather than FAIL.
    gate_status = "PASS" if rows_written > 0 and all(count > 0 for count in token_row_counts.values()) else "FAIL"
    if not tokens:
        gate_status = "BLOCKED"
    if request_count == 0:
        gate_status = "FAIL" if tokens else "BLOCKED"
    manifest = {
        "schema_name": "orderbook_collector_sample_manifest",
        "schema_version": 1,
        "checkpoint_id": 4,
        "checkpoint_name": "Minimal Orderbook Snapshot Collector",
        "gate_status": gate_status,
        "collector": {
            "name": COLLECTOR_NAME,
            "version": COLLECTOR_VERSION,
        },
        "started_at_utc": started_at_utc,
        "ended_at_utc": ended_at_utc,
        "run_duration_seconds": duration_seconds_actual,
        "configured_duration_seconds": runtime["duration_seconds"],
        "interval_seconds": runtime["interval_seconds"],
        "command": command,
        "config": {
            "path": runtime["config_path"].as_posix() if runtime["config_path"] else None,
            "sha256": runtime["config_sha256"],
            "snapshot": runtime["config_snapshot"],
            "effective": {
                "discovery_path": discovery_path.as_posix(),
                "output_dir": runtime["output_dir"].as_posix(),
                "manifest_path": runtime["manifest_path"].as_posix(),
                "market_limit": runtime["market_limit"],
                "interval_seconds": runtime["interval_seconds"],
                "duration_seconds": runtime["duration_seconds"],
                "request_timeout_seconds": runtime["request_timeout_seconds"],
                "max_retries": runtime["max_retries"],
                "backoff_seconds": runtime["backoff_seconds"],
                "market_end_safety_seconds": runtime["market_end_safety_seconds"],
                "clob_books_url": runtime["clob_books_url"],
            },
        },
        "discovery": {
            "path": discovery_path.as_posix(),
            "fetched_at_utc": discovery.get("fetched_at_utc"),
            "source_summary": discovery.get("summary"),
            "rejection_counts_before_selection": rejection_counts,
        },
        "markets_tracked": [
            {
                "market_name": market.get("market_name"),
                "market_slug": market.get("market_slug"),
                "condition_id": market.get("condition_id"),
                "end_time_utc": market.get("end_time_utc"),
            }
            for market in selected_markets
        ],
        "tokens_tracked": tokens,
        "request_count": request_count,
        "success_count": success_count,
        "failure_count": failure_count,
        "status_code_counts": dict(sorted(status_code_counts.items())),
        "rows_written": rows_written,
        "token_row_counts": token_row_counts,
        "output_files": [output_summary],
        "failures": failures,
        "warnings": warnings,
        "known_gaps": [
            "This is a short run-rotated sample, not a daemon.",
            "Hourly rotation is documented but not implemented in this checkpoint.",
            "No websocket capture, normalization, upload, systemd unit, dashboard, database, or trading behavior is included.",
            "A 5-minute sample proves file-writing behavior only; it does not prove 24/7 reliability.",
        ],
        "fake_progress_risk": "A small successful sample can still hide long-run gaps, stale discovery, endpoint schema drift, and missed intervals. Reliability remains gated on the future 24h soak test.",
        "next_step": "Checkpoint 5 should normalize this raw sample while preserving raw file references, or rerun a fresh short sample if the orchestrator wants more raw evidence first.",
    }
    return manifest, output_file
def parse_args() -> argparse.Namespace:
    """Parse the collector's command line.

    Every option except ``--config`` defaults to None so that unset options can
    be told apart from explicit values when merging with the config file.
    """
    cli = argparse.ArgumentParser(
        description="Collect a bounded raw gzip JSONL sample of Polymarket BTC order books."
    )
    cli.add_argument("--config", type=Path, default=DEFAULT_CONFIG_PATH)
    override_options = (
        ("--discovery-path", Path),
        ("--output-dir", Path),
        ("--manifest-path", Path),
        ("--market-limit", int),
        ("--interval-seconds", float),
        ("--duration-seconds", float),
        ("--request-timeout-seconds", float),
        ("--max-retries", int),
        ("--backoff-seconds", float),
        ("--market-end-safety-seconds", int),
        ("--clob-books-url", str),
    )
    for flag, caster in override_options:
        cli.add_argument(flag, type=caster, default=None)
    return cli.parse_args()
def main() -> int:
    """Run one collection, write the manifest and print a JSON summary.

    Returns 0 when the gate status is PASS and 1 otherwise.
    """
    args = parse_args()
    invocation = [Path(sys.argv[0]).as_posix()]
    invocation.extend(sys.argv[1:])
    command = " ".join(invocation)
    runtime = build_runtime_config(args)
    manifest, output_file = run_collection(runtime, command)
    manifest_path = runtime["manifest_path"]
    write_manifest(manifest_path, manifest)
    summary: dict[str, Any] = {
        "manifest_path": manifest_path.as_posix(),
        "output_file": output_file.as_posix(),
        "tokens_tracked": len(manifest["tokens_tracked"]),
    }
    # These fields are copied from the manifest unchanged.
    for field in (
        "gate_status",
        "markets_tracked",
        "request_count",
        "success_count",
        "failure_count",
        "rows_written",
        "warnings",
    ):
        summary[field] = manifest[field]
    print(json.dumps(summary, indent=2, sort_keys=True))
    if manifest["gate_status"] == "PASS":
        return 0
    return 1


if __name__ == "__main__":
    raise SystemExit(main())

View file

@ -0,0 +1,146 @@
#!/usr/bin/env bash
# Bootstrap the orderbooks deployment: namespace, registry pull secret, and the
# Forgejo repository with its Actions secret and variables.
# Abort on errors, unset variables and failed pipeline stages.
set -euo pipefail
# Repository root, resolved as two directories above this script.
ROOT_DIR="$(cd "$(dirname "$0")/../.." && pwd)"
# Platform checkout that provides env files and kubeconfigs; each path below
# can be overridden from the environment.
PLATFORM_REPO_DIR="${PLATFORM_REPO_DIR:-/home/philipp/dev/ae/nuri/unrip3}"
PLATFORM_ENV_FILE="${PLATFORM_ENV_FILE:-$PLATFORM_REPO_DIR/scripts/hetzner/bootstrap-secrets.env}"
PLATFORM_RESOLVED_ENV_FILE="${PLATFORM_RESOLVED_ENV_FILE:-$PLATFORM_REPO_DIR/.state/hetzner/bootstrap-secrets.resolved.env}"
KUBECONFIG_PATH="${KUBECONFIG_PATH:-$PLATFORM_REPO_DIR/.state/hetzner/kubeconfig.yaml}"
CI_KUBECONFIG_PATH="${CI_KUBECONFIG_PATH:-$PLATFORM_REPO_DIR/.state/hetzner/kubeconfig.incluster.yaml}"
# Initial app identity defaults. These are reassigned from ORDERBOOKS_* values
# after the platform env files have been loaded further down.
PROJECT_NAME="${PROJECT_NAME:-orderbooks}"
PROJECT_NAMESPACE="${PROJECT_NAMESPACE:-orderbooks}"
PROJECT_DEPLOYMENTS="${PROJECT_DEPLOYMENTS:-orderbooks-collector}"
PROJECT_REGISTRY_SECRET_NAME="${PROJECT_REGISTRY_SECRET_NAME:-orderbooks-registry-creds}"
RCLONE_SECRET_NAME="${RCLONE_SECRET_NAME:-orderbooks-rclone-config}"
RCLONE_SECRET_KEY="${RCLONE_SECRET_KEY:-rclone.conf}"
FORGEJO_REPO_OWNER="${FORGEJO_REPO_OWNER:-philipp}"
FORGEJO_REPO_NAME="${FORGEJO_REPO_NAME:-orderbooks}"
FORGEJO_REPO_PRIVATE="${FORGEJO_REPO_PRIVATE:-0}"
# Exit with an error message unless the named command is available on PATH.
require() {
  local tool="$1"
  if ! command -v "$tool" >/dev/null 2>&1; then
    echo "missing required command: $tool" >&2
    exit 1
  fi
}
# Export KEY=VALUE pairs from an env file as defaults. A missing file is a
# no-op, and variables already set in the environment are left untouched.
# Blank lines, '#' comments and lines without '=' are skipped; a leading
# 'export ' and one pair of matching surrounding quotes are removed.
# Keys that are not valid shell identifiers are skipped with a warning, because
# the generated export lines are eval'd and the key cannot be quoted.
load_env_defaults() {
  local file="$1"
  [[ -f "$file" ]] || return 0
  # Capture the output first so a python failure is not hidden inside eval.
  local exports
  exports="$(
    python3 - "$file" <<'PY_LOAD_ENV'
import os
import re
import shlex
import sys

VALID_KEY = re.compile(r'[A-Za-z_][A-Za-z0-9_]*')

with open(sys.argv[1], 'r', encoding='utf-8') as handle:
    for line_number, raw in enumerate(handle, 1):
        line = raw.strip()
        if not line or line.startswith('#'):
            continue
        if line.startswith('export '):
            line = line[len('export '):]
        if '=' not in line:
            continue
        key, value = line.split('=', 1)
        key = key.strip()
        value = value.strip()
        if not VALID_KEY.fullmatch(key):
            # Report only the line number; never echo file content.
            print(f'skipping invalid env key on line {line_number}', file=sys.stderr)
            continue
        if len(value) >= 2 and value[0] == value[-1] and value[0] in {'"', "'"}:
            value = value[1:-1]
        if key in os.environ:
            continue
        print(f'export {key}={shlex.quote(value)}')
PY_LOAD_ENV
  )"
  eval "$exports"
}
# --- Preconditions: the tools this script depends on ---
require kubectl
require python3
# NOTE(review): base64 is required here but is not invoked anywhere in this
# script's visible body; confirm whether the check is still needed.
require base64
# Load platform defaults. Values already present in the environment win, and
# the first file wins over the resolved file for keys they share.
load_env_defaults "$PLATFORM_ENV_FILE"
load_env_defaults "$PLATFORM_RESOLVED_ENV_FILE"
# Force orderbooks app identity after loading platform defaults. The platform
# env file may describe the platform repo itself, not this app repo.
PROJECT_NAME="${ORDERBOOKS_PROJECT_NAME:-orderbooks}"
PROJECT_NAMESPACE="${ORDERBOOKS_PROJECT_NAMESPACE:-orderbooks}"
PROJECT_DEPLOYMENTS="${ORDERBOOKS_PROJECT_DEPLOYMENTS:-orderbooks-collector}"
PROJECT_REGISTRY_SECRET_NAME="${ORDERBOOKS_PROJECT_REGISTRY_SECRET_NAME:-orderbooks-registry-creds}"
RCLONE_SECRET_NAME="${ORDERBOOKS_RCLONE_SECRET_NAME:-orderbooks-rclone-config}"
RCLONE_SECRET_KEY="${ORDERBOOKS_RCLONE_SECRET_KEY:-rclone.conf}"
FORGEJO_REPO_OWNER="${ORDERBOOKS_FORGEJO_REPO_OWNER:-philipp}"
FORGEJO_REPO_NAME="${ORDERBOOKS_FORGEJO_REPO_NAME:-orderbooks}"
FORGEJO_REPO_PRIVATE="${ORDERBOOKS_FORGEJO_REPO_PRIVATE:-0}"
# --- Validate kubeconfig inputs; kubectl below uses KUBECONFIG_PATH ---
: "${KUBECONFIG_PATH:?missing kubeconfig path}"
: "${CI_KUBECONFIG_PATH:?missing CI kubeconfig path}"
[[ -f "$KUBECONFIG_PATH" ]] || { echo "missing kubeconfig file" >&2; exit 1; }
[[ -f "$CI_KUBECONFIG_PATH" ]] || { echo "missing in-cluster kubeconfig file" >&2; exit 1; }
export KUBECONFIG="$KUBECONFIG_PATH"
# --- Resolve the Forgejo URL: FORGEJO_URL, else FORGEJO_ROOT_URL, else https://FORGEJO_DOMAIN ---
if [[ -z "${FORGEJO_URL:-}" ]]; then
if [[ -n "${FORGEJO_ROOT_URL:-}" ]]; then
FORGEJO_URL="$FORGEJO_ROOT_URL"
elif [[ -n "${FORGEJO_DOMAIN:-}" ]]; then
FORGEJO_URL="https://${FORGEJO_DOMAIN}"
else
echo "missing Forgejo URL" >&2
exit 1
fi
fi
# The admin username is always required; the password only when no token is set.
: "${FORGEJO_ADMIN_USERNAME:?missing Forgejo admin username}"
if [[ -z "${FORGEJO_TOKEN:-}" ]]; then
: "${FORGEJO_ADMIN_PASSWORD:?missing Forgejo password or token}"
fi
# --- Resolve the registry host: REGISTRY_HOST, else REGISTRY_DOMAIN ---
if [[ -z "${REGISTRY_HOST:-}" ]]; then
if [[ -n "${REGISTRY_DOMAIN:-}" ]]; then
REGISTRY_HOST="$REGISTRY_DOMAIN"
else
echo "missing registry host" >&2
exit 1
fi
fi
: "${REGISTRY_USERNAME:?missing registry username}"
: "${REGISTRY_PASSWORD:?missing registry password}"
# --- Cluster resources: render client-side, then apply, so reruns update in place ---
echo "ensuring namespace ${PROJECT_NAMESPACE}"
kubectl create namespace "$PROJECT_NAMESPACE" --dry-run=client -o yaml | kubectl apply -f -
echo "upserting registry secret ${PROJECT_REGISTRY_SECRET_NAME}"
kubectl -n "$PROJECT_NAMESPACE" create secret docker-registry "$PROJECT_REGISTRY_SECRET_NAME" \
--docker-server="$REGISTRY_HOST" \
--docker-username="$REGISTRY_USERNAME" \
--docker-password="$REGISTRY_PASSWORD" \
--dry-run=client -o yaml | kubectl apply -f -
# Print only whether the rclone key is present in the secret, never its value.
echo "checking rclone secret key presence"
kubectl -n "$PROJECT_NAMESPACE" get secret "$RCLONE_SECRET_NAME" \
-o "go-template={{if index .data \"${RCLONE_SECRET_KEY}\"}}rclone_secret_key_present{{else}}rclone_secret_key_missing{{end}}{{\"\\n\"}}"
# --- Forgejo repository and Actions settings ---
echo "upserting Forgejo repo and Actions settings"
# Authenticate with the token when one is set, otherwise with admin credentials.
forgejo_args=()
if [[ -n "${FORGEJO_TOKEN:-}" ]]; then
forgejo_args+=(--token "$FORGEJO_TOKEN")
else
forgejo_args+=(--admin-username "$FORGEJO_ADMIN_USERNAME" --admin-password "$FORGEJO_ADMIN_PASSWORD")
fi
if [[ "$FORGEJO_REPO_PRIVATE" == "1" || "$FORGEJO_REPO_PRIVATE" == "true" ]]; then
forgejo_args+=(--repo-private)
fi
python3 "$ROOT_DIR/scripts/deploy/forgejo_repo_bootstrap.py" \
--forgejo-url "$FORGEJO_URL" \
--repo-owner "$FORGEJO_REPO_OWNER" \
--repo-name "$FORGEJO_REPO_NAME" \
--ci-kubeconfig "$CI_KUBECONFIG_PATH" \
--registry-host "$REGISTRY_HOST" \
--project-name "$PROJECT_NAME" \
--project-namespace "$PROJECT_NAMESPACE" \
--project-deployments "$PROJECT_DEPLOYMENTS" \
--project-registry-secret-name "$PROJECT_REGISTRY_SECRET_NAME" \
"${forgejo_args[@]}"
echo "bootstrap complete for ${FORGEJO_REPO_OWNER}/${FORGEJO_REPO_NAME} in namespace ${PROJECT_NAMESPACE}"

View file

@ -0,0 +1,121 @@
#!/usr/bin/env python3
"""Orderbooks-specific Forgejo repo bootstrap.
Creates/updates the Forgejo repository plus Actions settings for the Kubernetes
orderbooks deployment. This script deliberately does not print secret values.
"""
from __future__ import annotations
import argparse
import base64
import json
import ssl
import urllib.error
import urllib.parse
import urllib.request
from pathlib import Path
class ForgejoClient:
def __init__(self, base_url: str, username: str | None = None, password: str | None = None, token: str | None = None):
self.base_url = base_url.rstrip('/')
self.username = username or ''
self.headers = {'Accept': 'application/json', 'Content-Type': 'application/json'}
if token:
self.headers['Authorization'] = f'token {token}'
elif username is not None and password is not None:
credentials = base64.b64encode(f'{username}:{password}'.encode()).decode()
self.headers['Authorization'] = f'Basic {credentials}'
else:
raise ValueError('ForgejoClient requires either token auth or username/password auth')
self.ssl_context = ssl.create_default_context()
def request(self, method: str, path: str, payload=None, expected=(200, 201, 204)):
data = json.dumps(payload).encode() if payload is not None else None
req = urllib.request.Request(f'{self.base_url}{path}', data=data, method=method)
for key, value in self.headers.items():
req.add_header(key, value)
try:
with urllib.request.urlopen(req, context=self.ssl_context) as response:
body = response.read().decode() if response.length != 0 else ''
if response.status not in expected:
raise RuntimeError(f'{method} {path} returned {response.status}: {body[:500]}')
return json.loads(body) if body else None
except urllib.error.HTTPError as exc:
body = exc.read().decode()
if exc.code not in expected:
raise RuntimeError(f'{method} {path} returned {exc.code}: {body[:500]}') from exc
return json.loads(body) if body else None
def get_repo(self, owner: str, repo: str):
try:
return self.request('GET', f'/api/v1/repos/{urllib.parse.quote(owner)}/{urllib.parse.quote(repo)}')
except RuntimeError as exc:
if ' returned 404:' in str(exc):
return None
raise
def create_repo(self, owner: str, name: str, private: bool):
payload = {'name': name, 'private': private, 'auto_init': False, 'default_branch': 'main'}
if owner == self.username:
return self.request('POST', '/api/v1/user/repos', payload, expected=(201,))
return self.request('POST', f'/api/v1/orgs/{urllib.parse.quote(owner)}/repos', payload, expected=(201,))
def upsert_variable(self, owner: str, repo: str, name: str, value: str):
path = f'/api/v1/repos/{urllib.parse.quote(owner)}/{urllib.parse.quote(repo)}/actions/variables/{urllib.parse.quote(name)}'
try:
self.request('POST', path, {'value': value}, expected=(201, 204))
except RuntimeError as exc:
if ' returned 409:' not in str(exc) and ' returned 422:' not in str(exc):
raise
self.request('PUT', path, {'value': value}, expected=(201, 204))
def upsert_secret(self, owner: str, repo: str, name: str, value: str):
path = f'/api/v1/repos/{urllib.parse.quote(owner)}/{urllib.parse.quote(repo)}/actions/secrets/{urllib.parse.quote(name)}'
self.request('PUT', path, {'data': value}, expected=(201, 204))
def main() -> None:
    """Ensure the Forgejo repo exists, then upsert its Actions secret and variables.

    Only names and status messages are printed, never secret values.
    """
    parser = argparse.ArgumentParser(description='Bootstrap Forgejo Actions settings for orderbooks')
    # Declared in a table to keep flag order (and therefore --help output) explicit.
    option_specs = (
        ('--forgejo-url', {'required': True}),
        ('--admin-username', {}),
        ('--admin-password', {}),
        ('--token', {}),
        ('--repo-owner', {'required': True}),
        ('--repo-name', {'required': True}),
        ('--repo-private', {'action': 'store_true'}),
        ('--ci-kubeconfig', {'required': True}),
        ('--registry-host', {'required': True}),
        ('--project-name', {'required': True}),
        ('--project-namespace', {'required': True}),
        ('--project-deployments', {'required': True}),
        ('--project-registry-secret-name', {'required': True}),
    )
    for flag, options in option_specs:
        parser.add_argument(flag, **options)
    args = parser.parse_args()

    client = ForgejoClient(args.forgejo_url, args.admin_username, args.admin_password, args.token)
    owner, repo_name = args.repo_owner, args.repo_name

    existing = client.get_repo(owner, repo_name)
    if existing is not None:
        print(f'repo already exists: {existing["full_name"]}')
    else:
        created = client.create_repo(owner, repo_name, args.repo_private)
        print(f'created repo {created["full_name"]}')

    # The CI kubeconfig is stored base64-encoded as a repository Actions secret.
    kubeconfig_bytes = Path(args.ci_kubeconfig).read_bytes()
    client.upsert_secret(owner, repo_name, 'KUBECONFIG_B64', base64.b64encode(kubeconfig_bytes).decode())
    print('upserted repo action secret KUBECONFIG_B64')

    action_variables = (
        ('REGISTRY_HOST', args.registry_host),
        ('PROJECT_NAME', args.project_name),
        ('PROJECT_NAMESPACE', args.project_namespace),
        ('PROJECT_DEPLOYMENTS', args.project_deployments),
        ('PROJECT_REGISTRY_SECRET_NAME', args.project_registry_secret_name),
    )
    for variable_name, variable_value in action_variables:
        client.upsert_variable(owner, repo_name, variable_name, variable_value)
    print('upserted repo action variables')


if __name__ == '__main__':
    main()

View file

@ -0,0 +1,752 @@
#!/usr/bin/env python3
"""Discover active Polymarket BTC up/down markets.
Checkpoint 3 scope: fetch bounded public Gamma metadata, preserve raw responses,
and write normalized market records with outcome-token mappings. This is not an
order-book collector.
"""
from __future__ import annotations
import argparse
import datetime as dt
import hashlib
import json
import sys
import time
import urllib.error
import urllib.parse
import urllib.request
from pathlib import Path
from typing import Any
# Public Gamma events endpoint; FILTER_RULES below describes the query used against it.
GAMMA_EVENTS_URL = "https://gamma-api.polymarket.com/events"
# Tag id sent as tag_id (see the first FILTER_RULES entry).
BTC_TAG_ID = 235
# Default output locations for the discovery artifacts.
DEFAULT_OUTPUT_JSON = Path("data/discovery/polymarket_btc_markets_latest.json")
DEFAULT_MANIFEST = Path("data/discovery/polymarket_btc_markets_manifest.json")
DEFAULT_MARKDOWN = Path("data/discovery/polymarket_btc_markets.md")
# Allow-list of response header names (lowercase) that filter_headers keeps.
SAFE_RESPONSE_HEADERS = {
    "age",
    "cache-control",
    "cf-cache-status",
    "cf-ray",
    "content-encoding",
    "content-length",
    "content-type",
    "date",
    "expires",
    "last-modified",
    "ratelimit-limit",
    "ratelimit-remaining",
    "ratelimit-reset",
    "retry-after",
    "server",
    "strict-transport-security",
    "x-ratelimit-limit",
    "x-ratelimit-remaining",
    "x-ratelimit-reset",
}
# Human-readable description of the market selection rules.
# NOTE(review): presumably copied into the discovery outputs; the consumer is
# outside this chunk, so confirm before relying on that.
FILTER_RULES = [
    "Use public Gamma /events with tag_id=235, related_tags=true, active=true, closed=false.",
    "Require event.active=true and event.closed=false.",
    "Require market.active=true and market.closed=false.",
    "Require market.enableOrderBook=true.",
    "Require market.acceptingOrders=true unless --allow-non-accepting-orders is used.",
    "Require market end time to be after the fetch time unless --allow-expired is used.",
    "Require outcomes to resolve to exactly Up and Down.",
    "Require clobTokenIds to resolve to exactly two token IDs.",
    "Require BTC/up-down evidence from seriesSlug, title/slug text, or tags.",
]
def utc_now() -> dt.datetime:
return dt.datetime.now(dt.UTC)
def iso_z(value: dt.datetime | None = None) -> str:
value = value or utc_now()
return value.astimezone(dt.UTC).replace(microsecond=0).isoformat().replace("+00:00", "Z")
def parse_iso(value: Any) -> dt.datetime | None:
if not isinstance(value, str) or not value.strip():
return None
text = value.strip()
if text.endswith("Z"):
text = text[:-1] + "+00:00"
try:
parsed = dt.datetime.fromisoformat(text)
except ValueError:
return None
if parsed.tzinfo is None:
parsed = parsed.replace(tzinfo=dt.UTC)
return parsed.astimezone(dt.UTC)
def sha256_file(path: Path) -> str:
    """Compute the hex SHA-256 of a file, streaming it in 1 MiB reads."""
    hasher = hashlib.sha256()
    with path.open("rb") as stream:
        while True:
            block = stream.read(1024 * 1024)
            if block == b"":
                break
            hasher.update(block)
    return hasher.hexdigest()
def filter_headers(headers: Any) -> dict[str, str]:
    """Keep only the response headers named in SAFE_RESPONSE_HEADERS.

    Args:
        headers: Anything ``dict()`` accepts (a mapping or key/value pairs).

    Returns:
        A new dict with the original header spelling as key; matching against
        the allow-list is done on the lower-cased name.
    """
    return {
        header_name: header_value
        for header_name, header_value in dict(headers).items()
        if header_name.lower() in SAFE_RESPONSE_HEADERS
    }
def normalize_params(params: dict[str, Any]) -> dict[str, Any]:
    """Return a copy of ``params`` with booleans spelled as "true"/"false".

    All other values are passed through unchanged, and key order is kept.
    """

    def encode(item: Any) -> Any:
        # Only real bools are rewritten; ints such as 0/1 stay numeric.
        if isinstance(item, bool):
            return "true" if item else "false"
        return item

    return {param_name: encode(item) for param_name, item in params.items()}
def build_url(url: str, params: dict[str, Any]) -> str:
    """Append ``params`` to ``url`` as a URL-encoded query string.

    Booleans are first converted by normalize_params(); sequence values are
    expanded into repeated keys (``doseq=True``). A "?" is always inserted
    between ``url`` and the encoded query.
    """
    prepared = normalize_params(params)
    encoded = urllib.parse.urlencode(prepared, doseq=True)
    return "{}?{}".format(url, encoded)
def fetch_json_page(
    *,
    name: str,
    url: str,
    params: dict[str, Any],
    timeout_seconds: float,
) -> dict[str, Any]:
    """GET one JSON page and return a self-describing request/response envelope.

    The function never raises for transport or HTTP failures; they are
    recorded in the envelope so the raw evidence can be stored.

    Args:
        name: Label stored in the envelope to identify this request.
        url: Endpoint URL without query string.
        params: Query parameters; encoded by build_url().
        timeout_seconds: Timeout passed to ``urllib.request.urlopen``.

    Returns:
        A dict with "name", start/end timestamps, "duration_ms", a "request"
        section, a "response" section (status code, allow-listed headers,
        parsed JSON or a 1000-character text preview), an "ok" flag that is
        True only for an error-free 2xx response, and "error".
    """
    started_monotonic = time.monotonic()
    started_at_utc = iso_z()
    full_url = build_url(url, params)
    request = urllib.request.Request(
        full_url,
        headers={
            "Accept": "application/json",
            "User-Agent": "orderbooks-checkpoint-3-discovery/1.0",
        },
        method="GET",
    )
    status_code: int | None = None
    response_headers: dict[str, str] = {}
    response_text = ""
    error: str | None = None
    try:
        with urllib.request.urlopen(request, timeout=timeout_seconds) as response:
            status_code = response.status
            response_headers = filter_headers(response.headers)
            response_text = response.read().decode("utf-8", errors="replace")
    except urllib.error.HTTPError as exc:
        status_code = exc.code
        error = f"HTTPError: {exc}"
        # Reading the error body happens inside an except handler, so a failure
        # here would not be caught by the generic handler below. Guard it
        # separately so the status code is still recorded, and use the error as
        # a context manager so the underlying response is closed.
        try:
            with exc:
                response_headers = filter_headers(exc.headers or {})
                response_text = exc.read().decode("utf-8", errors="replace")
        except Exception as body_exc:  # noqa: BLE001 - keep the HTTP status as evidence
            error = f"{error}; error body unreadable ({type(body_exc).__name__}: {body_exc})"
    except Exception as exc:  # noqa: BLE001 - preserve probe failure evidence
        error = f"{type(exc).__name__}: {exc}"
    response_json: Any | None = None
    json_error: str | None = None
    if response_text:
        try:
            response_json = json.loads(response_text)
        except json.JSONDecodeError as exc:
            json_error = str(exc)
    return {
        "name": name,
        "started_at_utc": started_at_utc,
        "ended_at_utc": iso_z(),
        "duration_ms": round((time.monotonic() - started_monotonic) * 1000, 3),
        "request": {
            "method": "GET",
            "url": url,
            "full_url": full_url,
            "params": normalize_params(params),
        },
        "response": {
            "status_code": status_code,
            "headers": response_headers,
            "json": response_json,
            "json_error": json_error,
            # Keep a short preview only when the body could not be parsed.
            "text_preview": response_text[:1000] if response_json is None else None,
        },
        "ok": error is None and status_code is not None and 200 <= status_code < 300,
        "error": error,
    }
def coerce_json_array(value: Any) -> list[Any]:
    """Return ``value`` as a list, decoding JSON-encoded strings when needed.

    A list is returned as-is. A string is parsed as JSON and returned only if
    it decodes to a list. Every other case (other types, invalid JSON, JSON
    that is not an array) yields an empty list.
    """
    candidate = value
    if isinstance(candidate, str):
        try:
            candidate = json.loads(candidate)
        except json.JSONDecodeError:
            return []
    return candidate if isinstance(candidate, list) else []
def lower_text(value: Any) -> str:
    """Return ``value`` as lower-case text; any falsy value becomes ""."""
    if not value:
        return ""
    return str(value).lower()
def event_tag_text(event: dict[str, Any]) -> str:
    """Join the slug and label of every tag on ``event`` into one lower-case string.

    Tags that are not dicts are ignored; a missing slug or label contributes
    an empty piece, so the result may contain consecutive spaces.
    """
    pieces = [
        str(tag.get(field) or "")
        for tag in (event.get("tags") or [])
        if isinstance(tag, dict)
        for field in ("slug", "label")
    ]
    return " ".join(pieces).lower()
def has_btc_up_down_evidence(event: dict[str, Any], market: dict[str, Any]) -> bool:
    """Report whether the event/market looks like a BTC up/down market.

    Any one of three signals is sufficient:
      * the event ``seriesSlug`` starts with "btc-up-or-down";
      * the combined event/market text mentions "bitcoin" or "btc" and also
        contains "up" and "down";
      * the event tags mention "bitcoin" or "btc" and contain "up-or-down".

    All comparisons are case-insensitive substring checks, so "up" and "down"
    also match inside longer words.
    """

    def mentions_btc(haystack: str) -> bool:
        return "bitcoin" in haystack or "btc" in haystack

    series_slug = lower_text(event.get("seriesSlug"))
    event_text = " ".join(
        lower_text(event.get(field)) for field in ("title", "slug", "ticker", "description")
    )
    market_text = " ".join(
        lower_text(market.get(field)) for field in ("question", "slug", "description")
    )
    text = event_text + " " + market_text
    tags = event_tag_text(event)
    signals = (
        series_slug.startswith("btc-up-or-down"),
        mentions_btc(text) and "up" in text and "down" in text,
        mentions_btc(tags) and "up-or-down" in tags,
    )
    return any(signals)
def is_up_down_outcomes(outcomes: list[str]) -> bool:
    """Return True when ``outcomes`` is exactly the pair Up/Down (any case, any order)."""
    if len(outcomes) != 2:
        return False
    labels = {label.lower() for label in outcomes}
    return labels == {"up", "down"}
def normalize_market(
    *,
    event: dict[str, Any],
    market: dict[str, Any],
    page_index: int,
    event_index: int,
    market_index: int,
    fetched_at_utc: str,
    output_json_path: Path,
) -> dict[str, Any]:
    """Build the normalized record for one market of one Gamma event.

    Args:
        event: Raw event object from the Gamma response.
        market: Raw market object taken from ``event["markets"]``.
        page_index: Index of the fetched page the event came from.
        event_index: Index of the event within that page.
        market_index: Index of the market within the event.
        fetched_at_utc: Fetch timestamp stored on the record.
        output_json_path: Path of the JSON artifact, recorded in ``raw_ref``.

    Returns:
        A dict with identifiers, paired outcome/token entries, start/end
        times (normalized to UTC "Z" text when parseable, otherwise the raw
        value), status flags, the endpoint description and a ``raw_ref``
        pointing back to the raw market inside the artifact.
    """

    def as_utc_text(raw: Any) -> Any:
        # Fall back to the untouched raw value when it is not a parseable timestamp.
        parsed = parse_iso(raw)
        return iso_z(parsed) if parsed else raw

    outcomes = [str(item) for item in coerce_json_array(market.get("outcomes"))]
    token_ids = [str(item) for item in coerce_json_array(market.get("clobTokenIds"))]
    # Pair outcomes with token ids by position; extra entries on either side are dropped.
    tokens = [
        {"outcome": outcome, "token_id": token_id, "outcome_index": position}
        for position, (outcome, token_id) in enumerate(zip(outcomes, token_ids))
    ]
    start_time = (
        market.get("startDate")
        or market.get("startDateIso")
        or event.get("startDate")
        or event.get("creationDate")
    )
    end_time = market.get("endDate") or market.get("endDateIso") or event.get("endDate")
    event_slug = event.get("slug")
    endpoint_source = {
        "name": "gamma_events_bitcoin_tag",
        "method": "GET",
        "url": GAMMA_EVENTS_URL,
        "params_basis": {
            "tag_id": BTC_TAG_ID,
            "related_tags": "true",
            "active": "true",
            "closed": "false",
            "order": "endDate",
            "ascending": "true",
        },
    }
    raw_ref = {
        "artifact_path": output_json_path.as_posix(),
        "section": "raw.gamma_events_pages",
        "page_index": page_index,
        "event_index": event_index,
        "market_index": market_index,
        "json_path": f"raw.gamma_events_pages[{page_index}].response.json[{event_index}].markets[{market_index}]",
    }
    return {
        "market_name": "polymarket",
        "market_slug": market.get("slug") or event_slug,
        "event_slug": event_slug,
        "title": event.get("title") or market.get("question"),
        "question": market.get("question") or event.get("title"),
        "condition_id": market.get("conditionId"),
        "tokens": tokens,
        "outcomes": outcomes,
        "start_time_utc": as_utc_text(start_time),
        "end_time_utc": as_utc_text(end_time),
        "active": market.get("active"),
        "closed": market.get("closed"),
        "event_active": event.get("active"),
        "event_closed": event.get("closed"),
        "accepting_orders": market.get("acceptingOrders"),
        "enable_order_book": market.get("enableOrderBook"),
        "endpoint_source": endpoint_source,
        "fetched_at_utc": fetched_at_utc,
        "raw_ref": raw_ref,
    }
def rejection_reasons(
    *,
    event: dict[str, Any],
    market: dict[str, Any],
    fetched_at: dt.datetime,
    require_accepting_orders: bool,
    require_future_end: bool,
) -> list[str]:
    """List every filter that ``market`` fails; an empty list means it is accepted.

    Args:
        event: Raw event object that owns the market.
        market: Raw market object being evaluated.
        fetched_at: Reference time for the "end time is in the future" check.
        require_accepting_orders: Enforce ``market.acceptingOrders is True``.
        require_future_end: Enforce that the market (or, failing that, event)
            ``endDate`` parses and lies strictly after ``fetched_at``.

    Returns:
        Reason codes in a fixed order, one per failed check.
    """
    outcomes = [str(item) for item in coerce_json_array(market.get("outcomes"))]
    token_ids = [str(item) for item in coerce_json_array(market.get("clobTokenIds"))]
    end_time = parse_iso(market.get("endDate") or event.get("endDate"))
    # Each entry is (check failed?, reason code). Flags must be exactly
    # True/False; a missing or non-boolean value counts as a failure.
    checks = (
        (event.get("active") is not True, "event_not_active"),
        (event.get("closed") is not False, "event_closed"),
        (market.get("active") is not True, "market_not_active"),
        (market.get("closed") is not False, "market_closed"),
        (market.get("enableOrderBook") is not True, "order_book_not_enabled"),
        (
            require_accepting_orders and market.get("acceptingOrders") is not True,
            "not_accepting_orders",
        ),
        (
            require_future_end and (end_time is None or end_time <= fetched_at),
            "not_future_end",
        ),
        (not is_up_down_outcomes(outcomes), "not_up_down_outcomes"),
        (len(token_ids) != 2, "missing_two_clob_token_ids"),
        (not has_btc_up_down_evidence(event, market), "missing_btc_up_down_evidence"),
    )
    return [reason for failed, reason in checks if failed]
def discover(args: argparse.Namespace) -> dict[str, Any]:
    """Page through Gamma events and build the discovery artifact.

    Fetches up to ``args.max_pages`` pages of ``args.limit`` events, keeps
    every raw page envelope, filters each market with rejection_reasons(),
    requires a non-empty ``conditionId``, de-duplicates on it, and normalizes
    the survivors.

    Args:
        args: Parsed CLI options. Reads ``max_pages``, ``limit``, ``timeout``,
            ``allow_non_accepting_orders``, ``allow_expired``, ``output_json``
            and ``min_markets``.

    Returns:
        The discovery document: metadata, ``normalized_markets`` (sorted by
        end time then slug), the raw page envelopes and a ``summary`` whose
        ``status`` is "PASS" when at least ``min_markets`` markets survived.
    """
    started_at_utc = iso_z()
    fetched_at = utc_now()
    fetched_at_utc = iso_z(fetched_at)
    raw_pages: list[dict[str, Any]] = []
    normalized: list[dict[str, Any]] = []
    rejected_counts: dict[str, int] = {}
    warnings: list[str] = []
    seen_conditions: set[str] = set()

    def reject(reason: str) -> None:
        # Tally one rejection under the given reason code.
        rejected_counts[reason] = rejected_counts.get(reason, 0) + 1

    for page_index in range(args.max_pages):
        offset = page_index * args.limit
        params = {
            "tag_id": BTC_TAG_ID,
            "related_tags": True,
            "active": True,
            "closed": False,
            "limit": args.limit,
            "offset": offset,
            "order": "endDate",
            "ascending": True,
        }
        page = fetch_json_page(
            name=f"gamma_events_bitcoin_tag_page_{page_index}",
            url=GAMMA_EVENTS_URL,
            params=params,
            timeout_seconds=args.timeout,
        )
        # The envelope is stored even when the request failed, so the failure stays auditable.
        raw_pages.append(page)
        payload = page["response"]["json"]
        if not page["ok"]:
            warnings.append(
                f"Page {page_index} request failed with status {page['response']['status_code']}: {page['error']}"
            )
            break
        if not isinstance(payload, list):
            warnings.append(f"Page {page_index} response was not a JSON list.")
            break
        for event_index, event in enumerate(payload):
            if not isinstance(event, dict):
                reject("event_not_object")
                continue
            markets = event.get("markets") or []
            if not isinstance(markets, list) or not markets:
                reject("missing_markets")
                continue
            for market_index, market in enumerate(markets):
                if not isinstance(market, dict):
                    reject("market_not_object")
                    continue
                reasons = rejection_reasons(
                    event=event,
                    market=market,
                    fetched_at=fetched_at,
                    require_accepting_orders=not args.allow_non_accepting_orders,
                    require_future_end=not args.allow_expired,
                )
                if reasons:
                    for reason in reasons:
                        reject(reason)
                    continue
                condition_id = str(market.get("conditionId") or "")
                # A market without a condition ID cannot be referenced downstream,
                # and an empty ID would make every later ID-less market look like
                # a duplicate of the first one. Reject it under its own reason.
                if not condition_id:
                    reject("missing_condition_id")
                    continue
                if condition_id in seen_conditions:
                    reject("duplicate_condition_id")
                    continue
                seen_conditions.add(condition_id)
                normalized.append(
                    normalize_market(
                        event=event,
                        market=market,
                        page_index=page_index,
                        event_index=event_index,
                        market_index=market_index,
                        fetched_at_utc=fetched_at_utc,
                        output_json_path=args.output_json,
                    )
                )
        # A short page means the listing is exhausted.
        if len(payload) < args.limit:
            break
    normalized.sort(key=lambda item: (item.get("end_time_utc") or "", item.get("market_slug") or ""))
    if raw_pages:
        last_payload = raw_pages[-1]["response"].get("json")
        # A full final page at the page cap means more results may exist upstream.
        if isinstance(last_payload, list) and len(last_payload) == args.limit and len(raw_pages) >= args.max_pages:
            warnings.append(
                "Discovery stopped at max_pages before exhausting Gamma pagination; output is bounded to the fetched pages."
            )
    if len(normalized) < args.min_markets:
        warnings.append(
            f"Only {len(normalized)} markets passed filters; min_markets={args.min_markets}."
        )
    status = "PASS" if len(normalized) >= args.min_markets else "FAIL"
    status_reason = (
        f"Discovered {len(normalized)} active BTC up/down markets with condition IDs and two token IDs."
        if status == "PASS"
        else "Did not discover enough active BTC up/down markets with condition IDs and two token IDs."
    )
    return {
        "schema_name": "polymarket_btc_market_discovery",
        "schema_version": 1,
        "artifact_status": "valid" if status == "PASS" else "partial",
        "checkpoint_id": 3,
        "checkpoint_name": "Minimal BTC Market Discovery",
        "started_at_utc": started_at_utc,
        "ended_at_utc": iso_z(),
        "fetched_at_utc": fetched_at_utc,
        "scope": "Bounded public Gamma metadata discovery only; no order-book collector.",
        "endpoint_basis": {
            "source_checkpoint": "Checkpoint 2",
            "source_report": "reports/checkpoints/checkpoint_002_polymarket_public_sources.md",
            "endpoint": GAMMA_EVENTS_URL,
            "method": "GET",
            "base_params": {
                "tag_id": BTC_TAG_ID,
                "related_tags": True,
                "active": True,
                "closed": False,
                "limit": args.limit,
                "order": "endDate",
                "ascending": True,
            },
        },
        "filter_rules": FILTER_RULES,
        "normalized_markets": normalized,
        "raw": {
            "gamma_events_pages": raw_pages,
        },
        "summary": {
            "status": status,
            "status_reason": status_reason,
            "raw_pages_fetched": len(raw_pages),
            "raw_events_fetched": sum(
                len(page["response"].get("json") or [])
                for page in raw_pages
                if isinstance(page["response"].get("json"), list)
            ),
            "normalized_market_count": len(normalized),
            "rejected_counts": dict(sorted(rejected_counts.items())),
            "warnings": warnings,
        },
        "fake_progress_risk": "Discovery can appear successful while silently missing markets if filters rely on stale text assumptions or bounded pagination. Raw pages and rejection counts are preserved so missed-market risk can be audited.",
        "next_step": "Checkpoint 4 should use this discovery output as input for a short, raw-first order-book snapshot sample; do not claim reliability until the later 24h soak test.",
    }
def markdown_table_row(values: list[Any]) -> str:
    """Render ``values`` as one Markdown table row.

    Each value is converted with ``str()``. Newlines are replaced by spaces
    and pipe characters are backslash-escaped, because either one inside a
    cell would otherwise break the row into extra lines or columns.

    Args:
        values: Cell values in column order.

    Returns:
        A line of the form ``| a | b |``.
    """
    cells = [str(value).replace("\n", " ").replace("|", "\\|") for value in values]
    return "| " + " | ".join(cells) + " |"
def write_markdown(discovery: dict[str, Any], path: Path) -> None:
    """Write the human-readable Markdown report for a discovery document.

    Args:
        discovery: Document returned by discover().
        path: Destination file; parent directories are created as needed.
    """
    summary = discovery["summary"]
    markets = discovery["normalized_markets"]

    intro = [
        "# Polymarket BTC Markets Discovery",
        "",
        f"Artifact status: `{discovery['artifact_status']}`",
        "",
        "## Gate",
        "",
        f"Status: `{summary['status']}`",
        "",
        summary["status_reason"],
        "",
        "## Scope",
        "",
        "Bounded public Gamma metadata discovery only. No order-book collection, no trading, no private endpoints, no secrets.",
        "",
        "## Endpoint",
        "",
        f"- `GET {GAMMA_EVENTS_URL}`",
        "- Params: `tag_id=235`, `related_tags=true`, `active=true`, `closed=false`, `order=endDate`, `ascending=true`, bounded by `limit` and `max_pages`.",
        "",
    ]

    metrics_table = [
        "## Summary",
        "",
        markdown_table_row(["Metric", "Value"]),
        markdown_table_row(["---", "---"]),
        markdown_table_row(["fetched_at_utc", discovery["fetched_at_utc"]]),
    ]
    for metric in ("raw_pages_fetched", "raw_events_fetched", "normalized_market_count"):
        metrics_table.append(markdown_table_row([metric, summary[metric]]))
    metrics_table.append("")

    market_columns = [
        "market_slug",
        "end_time_utc",
        "condition_id",
        "outcomes",
        "token_ids",
        "accepting_orders",
    ]
    markets_table = [
        "## Markets",
        "",
        markdown_table_row(market_columns),
        markdown_table_row(["---", "---", "---", "---", "---", "---"]),
    ]
    for record in markets:
        token_ids = [token["token_id"] for token in record["tokens"]]
        cells = [
            record.get("market_slug"),
            record.get("end_time_utc"),
            record.get("condition_id"),
            json.dumps(record.get("outcomes")),
            json.dumps(token_ids),
            record.get("accepting_orders"),
        ]
        markets_table.append(markdown_table_row(cells))

    warning_items = [f"- {warning}" for warning in summary["warnings"]] if summary["warnings"] else ["- None."]
    warnings_section = ["", "## Warnings", "", *warning_items]

    closing = [
        "",
        "## Rejection Counts",
        "",
        "```json",
        json.dumps(summary["rejected_counts"], indent=2, sort_keys=True),
        "```",
        "",
        "## Raw Preservation",
        "",
        "The latest JSON artifact stores raw Gamma response envelopes under `raw.gamma_events_pages`. Each normalized record has a `raw_ref` pointing back to the source event market.",
        "",
        "## Strongest Fake-Progress Risk",
        "",
        discovery["fake_progress_risk"],
        "",
        "## Next Smallest Step",
        "",
        discovery["next_step"],
        "",
    ]

    lines = intro + metrics_table + markets_table + warnings_section + closing
    path.parent.mkdir(parents=True, exist_ok=True)
    # The trailing "" entry makes the joined text end with a newline.
    path.write_text("\n".join(lines), encoding="utf-8")
def write_manifest(
    *,
    discovery: dict[str, Any],
    manifest_path: Path,
    output_json: Path,
    markdown_path: Path,
    command: str,
) -> None:
    """Write the JSON manifest that summarizes one discovery run.

    The manifest records the run status, request and row counts, the
    discovered market identifiers, and the path and SHA-256 of each output
    file (plus the discovery script when it exists at its expected relative
    path).

    Args:
        discovery: Document returned by discover().
        manifest_path: Destination of the manifest; parents are created.
        output_json: Path of the JSON artifact, expected to be written already.
        markdown_path: Path of the Markdown report, expected to be written already.
        command: Command line recorded for reproducibility.
    """

    def describe_output(target: Path, kind: str) -> dict[str, Any]:
        # "valid" requires the file to exist and be non-empty; the hash is
        # recorded whenever the file exists.
        present = target.exists()
        return {
            "path": target.as_posix(),
            "kind": kind,
            "status": "valid" if present and target.stat().st_size else "missing",
            "sha256": sha256_file(target) if present else None,
        }

    summary = discovery["summary"]
    status = summary["status"]
    output_files = [
        describe_output(output_json, "latest_discovery_json"),
        describe_output(markdown_path, "discovery_markdown"),
    ]
    script_path = Path("scripts/discover_polymarket_btc_markets.py")
    if script_path.exists():
        output_files.append(
            {
                "path": script_path.as_posix(),
                "kind": "discovery_script",
                "status": "valid",
                "sha256": sha256_file(script_path),
            }
        )

    # Count responses per HTTP status; a failed request is tallied under "None".
    status_codes: dict[str, int] = {}
    for page in discovery["raw"]["gamma_events_pages"]:
        code = str(page["response"].get("status_code"))
        status_codes[code] = 1 + status_codes.get(code, 0)

    market_ids = [
        {
            "market_slug": record.get("market_slug"),
            "condition_id": record.get("condition_id"),
            "token_ids": [token.get("token_id") for token in record.get("tokens", [])],
        }
        for record in discovery["normalized_markets"]
    ]
    required_record_fields = [
        "market_name",
        "market_slug",
        "question",
        "condition_id",
        "tokens",
        "outcomes",
        "start_time_utc",
        "end_time_utc",
        "active",
        "closed",
        "accepting_orders",
        "enable_order_book",
        "endpoint_source",
        "fetched_at_utc",
        "raw_ref",
    ]
    manifest = {
        "schema_name": "polymarket_btc_markets_manifest",
        "schema_version": 1,
        "checkpoint_id": 3,
        "checkpoint_name": "Minimal BTC Market Discovery",
        "status": status,
        "started_at_utc": discovery["started_at_utc"],
        "ended_at_utc": discovery["ended_at_utc"],
        "scope": discovery["scope"],
        "command": command,
        "endpoint": discovery["endpoint_basis"],
        "request_counts": {
            "gamma_events_pages": summary["raw_pages_fetched"],
            "status_code_counts": dict(sorted(status_codes.items())),
        },
        "row_counts": {
            "raw_events_fetched": summary["raw_events_fetched"],
            "normalized_markets": summary["normalized_market_count"],
        },
        "market_ids": market_ids,
        "output_files": output_files,
        "warnings": summary["warnings"],
        "validation": {
            "summary": summary["status_reason"],
            "required_record_fields": required_record_fields,
        },
        "fake_progress_risk": discovery["fake_progress_risk"],
        "next_step": discovery["next_step"],
    }
    manifest_path.parent.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(manifest, indent=2, sort_keys=True) + "\n"
    manifest_path.write_text(serialized, encoding="utf-8")
def write_outputs(args: argparse.Namespace, discovery: dict[str, Any]) -> None:
    """Persist the discovery document as JSON, Markdown and manifest.

    The JSON artifact and Markdown report are written first so that
    write_manifest() can hash the files on disk.

    Args:
        args: Parsed CLI options; reads ``output_json``, ``markdown`` and ``manifest``.
        discovery: Document returned by discover().
    """
    json_target = args.output_json
    json_target.parent.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(discovery, indent=2, sort_keys=True) + "\n"
    json_target.write_text(serialized, encoding="utf-8")

    write_markdown(discovery, args.markdown)

    # Record the invocation as typed, with the script path in POSIX form.
    argv = [Path(sys.argv[0]).as_posix()] + list(sys.argv[1:])
    write_manifest(
        discovery=discovery,
        manifest_path=args.manifest,
        output_json=json_target,
        markdown_path=args.markdown,
        command=" ".join(argv),
    )
def parse_args() -> argparse.Namespace:
    """Parse the discovery script's command-line options from ``sys.argv``.

    Returns:
        Namespace with ``output_json``, ``manifest``, ``markdown`` (paths),
        ``limit``, ``max_pages``, ``min_markets`` (ints), ``timeout`` (float)
        and the boolean switches ``allow_expired`` and
        ``allow_non_accepting_orders``.
    """
    parser = argparse.ArgumentParser(
        description="Discover active BTC up/down Polymarket markets from public Gamma metadata."
    )
    # Options are registered in a fixed order so the generated --help text stays stable.
    valued_options = (
        ("--output-json", Path, DEFAULT_OUTPUT_JSON),
        ("--manifest", Path, DEFAULT_MANIFEST),
        ("--markdown", Path, DEFAULT_MARKDOWN),
        ("--limit", int, 100),
        ("--max-pages", int, 3),
        ("--timeout", float, 15.0),
        ("--min-markets", int, 1),
    )
    for flag, converter, fallback in valued_options:
        parser.add_argument(flag, type=converter, default=fallback)
    for switch in ("--allow-expired", "--allow-non-accepting-orders"):
        parser.add_argument(switch, action="store_true")
    return parser.parse_args()
def main() -> int:
    """Run discovery, write all artifacts and print a JSON run summary.

    Returns:
        Process exit code: 0 when the discovery status is "PASS", otherwise 1.
    """
    args = parse_args()
    discovery = discover(args)
    write_outputs(args, discovery)

    summary = discovery["summary"]
    market_briefs = [
        {
            "market_slug": record.get("market_slug"),
            "condition_id": record.get("condition_id"),
            "token_ids": [token.get("token_id") for token in record.get("tokens", [])],
            "end_time_utc": record.get("end_time_utc"),
        }
        for record in discovery["normalized_markets"]
    ]
    report = {
        "status": summary["status"],
        "status_reason": summary["status_reason"],
        "output_json": args.output_json.as_posix(),
        "manifest": args.manifest.as_posix(),
        "markdown": args.markdown.as_posix(),
        "normalized_market_count": summary["normalized_market_count"],
        "markets": market_briefs,
        "warnings": summary["warnings"],
    }
    print(json.dumps(report, indent=2, sort_keys=True))
    if summary["status"] == "PASS":
        return 0
    return 1


if __name__ == "__main__":
    raise SystemExit(main())

View file

@ -0,0 +1,466 @@
#!/usr/bin/env bash
# Shell options: -u makes expanding an unset variable an error and pipefail makes
# a pipeline fail when any stage fails. errexit (-e) is NOT enabled, so a failing
# command does not abort the script on its own.
set -uo pipefail
# Defaults. Each value can be overridden through the named environment variable
# and, further down, through the matching command-line option.
NAMESPACE="${ORDERBOOKS_K8S_NAMESPACE:-orderbooks}"
DEPLOYMENT="${ORDERBOOKS_K8S_COLLECTOR_DEPLOYMENT:-orderbooks-collector}"
CRONJOB="${ORDERBOOKS_K8S_UPLOADER_CRONJOB:-orderbooks-uploader}"
RAW_DIR="${ORDERBOOKS_K8S_RAW_DIR:-/var/lib/orderbooks/raw_orderbooks}"
MANIFEST_DIR="${ORDERBOOKS_K8S_MANIFEST_DIR:-/var/lib/orderbooks/manifests}"
WAIT_SECONDS="${ORDERBOOKS_K8S_SMOKE_WAIT_SECONDS:-1200}"
UPLOAD_MIN_AGE_SECONDS="${ORDERBOOKS_UPLOAD_MIN_AGE_SECONDS:-600}"
KUBECTL_BIN="${ORDERBOOKS_KUBECTL:-kubectl}"
# UTC timestamp of this invocation; used to give the default evidence file a unique name.
RUN_ID="$(date -u +%Y%m%dT%H%M%SZ)"
EVIDENCE_PATH="${ORDERBOOKS_K8S_SMOKE_EVIDENCE_PATH:-data/manifests/k8s_runtime_smoke_${RUN_ID}.json}"
# Print the help text to stdout. The heredoc delimiter is quoted, so the text is
# emitted literally with no variable or command expansion; the defaults shown
# are therefore hard-coded in the text and must be kept in sync by hand.
usage() {
cat <<'EOF'
Usage: scripts/k8s_runtime_smoke_check.sh [options]
Run after the orderbooks Kubernetes workload is deployed. The script uses
kubectl, writes local JSON evidence, deletes one collector pod to force a
Deployment restart, verifies raw gzip JSONL files and manifests on the PVC,
then triggers the uploader CronJob and requires a verified upload manifest.
Options:
--namespace NAME Namespace. Default: orderbooks.
--deployment NAME Collector deployment. Default: orderbooks-collector.
--cronjob NAME Uploader CronJob. Default: orderbooks-uploader.
--raw-dir PATH Raw path inside collector pod. Default: /var/lib/orderbooks/raw_orderbooks.
--manifest-dir PATH Manifest path inside collector pod. Default: /var/lib/orderbooks/manifests.
--wait-seconds N Max wait for collector/upload evidence. Default: 1200.
--upload-min-age-seconds N
Wait for at least one raw/manifest file to be this old before upload. Default: 600.
--evidence-path PATH Local JSON evidence path.
--kubectl PATH kubectl binary. Default: kubectl.
--help Show this help.
This script does not read or print rclone config contents.
EOF
}
# Abort with a usage error when a value-taking option is the last argument.
# Called as `require_value "$@"`, so $1 is the option and $2 its value.
# Without this guard, `set -u` turns the later "$2" expansion into a cryptic
# "unbound variable" failure.
require_value() {
  if [[ $# -lt 2 ]]; then
    echo "Missing value for $1" >&2
    usage >&2
    exit 2
  fi
}

# Parse command-line options; each one overrides the matching default above.
# Unknown arguments print the usage text to stderr and exit with status 2.
while [[ $# -gt 0 ]]; do
  case "$1" in
    --namespace) require_value "$@"; NAMESPACE="$2"; shift 2 ;;
    --deployment) require_value "$@"; DEPLOYMENT="$2"; shift 2 ;;
    --cronjob) require_value "$@"; CRONJOB="$2"; shift 2 ;;
    --raw-dir) require_value "$@"; RAW_DIR="$2"; shift 2 ;;
    --manifest-dir) require_value "$@"; MANIFEST_DIR="$2"; shift 2 ;;
    --wait-seconds) require_value "$@"; WAIT_SECONDS="$2"; shift 2 ;;
    --upload-min-age-seconds) require_value "$@"; UPLOAD_MIN_AGE_SECONDS="$2"; shift 2 ;;
    --evidence-path) require_value "$@"; EVIDENCE_PATH="$2"; shift 2 ;;
    --kubectl) require_value "$@"; KUBECTL_BIN="$2"; shift 2 ;;
    --help) usage; exit 0 ;;
    *) echo "Unknown argument: $1" >&2; usage >&2; exit 2 ;;
  esac
done
# Create the directory that will hold the evidence file (presumably written by
# the Python driver below; the write itself is outside this view).
mkdir -p "$(dirname "${EVIDENCE_PATH}")"
# Run the embedded Python driver from stdin. The nine positional arguments
# become sys.argv[1]..sys.argv[9] in that exact order, so any change here must
# be mirrored in the driver's argv unpacking. The quoted heredoc delimiter keeps
# the shell from expanding anything inside the Python source.
PYTHONDONTWRITEBYTECODE=1 python3 - "$KUBECTL_BIN" "$NAMESPACE" "$DEPLOYMENT" "$CRONJOB" "$RAW_DIR" "$MANIFEST_DIR" "$WAIT_SECONDS" "$UPLOAD_MIN_AGE_SECONDS" "$EVIDENCE_PATH" <<'PY_SMOKE'
import datetime as dt
import json
import subprocess
import sys
import time
from pathlib import Path
# Positional arguments supplied by the wrapping shell script, in this order.
kubectl = sys.argv[1]
namespace = sys.argv[2]
deployment = sys.argv[3]
cronjob = sys.argv[4]
raw_dir = sys.argv[5]
manifest_dir = sys.argv[6]
# Both durations must be integer seconds; int() raises ValueError otherwise.
wait_seconds = int(sys.argv[7])
upload_min_age_seconds = int(sys.argv[8])
evidence_path = Path(sys.argv[9])
# Start time of the run as second-precision UTC text ending in "Z".
started_at = dt.datetime.now(dt.UTC).replace(microsecond=0).isoformat().replace('+00:00', 'Z')
# Shared run state: capture() appends one record per executed command to `checks`.
checks = []
failures = []
def iso_now():
return dt.datetime.now(dt.UTC).replace(microsecond=0).isoformat().replace('+00:00', 'Z')
def capture(command, input_text=None, timeout=None):
    """Run ``command`` and log the outcome in the module-level ``checks`` list.

    Args:
        command: Argument list passed to ``subprocess.run`` (no shell).
        input_text: Optional text fed to the process on stdin.
        timeout: Optional timeout in seconds; ``subprocess.run`` raises
            ``subprocess.TimeoutExpired`` when it elapses, in which case
            nothing is logged.

    Returns:
        ``(proc, item)``: the CompletedProcess and the logged record, which
        keeps the exit code and the last 6000 characters of stdout and stderr.
    """
    proc = subprocess.run(
        command,
        input=input_text,
        text=True,
        capture_output=True,
        timeout=timeout,
    )
    item = {
        'command': command,
        'exit_code': proc.returncode,
        'stdout_tail': proc.stdout[-6000:],
        'stderr_tail': proc.stderr[-6000:],
        'ran_at_utc': iso_now(),
    }
    checks.append(item)
    return proc, item
def run(command, input_text=None, timeout=None):
    """Run ``command`` through capture() and return only the logged record."""
    outcome = capture(command, input_text=input_text, timeout=timeout)
    return outcome[1]
def run_json(command, input_text=None, timeout=None):
    """Run ``command`` and return its stdout parsed as JSON.

    Raises:
        RuntimeError: When the command exits with a non-zero status.
        json.JSONDecodeError: When stdout is not valid JSON.
    """
    proc, record = capture(command, input_text=input_text, timeout=timeout)
    if record['exit_code'] == 0:
        # Parse the full stdout, not the truncated tail kept in the record.
        return json.loads(proc.stdout)
    raise RuntimeError(f"command failed: {' '.join(command)}")
def pod_ready(pod):
    """Return True when ``pod`` is running, fully ready and not being deleted.

    Args:
        pod: Pod object as returned by ``kubectl get pods -o json``.

    Returns:
        True only if the pod has no ``metadata.deletionTimestamp``, its phase
        is "Running" and every reported container status is ready. A pod with
        no container statuses is not considered ready.
    """
    # A pod that is terminating keeps phase "Running" and ready containers
    # until it is actually stopped. Right after the restart test deletes the
    # collector pod, that old pod must not be picked as the recovered one.
    if (pod.get('metadata') or {}).get('deletionTimestamp'):
        return False
    status = pod.get('status', {})
    if status.get('phase') != 'Running':
        return False
    containers = status.get('containerStatuses') or []
    return bool(containers) and all(entry.get('ready') for entry in containers)
def get_collector_pod():
    """Poll until a ready collector pod exists and return the newest one.

    Pods are selected by the fixed collector label selector in the configured
    namespace and polled every 10 seconds for up to ``wait_seconds``.

    Returns:
        ``(name, pod)`` for the ready pod with the latest creationTimestamp.

    Raises:
        TimeoutError: When no ready pod appears before the deadline.
    """
    selector = 'app.kubernetes.io/name=orderbooks,app.kubernetes.io/component=collector'
    list_command = [kubectl, '-n', namespace, 'get', 'pods', '-l', selector, '-o', 'json']
    deadline = time.time() + wait_seconds
    last = None
    while time.time() <= deadline:
        items = run_json(list_command).get('items', [])
        # Stable sort by creation time, so the newest ready pod ends up last.
        ready = sorted(
            (pod for pod in items if pod_ready(pod)),
            key=lambda pod: pod.get('metadata', {}).get('creationTimestamp', ''),
        )
        if ready:
            newest = ready[-1]
            return newest['metadata']['name'], newest
        last = items
        time.sleep(10)
    raise TimeoutError(f'no ready collector pod found; last pods={last}')
def exec_python(pod, code, args):
    """Run Python source inside ``pod`` via ``kubectl exec`` and parse its JSON output.

    Args:
        pod: Name of the target pod.
        code: Python source, sent on stdin to ``python3 -`` in the pod.
        args: Extra string arguments, visible to the script as ``sys.argv[1:]``.

    Returns:
        The script's stdout decoded as JSON.

    Raises:
        RuntimeError: When the remote command exits non-zero. Scripts that
            signal "not valid" through their exit status surface here too.
    """
    command = [kubectl, '-n', namespace, 'exec', '-i', pod, '--', 'python3', '-']
    command.extend(args)
    proc, record = capture(command, input_text=code, timeout=wait_seconds + 60)
    if record['exit_code'] == 0:
        return json.loads(proc.stdout)
    raise RuntimeError(f"pod python command failed in {pod}: {record['stderr_tail']}")
def wait_for_valid_collector(pod, after_mtime, label):
    """Poll the pod until a valid collector manifest newer than ``after_mtime`` exists.

    The validation script is run in the pod every 15 seconds for up to
    ``wait_seconds``. Any exception from a single attempt is remembered and
    the loop keeps polling.

    Args:
        pod: Name of the collector pod to inspect.
        after_mtime: Only manifests modified after this timestamp are considered.
        label: Tag stored on the result as ``wait_label`` and used in the timeout message.

    Returns:
        The validation result dict of the first valid manifest found.

    Raises:
        TimeoutError: When no valid manifest shows up before the deadline.
    """
    script_args = [manifest_dir, raw_dir, str(after_mtime)]
    deadline = time.time() + wait_seconds
    last_error = None
    while time.time() <= deadline:
        try:
            outcome = exec_python(pod, collector_validation_code, script_args)
            if not outcome.get('valid'):
                last_error = outcome
            else:
                outcome['wait_label'] = label
                return outcome
        except Exception as exc:
            last_error = repr(exc)
        time.sleep(15)
    raise TimeoutError(f'no valid {label} collector manifest found before timeout: {last_error}')
def wait_for_upload_eligible_files(pod):
    """Poll the pod until raw and manifest files old enough for upload exist.

    The eligibility script is run every 15 seconds for up to ``wait_seconds``.
    Unlike wait_for_valid_collector(), a failing remote command is not
    retried: the exception from exec_python() propagates.

    Returns:
        The eligibility result dict once it reports ``eligible``.

    Raises:
        TimeoutError: When nothing becomes eligible before the deadline.
    """
    script_args = [raw_dir, manifest_dir, str(upload_min_age_seconds)]
    deadline = time.time() + wait_seconds
    last = None
    while time.time() <= deadline:
        outcome = exec_python(pod, upload_eligibility_code, script_args)
        if outcome.get('eligible'):
            return outcome
        last = outcome
        time.sleep(15)
    raise TimeoutError(f'no upload-eligible raw/manifest files before timeout: {last}')
# Python source that wait_for_valid_collector() sends into the collector pod.
# Arguments: manifest directory, raw directory, minimum manifest mtime.
# It examines polymarket_orderbook_collector_*.json manifests modified after
# that mtime, newest first, re-reading every listed raw gzip file to compare
# row count, SHA-256 and location against the manifest. The first valid result
# is printed as JSON with exit status 0. Otherwise it prints the last candidate
# examined (or a "no candidates" error) and exits with status 2.
# The text between the triple quotes is runtime data; do not reformat it.
collector_validation_code = r'''
import gzip
import hashlib
import json
import sys
from pathlib import Path
manifest_dir = Path(sys.argv[1])
raw_dir = Path(sys.argv[2])
after_mtime = float(sys.argv[3])
def sha256(path):
digest = hashlib.sha256()
with path.open('rb') as handle:
for chunk in iter(lambda: handle.read(1024 * 1024), b''):
digest.update(chunk)
return digest.hexdigest()
def parse_raw(path):
rows = 0
first_keys = []
with gzip.open(path, 'rt', encoding='utf-8') as handle:
for line in handle:
if not line.strip():
continue
obj = json.loads(line)
if rows == 0:
first_keys = sorted(obj.keys())
rows += 1
return rows, first_keys
def validate(path):
manifest = json.loads(path.read_text(encoding='utf-8'))
output_files = []
for item in manifest.get('output_files', []):
raw_path = Path(item['path'])
rows, first_keys = parse_raw(raw_path)
actual_sha = sha256(raw_path)
output_files.append({
'path': str(raw_path),
'bytes': raw_path.stat().st_size,
'mtime': raw_path.stat().st_mtime,
'manifest_rows': item.get('rows'),
'rows_parsed': rows,
'row_count_matches_manifest': rows == item.get('rows'),
'manifest_sha256': item.get('sha256'),
'actual_sha256': actual_sha,
'sha256_matches_manifest': actual_sha == item.get('sha256'),
'under_raw_dir': raw_path.resolve().is_relative_to(raw_dir.resolve()),
'first_row_keys': first_keys,
})
valid = (
manifest.get('gate_status') == 'PASS'
and manifest.get('rows_written', 0) > 0
and manifest.get('failure_count') == 0
and not manifest.get('failures')
and bool(output_files)
and all(item['rows_parsed'] > 0 and item['row_count_matches_manifest'] and item['sha256_matches_manifest'] and item['under_raw_dir'] for item in output_files)
)
return {
'path': str(path),
'mtime': path.stat().st_mtime,
'manifest_summary': {
'gate_status': manifest.get('gate_status'),
'rows_written': manifest.get('rows_written'),
'failure_count': manifest.get('failure_count'),
'failures_present': bool(manifest.get('failures')),
'output_file_count': len(manifest.get('output_files', [])),
'started_at_utc': manifest.get('started_at_utc'),
'ended_at_utc': manifest.get('ended_at_utc'),
},
'output_files': output_files,
'valid': valid,
}
candidates = sorted(manifest_dir.glob('polymarket_orderbook_collector_*.json'), key=lambda p: p.stat().st_mtime)
candidates = [path for path in candidates if path.stat().st_mtime > after_mtime]
latest = None
for path in reversed(candidates):
try:
result = validate(path)
except Exception as exc:
latest = {'path': str(path), 'valid': False, 'error': repr(exc)}
continue
latest = result
if result['valid']:
print(json.dumps(result, sort_keys=True))
sys.exit(0)
print(json.dumps(latest or {'valid': False, 'error': 'no collector manifest candidates'}, sort_keys=True))
sys.exit(2)
'''
# Python source meant to be run inside a pod through exec_python().
# Arguments: raw file path, expected SHA-256, expected row count.
# It re-reads the gzip JSONL file, counts the non-blank lines (each must parse
# as JSON), hashes the file and prints a JSON comparison against the expected
# values. It sets no exit status of its own, so callers must inspect the
# "sha256_matches" and "row_count_matches" fields.
# The text between the triple quotes is runtime data; do not reformat it.
raw_check_code = r'''
import gzip
import hashlib
import json
import sys
from pathlib import Path
path = Path(sys.argv[1])
expected_sha = sys.argv[2]
expected_rows = int(sys.argv[3])
def sha256(path):
digest = hashlib.sha256()
with path.open('rb') as handle:
for chunk in iter(lambda: handle.read(1024 * 1024), b''):
digest.update(chunk)
return digest.hexdigest()
rows = 0
with gzip.open(path, 'rt', encoding='utf-8') as handle:
for line in handle:
if line.strip():
json.loads(line)
rows += 1
actual_sha = sha256(path)
print(json.dumps({
'path': str(path),
'expected_sha256': expected_sha,
'actual_sha256': actual_sha,
'sha256_matches': actual_sha == expected_sha,
'expected_rows': expected_rows,
'actual_rows': rows,
'row_count_matches': rows == expected_rows,
}, sort_keys=True))
'''
# Python source meant to be run inside a pod through exec_python().
# Arguments: manifest directory, minimum manifest mtime.
# It picks the newest upload_archive_*.json whose mtime is at or after the given
# value and treats it as valid only when it reports operation_status
# UPLOAD_VERIFIED, gate_status PASS, zero rclone copy and check exit codes, and
# at least one verified file. A JSON summary is printed; the script exits with
# status 2 when there is no candidate or the manifest is not valid.
# The text between the triple quotes is runtime data; do not reformat it.
upload_validation_code = r'''
import json
import sys
from pathlib import Path
manifest_dir = Path(sys.argv[1])
after_mtime = float(sys.argv[2])
candidates = sorted(manifest_dir.glob('upload_archive_*.json'), key=lambda p: p.stat().st_mtime)
candidates = [path for path in candidates if path.stat().st_mtime >= after_mtime]
if not candidates:
print(json.dumps({'valid': False, 'error': 'no upload manifest candidates'}, sort_keys=True))
sys.exit(2)
path = candidates[-1]
manifest = json.loads(path.read_text(encoding='utf-8'))
verified_count = manifest.get('counts', {}).get('verified', len(manifest.get('verified_files', [])))
valid = (
manifest.get('operation_status') == 'UPLOAD_VERIFIED'
and manifest.get('gate_status') == 'PASS'
and manifest.get('rclone', {}).get('copy_exit_code') == 0
and manifest.get('rclone', {}).get('check_exit_code') == 0
and verified_count > 0
)
verified_files = manifest.get('verified_files', [])
print(json.dumps({
'path': str(path),
'mtime': path.stat().st_mtime,
'manifest_summary': {
'operation_status': manifest.get('operation_status'),
'gate_status': manifest.get('gate_status'),
'counts': manifest.get('counts', {}),
'planned_file_count': len(manifest.get('planned_files', [])),
'attempted_file_count': len(manifest.get('attempted_files', [])),
'uploaded_file_count': len(manifest.get('uploaded_files', [])),
'verified_file_count': verified_count,
'rclone_copy_exit_code': manifest.get('rclone', {}).get('copy_exit_code'),
'rclone_check_exit_code': manifest.get('rclone', {}).get('check_exit_code'),
'started_at_utc': manifest.get('started_at_utc'),
'ended_at_utc': manifest.get('ended_at_utc'),
},
'verified_count': verified_count,
'verified_file_samples': [
{
'relative_path': item.get('relative_path'),
'bytes': item.get('bytes'),
'sha256': item.get('sha256'),
'kind': item.get('kind'),
}
for item in verified_files[:5]
],
'valid': valid,
}, sort_keys=True))
if not valid:
sys.exit(2)
'''
# Python source that wait_for_upload_eligible_files() sends into the collector pod.
# Arguments: raw directory, manifest directory, minimum age in seconds.
# It recursively lists *.jsonl.gz files under the raw directory and
# polymarket_orderbook_collector_*.json files under the manifest directory whose
# mtime is at least the minimum age old. "eligible" is true only when both lists
# are non-empty. Only counts and up to three samples per list are printed.
# The text between the triple quotes is runtime data; do not reformat it.
upload_eligibility_code = r'''
import json
import sys
import time
from pathlib import Path
raw_dir = Path(sys.argv[1])
manifest_dir = Path(sys.argv[2])
min_age_seconds = int(sys.argv[3])
now = time.time()
def eligible_files(root, pattern):
if not root.exists():
return []
items = []
for path in sorted(root.rglob(pattern)):
if not path.is_file():
continue
age = max(0, int(now - path.stat().st_mtime))
if age >= min_age_seconds:
items.append({'path': str(path), 'bytes': path.stat().st_size, 'age_seconds': age})
return items
raw_files = eligible_files(raw_dir, '*.jsonl.gz')
manifest_files = eligible_files(manifest_dir, 'polymarket_orderbook_collector_*.json')
print(json.dumps({
'eligible': bool(raw_files) and bool(manifest_files),
'min_age_seconds': min_age_seconds,
'raw_eligible_count': len(raw_files),
'manifest_eligible_count': len(manifest_files),
'raw_sample': raw_files[:3],
'manifest_sample': manifest_files[:3],
}, sort_keys=True))
'''
# Evidence record written to evidence_path at the end of the run.
# gate_status starts as 'ERROR' and is only replaced by 'PASS' or 'FAIL' in the
# try/except below, so an exit path that sets neither is still recorded as not
# passing. 'checks' and 'failures' are the shared lists defined earlier; they
# are stored by reference, so later appends show up in the written evidence.
summary = {
'schema_name': 'k8s_runtime_smoke_result',
'schema_version': 1,
'started_at_utc': started_at,
'ended_at_utc': None,
'gate_status': 'ERROR',
'production_ready': False,
'namespace': namespace,
'deployment': deployment,
'cronjob': cronjob,
'raw_dir': raw_dir,
'manifest_dir': manifest_dir,
'upload_min_age_seconds': upload_min_age_seconds,
'checks': checks,
'failures': failures,
}
# Smoke sequence: healthy rollout -> valid collector output -> pod restart ->
# old data intact and new data produced -> one-off uploader job -> verified
# upload manifest. Any failed step raises and is recorded as gate_status FAIL.
try:
# Step 1: the collector deployment must report a completed rollout.
rollout = run([kubectl, '-n', namespace, 'rollout', 'status', f'deployment/{deployment}', f'--timeout={wait_seconds}s'])
if rollout['exit_code'] != 0:
raise RuntimeError('collector deployment rollout is not healthy')
pod_name, pod_obj = get_collector_pod()
# Step 2: baseline collector state before the restart. The result must carry
# 'mtime' and a non-empty 'output_files' list (both are indexed just below).
before = wait_for_valid_collector(pod_name, 0, 'initial')
before_mtime = before['mtime']
old_file = before['output_files'][0]
# Step 3: delete the pod without waiting, then rely on rollout status to
# confirm the deployment brought a replacement up.
delete_pod = run([kubectl, '-n', namespace, 'delete', 'pod', pod_name, '--wait=false'])
if delete_pod['exit_code'] != 0:
raise RuntimeError('failed to delete collector pod for restart test')
rollout_after = run([kubectl, '-n', namespace, 'rollout', 'status', f'deployment/{deployment}', f'--timeout={wait_seconds}s'])
if rollout_after['exit_code'] != 0:
raise RuntimeError('collector deployment did not recover after pod delete')
new_pod, new_pod_obj = get_collector_pod()
# Step 4: the first raw file seen before the restart must still have the same
# sha256 and row count when read from the replacement pod.
old_check = exec_python(new_pod, raw_check_code, [old_file['path'], old_file['actual_sha256'], str(old_file['rows_parsed'])])
if not old_check.get('sha256_matches') or not old_check.get('row_count_matches'):
raise RuntimeError('old raw file changed or stopped parsing after pod restart')
# Step 5: presumably waits for collector output newer than before_mtime --
# confirm against wait_for_valid_collector.
after = wait_for_valid_collector(new_pod, before_mtime, 'post_restart')
upload_eligibility = wait_for_upload_eligible_files(new_pod)
# Lower bound (with 2 seconds of slack) for the mtime of an upload manifest
# that counts as produced by this run.
# NOTE(review): this timestamp is taken on the machine running the smoke test
# while manifest mtimes come from the pod filesystem -- confirm clock skew is
# not a concern.
upload_start_mtime = time.time() - 2
# Step 6: run the uploader once as a Job created from the CronJob template.
# The timestamp suffix is lower-cased, presumably because Kubernetes object
# names must be lowercase.
job_name = 'orderbooks-uploader-smoke-' + dt.datetime.now(dt.UTC).strftime('%Y%m%dt%H%M%Sz').lower()
run([kubectl, '-n', namespace, 'delete', 'job', job_name, '--ignore-not-found=true'])
create_job = run([kubectl, '-n', namespace, 'create', 'job', job_name, f'--from=cronjob/{cronjob}'])
if create_job['exit_code'] != 0:
raise RuntimeError('failed to create uploader smoke job from CronJob')
wait_upload = run([kubectl, '-n', namespace, 'wait', '--for=condition=Complete', f'--timeout={wait_seconds}s', f'job/{job_name}'])
# Logs are fetched before the wait result is checked, so the logs command
# runs even when the job did not complete.
logs = run([kubectl, '-n', namespace, 'logs', f'job/{job_name}'])
if wait_upload['exit_code'] != 0:
raise RuntimeError('uploader smoke job did not complete')
# Step 7: validate the newest upload manifest written since upload_start_mtime.
upload = exec_python(new_pod, upload_validation_code, [manifest_dir, str(upload_start_mtime)])
if not upload.get('valid'):
raise RuntimeError('upload manifest did not verify at least one file')
# Detailed results are only attached on full success; on failure the evidence
# file carries the exception text plus the shared checks/failures lists.
summary.update({
'initial_collector_pod': pod_name,
'post_restart_collector_pod': new_pod,
'before_restart_collector': before,
'old_raw_file_after_restart': old_check,
'after_restart_collector': after,
'upload_eligibility': upload_eligibility,
'uploader_job': job_name,
'upload_result': upload,
'uploader_log_check_exit_code': logs['exit_code'],
})
summary['gate_status'] = 'PASS'
except Exception as exc:
failures.append(str(exc))
summary['exception'] = repr(exc)
summary['gate_status'] = 'FAIL'
finally:
# Always persist the evidence file and print its location and the gate, then
# exit non-zero unless the gate is PASS.
summary['ended_at_utc'] = iso_now()
evidence_path.parent.mkdir(parents=True, exist_ok=True)
evidence_path.write_text(json.dumps(summary, indent=2, sort_keys=True) + '\n', encoding='utf-8')
print(f'K8S_SMOKE_EVIDENCE={evidence_path}')
print(f'K8S_SMOKE_GATE={summary["gate_status"]}')
if summary['gate_status'] != 'PASS':
sys.exit(1)
PY_SMOKE

View file

@ -0,0 +1,496 @@
#!/usr/bin/env python3
"""Normalize raw Polymarket order-book snapshots from the sample collector.
Checkpoint 5 scope: derive a bounded normalized gzip JSONL sample from the raw
Checkpoint 4 sample. Raw files remain the source of truth; every normalized row
keeps the raw file path and gzip JSONL line number.
"""
from __future__ import annotations
import argparse
import datetime as dt
import gzip
import hashlib
import json
import sys
from decimal import Decimal, InvalidOperation, getcontext
from pathlib import Path
from typing import Any
# Identity of this tool and of the row schema it emits; both are copied into
# the normalization manifest and into every normalized row.
NORMALIZER_NAME = "polymarket_orderbook_normalizer"
NORMALIZER_VERSION = "0.1.0"
SCHEMA_NAME = "normalized_orderbook_snapshot"
SCHEMA_VERSION = 1
# Default CLI locations (relative paths are resolved against the working directory).
DEFAULT_INPUT_MANIFEST = Path("data/manifests/orderbook_collector_sample_manifest.json")
DEFAULT_OUTPUT_DIR = Path("data/normalized_sample")
DEFAULT_MANIFEST_PATH = Path("data/manifests/orderbook_normalization_sample_manifest.json")
# Price distances from the best bid/ask used for the "depth within N cents"
# columns; the key becomes the column-name suffix.
CENT_OFFSETS = {
"1c": Decimal("0.01"),
"2c": Decimal("0.02"),
"5c": Decimal("0.05"),
}
# Case-insensitive substrings that must not appear in scanned files.
# Each term is written as adjacent string literals (Python joins them at compile
# time) so the complete term never appears contiguously in this source file:
# main() includes this script itself in the scan.
SECRET_PATTERNS = (
"set-" "coo" "kie",
"__cf" "_bm",
"cf" "_bm",
"author" "ization",
"private" "_key",
"api" "_secret",
"poly" "_signature",
"poly" "_passphrase",
"poly" "_address",
"bear" "er",
"coo" "kie",
"wallet" " material",
)
# Process-wide Decimal precision (significant digits) for all arithmetic below.
getcontext().prec = 50
def utc_now() -> dt.datetime:
return dt.datetime.now(dt.UTC)
def iso_z(value: dt.datetime | None = None) -> str:
value = value or utc_now()
return value.astimezone(dt.UTC).replace(microsecond=0).isoformat().replace("+00:00", "Z")
def compact_timestamp(value: dt.datetime | None = None) -> str:
value = value or utc_now()
return value.astimezone(dt.UTC).strftime("%Y%m%dT%H%M%SZ")
def sha256_file(path: Path) -> str:
    """Return the hex SHA-256 digest of a file, reading it in 1 MiB chunks."""
    chunk_size = 1024 * 1024
    hasher = hashlib.sha256()
    with path.open("rb") as stream:
        while chunk := stream.read(chunk_size):
            hasher.update(chunk)
    return hasher.hexdigest()
def decimal_from_raw(value: Any, field_name: str) -> Decimal:
    """Parse a raw string field into a finite Decimal.

    Args:
        value: Raw value taken from the order-book payload; must be a str.
        field_name: Dotted field label used in error messages.

    Returns:
        The parsed Decimal.

    Raises:
        ValueError: If value is not a string, not a decimal literal, or is
            NaN/Infinity.
    """
    if isinstance(value, str):
        try:
            number = Decimal(value)
        except InvalidOperation as exc:
            raise ValueError(f"{field_name} is not a decimal: {value!r}") from exc
        if number.is_finite():
            return number
        raise ValueError(f"{field_name} is not finite: {value!r}")
    raise ValueError(f"{field_name} is not a string: {value!r}")
def decimal_to_json(value: Decimal | None) -> str | None:
    """Render a Decimal as a plain (non-exponent) string for JSON output.

    None passes through unchanged, any zero becomes "0", and trailing zeros
    are removed from other values.
    """
    if value is None:
        return None
    return "0" if value == 0 else f"{value.normalize():f}"
def load_json(path: Path) -> dict[str, Any]:
    """Load a UTF-8 JSON file whose top-level value must be an object.

    Raises:
        ValueError: If the document is valid JSON but not an object.
    """
    payload = json.loads(path.read_text(encoding="utf-8"))
    if isinstance(payload, dict):
        return payload
    raise ValueError(f"{path} did not contain a JSON object")
def resolve_repo_path(path_text: str) -> Path:
    """Turn manifest path text into a Path, anchoring relative paths at the cwd."""
    candidate = Path(path_text)
    return candidate if candidate.is_absolute() else Path.cwd() / candidate
def normalize_side(levels: Any, side_name: str) -> list[tuple[Decimal, Decimal]]:
    """Validate one side of a raw book and return it as (price, size) pairs.

    Args:
        levels: Raw "bids" or "asks" value; must be a list of objects with
            string "price" and "size" fields.
        side_name: "bids" or "asks"; used only to label errors.

    Returns:
        Levels in their original order as Decimal (price, size) tuples.

    Raises:
        ValueError: If the structure is wrong, a number does not parse, or a
            size is negative.
    """
    if not isinstance(levels, list):
        raise ValueError(f"raw.{side_name} is not a list")

    def parse_level(position: int, entry: Any) -> tuple[Decimal, Decimal]:
        label = f"raw.{side_name}[{position}]"
        if not isinstance(entry, dict):
            raise ValueError(f"{label} is not an object")
        level_price = decimal_from_raw(entry.get("price"), f"{label}.price")
        level_size = decimal_from_raw(entry.get("size"), f"{label}.size")
        if level_size < 0:
            raise ValueError(f"{label}.size is negative")
        return level_price, level_size

    return [parse_level(position, entry) for position, entry in enumerate(levels)]
def sum_sizes(levels: list[tuple[Decimal, Decimal]]) -> Decimal:
    """Add up the size component of every (price, size) level; empty input gives 0."""
    total = Decimal("0")
    for _price, size in levels:
        total += size
    return total
def normalize_raw_row(raw_row: dict[str, Any], raw_file: str, raw_line_number: int) -> dict[str, Any]:
    """Derive one normalized snapshot row from one raw collector row.

    Args:
        raw_row: Decoded JSON line. Expected to be an object with "raw" (the
            order book, holding "bids" and "asks"), "market" and "collection"
            objects.
        raw_file: Path text of the raw file, copied into the row for traceability.
        raw_line_number: 1-based line number within raw_file, copied into the row.

    Returns:
        A dict with market identifiers, best bid/ask, spread, midpoint, total
        depth per side, and depth within each CENT_OFFSETS distance of the best
        price. Decimal values are rendered as strings; price-derived values are
        None when a side is empty.

    Raises:
        ValueError: If raw_row or one of its required sections has the wrong
            type, or a price/size fails validation.
    """
    # A JSON line can decode to a list, string or number; reject that with a
    # clear message instead of failing later on a missing .get attribute.
    if not isinstance(raw_row, dict):
        raise ValueError(f"raw row is not an object: {type(raw_row).__name__}")
    raw_book = raw_row.get("raw")
    market = raw_row.get("market")
    collection = raw_row.get("collection")
    if not isinstance(raw_book, dict):
        raise ValueError("raw is not an object")
    if not isinstance(market, dict):
        raise ValueError("market is not an object")
    if not isinstance(collection, dict):
        raise ValueError("collection is not an object")

    bids = normalize_side(raw_book.get("bids"), "bids")
    asks = normalize_side(raw_book.get("asks"), "asks")
    # Best prices are computed by value, so level order in the raw book does
    # not matter.
    best_bid = max((price for price, _ in bids), default=None)
    best_ask = min((price for price, _ in asks), default=None)
    spread = None
    midpoint = None
    if best_bid is not None and best_ask is not None:
        spread = best_ask - best_bid
        midpoint = (best_bid + best_ask) / Decimal("2")

    row: dict[str, Any] = {
        "schema_name": SCHEMA_NAME,
        "schema_version": SCHEMA_VERSION,
        "market_name": market.get("market_name"),
        "market_slug": market.get("market_slug"),
        "condition_id": market.get("condition_id"),
        "token_id": market.get("token_id"),
        "outcome": market.get("outcome"),
        "collected_at_utc": collection.get("collected_at_utc"),
        "best_bid": decimal_to_json(best_bid),
        "best_ask": decimal_to_json(best_ask),
        "spread": decimal_to_json(spread),
        "midpoint": decimal_to_json(midpoint),
        "bid_depth_total": decimal_to_json(sum_sizes(bids)),
        "ask_depth_total": decimal_to_json(sum_sizes(asks)),
        "raw_file": raw_file,
        "raw_line_number": raw_line_number,
    }
    for label, offset in CENT_OFFSETS.items():
        # Depth within `offset` of the best price on each side; an empty side
        # contributes zero depth.
        bid_depth = Decimal("0")
        if best_bid is not None:
            bid_floor = best_bid - offset
            bid_depth = sum((size for price, size in bids if price >= bid_floor), Decimal("0"))
        ask_depth = Decimal("0")
        if best_ask is not None:
            ask_ceiling = best_ask + offset
            ask_depth = sum((size for price, size in asks if price <= ask_ceiling), Decimal("0"))
        row[f"bid_depth_within_{label}"] = decimal_to_json(bid_depth)
        row[f"ask_depth_within_{label}"] = decimal_to_json(ask_depth)
    return row
def summarize_output(path: Path, rows: int) -> dict[str, Any]:
    """Build the manifest entry describing the normalized output file.

    Args:
        path: Location of the written gzip JSONL file.
        rows: Number of normalized rows written to it.

    Returns:
        A dict with the display path, row count, size in bytes, SHA-256 digest
        and a fixed "valid" status.
    """
    display_path = path
    if path.is_absolute():
        try:
            display_path = path.relative_to(Path.cwd())
        except ValueError:
            # The output directory may live outside the working directory
            # (e.g. an absolute --output-dir); report the absolute path then
            # instead of failing before the manifest is written.
            display_path = path
    return {
        "path": str(display_path),
        "rows": rows,
        "bytes": path.stat().st_size,
        "sha256": sha256_file(path),
        "status": "valid",
    }
def build_input_file_summary(manifest: dict[str, Any]) -> list[dict[str, Any]]:
    """Check every raw file listed in the collector manifest and describe it.

    Args:
        manifest: Parsed collector manifest; its "output_files" must be a
            non-empty list of objects carrying at least a "path".

    Returns:
        One dict per listed file with the declared path, expected row count,
        actual size, actual and declared SHA-256, and whether they match.

    Raises:
        ValueError: If "output_files" is missing/empty or an entry is malformed.
        FileNotFoundError: If a listed file does not exist.
    """
    entries = manifest.get("output_files")
    if not (isinstance(entries, list) and entries):
        raise ValueError("input manifest has no output_files")

    def describe(entry: Any) -> dict[str, Any]:
        if not isinstance(entry, dict):
            raise ValueError("input manifest output_files entry is not an object")
        declared_path = entry.get("path")
        if not (isinstance(declared_path, str) and declared_path):
            raise ValueError("input manifest output_files entry lacks path")
        resolved = resolve_repo_path(declared_path)
        if not resolved.exists():
            raise FileNotFoundError(resolved)
        digest = sha256_file(resolved)
        declared_digest = entry.get("sha256")
        digests_agree = declared_digest == digest
        return {
            "path": declared_path,
            "rows_expected": entry.get("rows"),
            "bytes": resolved.stat().st_size,
            "sha256": digest,
            "input_manifest_sha256": declared_digest,
            "checksum_match": digests_agree,
            "status": "valid" if digests_agree else "invalid",
        }

    return [describe(entry) for entry in entries]
def read_and_normalize(
input_files: list[dict[str, Any]],
output_path: Path,
) -> tuple[int, int, list[dict[str, Any]], dict[str, Any]]:
"""Stream every raw input file into one normalized gzip JSONL output file.

Args:
    input_files: Entries as produced by build_input_file_summary; only the
        "path" key is read here.
    output_path: Destination file; parent directories are created.

Returns:
    (raw_rows_read, normalized_rows_written, errors, sanity) where errors is
    a list of per-row (or per-missing-file) error records and sanity is a
    dict of boolean checks accumulated over all rows.
"""
raw_rows_read = 0
normalized_rows_written = 0
errors: list[dict[str, Any]] = []
# Every flag starts optimistic and is only ever flipped to False.
# NOTE(review): "gzip_jsonl_parseable" is initialised here but never updated
# in this function -- confirm whether it is still needed.
sanity = {
"raw_file_refs_present": True,
"raw_files_exist": True,
"spread_non_negative": True,
"midpoint_between_bid_ask": True,
"depth_totals_non_negative": True,
"outcomes_seen": [],
"gzip_jsonl_parseable": True,
"row_count_match": None,
}
outcomes_seen: set[str] = set()
output_path.parent.mkdir(parents=True, exist_ok=True)
with gzip.open(output_path, "wt", encoding="utf-8", compresslevel=9) as output:
for file_entry in input_files:
raw_file = file_entry["path"]
raw_path = resolve_repo_path(raw_file)
if not raw_path.exists():
sanity["raw_files_exist"] = False
errors.append({"raw_file": raw_file, "error": "raw file missing"})
continue
with gzip.open(raw_path, "rt", encoding="utf-8") as raw_handle:
for raw_line_number, line in enumerate(raw_handle, 1):
raw_rows_read += 1
try:
raw_row = json.loads(line)
normalized = normalize_raw_row(raw_row, raw_file, raw_line_number)
# The row is written and counted before the sanity checks below run.
output.write(json.dumps(normalized, sort_keys=True, separators=(",", ":")) + "\n")
normalized_rows_written += 1
if not normalized.get("raw_file") or not normalized.get("raw_line_number"):
sanity["raw_file_refs_present"] = False
if not resolve_repo_path(str(normalized["raw_file"])).exists():
sanity["raw_files_exist"] = False
outcome = normalized.get("outcome")
if isinstance(outcome, str):
outcomes_seen.add(outcome)
# Re-parse the emitted strings so the checks apply to what was written.
best_bid = Decimal(normalized["best_bid"]) if normalized["best_bid"] is not None else None
best_ask = Decimal(normalized["best_ask"]) if normalized["best_ask"] is not None else None
spread = Decimal(normalized["spread"]) if normalized["spread"] is not None else None
midpoint = Decimal(normalized["midpoint"]) if normalized["midpoint"] is not None else None
if best_bid is not None and best_ask is not None:
if spread is None or spread < 0:
sanity["spread_non_negative"] = False
if midpoint is None or midpoint < best_bid or midpoint > best_ask:
sanity["midpoint_between_bid_ask"] = False
depth_fields = [
"bid_depth_total",
"ask_depth_total",
"bid_depth_within_1c",
"ask_depth_within_1c",
"bid_depth_within_2c",
"ask_depth_within_2c",
"bid_depth_within_5c",
"ask_depth_within_5c",
]
for field in depth_fields:
if Decimal(normalized[field]) < 0:
sanity["depth_totals_non_negative"] = False
except Exception as exc: # noqa: BLE001 - preserve row-level failure evidence.
errors.append(
{
"raw_file": raw_file,
"raw_line_number": raw_line_number,
"error": str(exc),
}
)
sanity["outcomes_seen"] = sorted(outcomes_seen)
sanity["has_up_and_down"] = {"Up", "Down"}.issubset(outcomes_seen)
# NOTE(review): errors also holds "raw file missing" records, which are not
# rows, so this comparison is False whenever a file was skipped.
sanity["row_count_match"] = raw_rows_read == normalized_rows_written + len(errors)
return raw_rows_read, normalized_rows_written, errors, sanity
def validate_output_gzip_jsonl(path: Path) -> tuple[bool, int, list[str]]:
    """Re-read a gzip JSONL file and confirm every line parses as JSON.

    Returns:
        (ok, rows, errors): ok is True when nothing failed, rows is the number
        of the last line that parsed before any failure, and errors holds the
        text of the first exception (reading stops there).
    """
    problems: list[str] = []
    last_good_line = 0
    try:
        with gzip.open(path, "rt", encoding="utf-8") as stream:
            current_line = 0
            for text in stream:
                current_line += 1
                json.loads(text)
                last_good_line = current_line
    except Exception as exc:  # noqa: BLE001 - validation result belongs in manifest.
        problems.append(str(exc))
    return len(problems) == 0, last_good_line, problems
def scan_for_secret_terms(paths: list[Path]) -> dict[str, Any]:
    """Search files line by line for any SECRET_PATTERNS term (case-insensitive).

    Args:
        paths: Files to scan. Missing files are skipped; files whose suffix is
            ".gz" are read through gzip, everything else as plain text.

    Returns:
        A dict with "passed" (True when nothing matched), the number of terms
        checked, and one match record per offending line. A record holds the
        path, the 1-based line number and the 1-based index of the first term
        that matched; the matched text itself is never recorded.
    """
    matches: list[dict[str, Any]] = []
    lowered_patterns = tuple(pattern.lower() for pattern in SECRET_PATTERNS)
    working_dir = Path.cwd()
    for path in paths:
        if not path.exists():
            continue
        display_path = path
        if path.is_absolute():
            try:
                display_path = path.relative_to(working_dir)
            except ValueError:
                # A scanned file outside the working directory must still be
                # reportable; fall back to its absolute path rather than
                # raising while recording a match.
                display_path = path
        opener = gzip.open if path.suffix == ".gz" else open
        with opener(path, "rt", encoding="utf-8", errors="replace") as handle:  # type: ignore[arg-type]
            for line_number, line in enumerate(handle, 1):
                lower = line.lower()
                for pattern_index, pattern in enumerate(lowered_patterns, 1):
                    if pattern in lower:
                        matches.append(
                            {
                                "path": str(display_path),
                                "line_number": line_number,
                                "term_index": pattern_index,
                            }
                        )
                        # One record per line is enough; stop at the first term.
                        break
    return {
        "passed": not matches,
        "checked_term_count": len(SECRET_PATTERNS),
        "matches": matches,
    }
def parse_args(argv: list[str]) -> argparse.Namespace:
    """Parse the command line: three optional Path arguments with defaults.

    Args:
        argv: Argument list without the program name.

    Returns:
        Namespace with input_manifest, output_dir and manifest_path.
    """
    parser = argparse.ArgumentParser(
        description="Normalize Checkpoint 4 raw Polymarket order-book snapshots.",
    )
    path_options = (
        ("--input-manifest", DEFAULT_INPUT_MANIFEST, "Raw collector manifest path"),
        ("--output-dir", DEFAULT_OUTPUT_DIR, "Normalized sample base directory"),
        ("--manifest-path", DEFAULT_MANIFEST_PATH, "Normalization manifest path"),
    )
    for flag, fallback, summary in path_options:
        parser.add_argument(
            flag,
            type=Path,
            default=fallback,
            help=f"{summary}. Default: {fallback}",
        )
    return parser.parse_args(argv)
def main(argv: list[str]) -> int:
"""Run one normalization pass and write its manifest.

Loads the collector manifest, normalizes every listed raw file into a
single run-stamped gzip JSONL file, validates the result, scans this script
and the output for secret-like terms, writes the normalization manifest and
prints a short JSON summary.

Args:
    argv: Command-line arguments without the program name.

Returns:
    0 when gate_status is "PASS", otherwise 1.
"""
args = parse_args(argv)
started = utc_now()
input_manifest = load_json(args.input_manifest)
input_files = build_input_file_summary(input_manifest)
# The start time doubles as the run id and names the output directory and file.
run_id = compact_timestamp(started)
output_path = (
args.output_dir
/ "polymarket"
/ "orderbooks"
/ run_id
/ f"polymarket_orderbooks_normalized_{run_id}.jsonl.gz"
)
raw_rows_read, normalized_rows_written, row_errors, sanity = read_and_normalize(input_files, output_path)
gzip_ok, gzip_rows, gzip_errors = validate_output_gzip_jsonl(output_path)
output_summary = summarize_output(output_path, normalized_rows_written)
sanity.update(
{
"output_row_count_equals_raw_input_row_count": normalized_rows_written == raw_rows_read
if not row_errors
else False,
"gzip_jsonl_decompresses_and_parses": gzip_ok,
"gzip_jsonl_rows_parsed": gzip_rows,
"gzip_jsonl_errors": gzip_errors,
# Re-hashes the file and compares it with the digest taken a moment ago.
"manifest_checksum_matches_output": output_summary["sha256"] == sha256_file(output_path),
"all_input_file_checksums_match": all(file_entry["checksum_match"] for file_entry in input_files),
}
)
# The script scans itself as well as the output file.
secret_scan = scan_for_secret_terms([Path(__file__), output_path])
sanity["checkpoint5_secret_scan_passed"] = secret_scan["passed"]
# Every entry must be truthy, and at least one row must have been written,
# for the gate to pass.
gate_checks = [
normalized_rows_written == raw_rows_read,
not row_errors,
sanity["raw_file_refs_present"],
sanity["raw_files_exist"],
sanity["spread_non_negative"],
sanity["midpoint_between_bid_ask"],
sanity["depth_totals_non_negative"],
sanity["has_up_and_down"],
gzip_ok,
sanity["manifest_checksum_matches_output"],
secret_scan["passed"],
all(file_entry["checksum_match"] for file_entry in input_files),
]
gate_status = "PASS" if all(gate_checks) and normalized_rows_written > 0 else "FAIL"
ended = utc_now()
manifest = {
"schema_name": "orderbook_normalization_sample_manifest",
"schema_version": 1,
"checkpoint_id": 5,
"checkpoint_name": "Normalized Snapshot Extract",
"normalizer": {
"name": NORMALIZER_NAME,
"version": NORMALIZER_VERSION,
},
"started_at_utc": iso_z(started),
"ended_at_utc": iso_z(ended),
"run_duration_seconds": round((ended - started).total_seconds(), 3),
"command": "scripts/normalize_polymarket_orderbooks.py",
"input_manifest": {
"path": str(args.input_manifest),
"sha256": sha256_file(args.input_manifest),
"collector_manifest_schema_name": input_manifest.get("schema_name"),
"collector_gate_status": input_manifest.get("gate_status"),
},
"input_files": input_files,
"output_files": [output_summary],
"raw_rows_read": raw_rows_read,
"normalized_rows_written": normalized_rows_written,
"skipped_rows": len(row_errors),
"error_rows": row_errors,
"numeric_encoding": "Exact decimal values are emitted as JSON strings; missing price-derived values are null.",
"sanity_checks": sanity,
"secret_scan": secret_scan,
"warnings": [],
"known_gaps": [
"This is a derived sample extract only; raw gzip JSONL remains the source of truth.",
"No upload, daemon runtime, systemd unit, dashboard, database, strategy, backtest, or trading behavior is included.",
"The sample proves normalization logic on one bounded raw run, not long-run schema stability.",
],
"fake_progress_risk": "A clean normalized sample can hide raw collection gaps and endpoint schema drift; every row is therefore traceable to raw_file and raw_line_number, and reliability remains gated on later soak testing.",
"next_step": "Checkpoint 6 should package the raw collector for a VPS runtime, or the orchestrator can request review of this normalized sample first.",
"gate_status": gate_status,
}
# The manifest is written whether the gate passed or failed.
args.manifest_path.parent.mkdir(parents=True, exist_ok=True)
args.manifest_path.write_text(json.dumps(manifest, indent=2, sort_keys=True) + "\n", encoding="utf-8")
print(
json.dumps(
{
"gate_status": gate_status,
"manifest_path": str(args.manifest_path),
"output_path": str(output_path),
"raw_rows_read": raw_rows_read,
"normalized_rows_written": normalized_rows_written,
"skipped_rows": len(row_errors),
"sha256": output_summary["sha256"],
},
indent=2,
sort_keys=True,
)
)
return 0 if gate_status == "PASS" else 1
# Script entry point: the process exit code is main's return value.
if __name__ == "__main__":
raise SystemExit(main(sys.argv[1:]))

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,362 @@
#!/usr/bin/env bash
# Soak-test controller: repeats discovery -> collection -> upload cycles until
# SOAK_SECONDS have elapsed or a stop signal arrives, and records a start
# manifest, one JSON line per cycle, and a final manifest.
# Errexit (-e) is deliberately not enabled here: step failures are captured as
# exit codes and recorded rather than aborting the run.
set -uo pipefail
# Tool locations and upload destination; every value below can be overridden
# through the ORDERBOOKS_* environment variable named in its default expansion.
APP_DIR="${ORDERBOOKS_APP_DIR:-$(pwd)}"
PYTHON_BIN="${ORDERBOOKS_PYTHON:-python3}"
RCLONE_BIN="${ORDERBOOKS_RCLONE_BIN:-/usr/bin/rclone}"
RCLONE_DEST_BASE="${ORDERBOOKS_RCLONE_DEST:-gdrive:orderbooks/polymarket/soak-test}"
# Run identity and timing. The default total duration is 86400 s (24 h), split
# into collection cycles of 300 s.
SOAK_DATE="${ORDERBOOKS_SOAK_DATE:-$(date -u +%F)}"
SOAK_ID="${ORDERBOOKS_SOAK_ID:-soak_test_${SOAK_DATE}}"
SOAK_SECONDS="${ORDERBOOKS_SOAK_SECONDS:-86400}"
CYCLE_SECONDS="${ORDERBOOKS_SOAK_CYCLE_SECONDS:-300}"
# Values passed straight through to the collector and discovery scripts.
INTERVAL_SECONDS="${ORDERBOOKS_SOAK_INTERVAL_SECONDS:-30}"
MARKET_LIMIT="${ORDERBOOKS_SOAK_MARKET_LIMIT:-2}"
MARKET_END_SAFETY_SECONDS="${ORDERBOOKS_SOAK_MARKET_END_SAFETY_SECONDS:-420}"
REQUEST_TIMEOUT_SECONDS="${ORDERBOOKS_SOAK_REQUEST_TIMEOUT_SECONDS:-15}"
MAX_RETRIES="${ORDERBOOKS_SOAK_MAX_RETRIES:-2}"
BACKOFF_SECONDS="${ORDERBOOKS_SOAK_BACKOFF_SECONDS:-2}"
DISCOVERY_LIMIT="${ORDERBOOKS_SOAK_DISCOVERY_LIMIT:-100}"
DISCOVERY_MAX_PAGES="${ORDERBOOKS_SOAK_DISCOVERY_MAX_PAGES:-3}"
DISCOVERY_TIMEOUT="${ORDERBOOKS_SOAK_DISCOVERY_TIMEOUT:-15}"
# Local layout (relative paths are relative to APP_DIR after the cd below).
LOCAL_ROOT="${ORDERBOOKS_SOAK_LOCAL_ROOT:-data/soak_test/${SOAK_DATE}}"
MANIFEST_ROOT="${ORDERBOOKS_SOAK_MANIFEST_ROOT:-data/manifests/${SOAK_ID}}"
START_MANIFEST="${ORDERBOOKS_SOAK_START_MANIFEST:-data/manifests/${SOAK_ID}_start.json}"
FINAL_MANIFEST="${ORDERBOOKS_SOAK_FINAL_MANIFEST:-data/manifests/${SOAK_ID}_final.json}"
DISCOVERY_DIR="${LOCAL_ROOT}/discovery"
LIVE_DIR="${LOCAL_ROOT}/live_sample"
LOG_DIR="${LOCAL_ROOT}/logs"
PID_FILE="${LOCAL_ROOT}/soak.pid"
CYCLES_JSONL="${MANIFEST_ROOT}/cycles.jsonl"
LOG_FILE="${LOG_DIR}/soak.log"
# Remote destination for this date; a trailing slash on the base is dropped first.
REMOTE_DEST="${RCLONE_DEST_BASE%/}/${SOAK_DATE}"
# Mutable controller state shared between the main loop, the signal handler and
# the exit trap.
STOP_REQUESTED=0
STOP_SIGNAL=""
CURRENT_CHILD_PID=""
CURRENT_PHASE="initializing"
CURRENT_CYCLE_ID=""
START_WRITTEN=0
FINAL_WRITTEN=0
cd "${APP_DIR}" || exit 2
mkdir -p "${DISCOVERY_DIR}" "${LIVE_DIR}" "${LOG_DIR}" "${MANIFEST_ROOT}" "$(dirname "${START_MANIFEST}")" "$(dirname "${FINAL_MANIFEST}")"
STARTED_AT="$(date -u +%Y-%m-%dT%H:%M:%SZ)"
START_EPOCH="$(date -u +%s)"
END_EPOCH="$((START_EPOCH + SOAK_SECONDS))"
# Expected end time as ISO-8601 UTC: try date with -d "@epoch" first and fall
# back to python3 when that invocation fails.
EXPECTED_COMPLETION_AT="$(date -u -d "@${END_EPOCH}" +%Y-%m-%dT%H:%M:%SZ 2>/dev/null || python3 - <<PY
import datetime as dt
print(dt.datetime.fromtimestamp(${END_EPOCH}, dt.UTC).replace(microsecond=0).isoformat().replace("+00:00", "Z"))
PY
)"
# Append a UTC-timestamped message to LOG_FILE only (nothing on stdout).
# Never fails: write errors are discarded, which makes it usable from the
# signal handler.
safe_log() {
  local stamp
  stamp="$(date -u +%Y-%m-%dT%H:%M:%SZ)"
  printf '%s %s\n' "${stamp}" "$*" >> "${LOG_FILE}" 2>/dev/null || true
}
# Print a UTC-timestamped message to stdout and append it to LOG_FILE.
log() {
  local stamp
  stamp="$(date -u +%Y-%m-%dT%H:%M:%SZ)"
  printf '%s %s\n' "${stamp}" "$*" | tee -a "${LOG_FILE}"
}
# Trap handler for INT/TERM/HUP.
# Records that a stop was requested (and by which signal), logs it, and passes
# the same signal on to the child currently tracked in CURRENT_CHILD_PID, if
# that process is still alive. Always returns 0.
#   $1 - signal name: SIGINT, SIGTERM or SIGHUP
handle_signal() {
  local signal_name="$1"
  local forward=""
  STOP_REQUESTED=1
  STOP_SIGNAL="${signal_name}"
  safe_log "SIGNAL received=${signal_name} phase=${CURRENT_PHASE} cycle_id=${CURRENT_CYCLE_ID:-none}"
  # Nothing to forward to when no child is running.
  if [[ -z "${CURRENT_CHILD_PID}" ]]; then
    return 0
  fi
  if ! kill -0 "${CURRENT_CHILD_PID}" 2>/dev/null; then
    return 0
  fi
  case "${signal_name}" in
    SIGINT) forward="INT" ;;
    SIGTERM) forward="TERM" ;;
    SIGHUP) forward="HUP" ;;
  esac
  if [[ -n "${forward}" ]]; then
    kill -s "${forward}" "${CURRENT_CHILD_PID}" 2>/dev/null || true
  fi
  return 0
}
# Write the start marker (START_MANIFEST) describing this soak run.
# The JSON is written to a ".tmp" sibling first and then moved into place with
# os.replace. Shell values are interpolated into the Python source by the
# unquoted heredoc. START_WRITTEN is set to 1 afterwards regardless of the
# writer's exit status; the caller checks the file separately.
write_start_manifest() {
  local staging_path="${START_MANIFEST}.tmp"
  python3 - "${staging_path}" "${START_MANIFEST}" <<PY
import json
import os
import sys
from pathlib import Path

staging_path, target_path = (Path(arg) for arg in sys.argv[1:3])
document = {
    "schema_name": "soak_test_start_manifest",
    "schema_version": 1,
    "checkpoint_id": 8,
    "checkpoint_name": "24h Soak Test Plan",
    "status": "STARTED",
    "started_at_utc": "${STARTED_AT}",
    "expected_completion_at_utc": "${EXPECTED_COMPLETION_AT}",
    "soak_seconds": int("${SOAK_SECONDS}"),
    "cycle_seconds": int("${CYCLE_SECONDS}"),
    "pid": int("$$"),
    "pid_file": "${PID_FILE}",
    "log_file": "${LOG_FILE}",
    "local_root": "${LOCAL_ROOT}",
    "manifest_root": "${MANIFEST_ROOT}",
    "remote_dest": "${REMOTE_DEST}",
    "raw_output_dir": "${LIVE_DIR}",
    "discovery_dir": "${DISCOVERY_DIR}",
    "cycles_jsonl": "${CYCLES_JSONL}",
    "gate_status": "IN_PROGRESS",
    "production_ready": False,
    "notes": [
        "This is a real 24h soak start marker, not a completion report.",
        "Checkpoint 8 cannot pass until 24 real hours elapse and final metrics are validated.",
    ],
}
staging_path.parent.mkdir(parents=True, exist_ok=True)
rendered = json.dumps(document, indent=2, sort_keys=True) + "\n"
staging_path.write_text(rendered, encoding="utf-8")
os.replace(staging_path, target_path)
PY
  START_WRITTEN=1
}
# Append one cycle record to CYCLES_JSONL as a single line.
#   $1 - the record, already serialized as JSON text
write_cycle_record() {
  local json_line="$1"
  printf '%s\n' "${json_line}" >> "${CYCLES_JSONL}"
}
# Write the final marker (FINAL_MANIFEST) summarizing the whole soak run.
#   $1 - final status text (e.g. COMPLETED_NEEDS_REVIEW, INTERRUPTED, ERROR)
#   $2 - gate status text
#   $3 - exit reason text
# All cycle records are read back from CYCLES_JSONL and embedded together with
# per-status counts. Lines that are not a JSON object (for example a record
# truncated by an abrupt stop) are skipped and counted in
# "unparsable_cycle_line_count" instead of aborting the writer. The JSON is
# written to a ".tmp" sibling and moved into place with os.replace.
# Returns the writer's exit status. FINAL_WRITTEN is set to 1 only when the
# manifest was actually written, so the exit trap can retry after a failure.
# Shell values are interpolated into the Python source by the unquoted heredoc.
write_final_manifest() {
  local final_status="$1"
  local gate_status="$2"
  local exit_reason="$3"
  local ended_at
  local tmp_path
  local rc
  ended_at="$(date -u +%Y-%m-%dT%H:%M:%SZ)"
  tmp_path="${FINAL_MANIFEST}.tmp"
  python3 - "$tmp_path" "$FINAL_MANIFEST" <<PY
import json
import os
import pathlib
import sys

tmp_path = pathlib.Path(sys.argv[1])
final_path = pathlib.Path(sys.argv[2])
cycles_path = pathlib.Path("${CYCLES_JSONL}")
cycles = []
unparsable_cycle_lines = 0
if cycles_path.exists():
    for line in cycles_path.read_text(encoding="utf-8").splitlines():
        if not line.strip():
            continue
        try:
            record = json.loads(line)
        except json.JSONDecodeError:
            unparsable_cycle_lines += 1
            continue
        # Only objects can be inspected with .get() below.
        if isinstance(record, dict):
            cycles.append(record)
        else:
            unparsable_cycle_lines += 1
manifest = {
    "schema_name": "soak_test_final_manifest",
    "schema_version": 1,
    "checkpoint_id": 8,
    "checkpoint_name": "24h Soak Test Plan",
    "status": "${final_status}",
    "gate_status": "${gate_status}",
    "exit_reason": "${exit_reason}",
    "started_at_utc": "${STARTED_AT}",
    "ended_at_utc": "${ended_at}",
    "expected_completion_at_utc": "${EXPECTED_COMPLETION_AT}",
    "soak_seconds": int("${SOAK_SECONDS}"),
    "cycle_seconds": int("${CYCLE_SECONDS}"),
    "cycles": cycles,
    "cycle_count": len(cycles),
    "ok_cycle_count": sum(1 for cycle in cycles if cycle.get("status") == "OK"),
    "error_cycle_count": sum(1 for cycle in cycles if cycle.get("status") == "ERROR"),
    "interrupted_cycle_count": sum(1 for cycle in cycles if cycle.get("status") == "INTERRUPTED"),
    "unparsable_cycle_line_count": unparsable_cycle_lines,
    "pid": int("$$"),
    "pid_file": "${PID_FILE}",
    "log_file": "${LOG_FILE}",
    "local_root": "${LOCAL_ROOT}",
    "manifest_root": "${MANIFEST_ROOT}",
    "remote_dest": "${REMOTE_DEST}",
    "stop_requested": bool(int("${STOP_REQUESTED}")),
    "stop_signal": "${STOP_SIGNAL}",
    "current_phase_at_exit": "${CURRENT_PHASE}",
    "current_cycle_id_at_exit": "${CURRENT_CYCLE_ID}",
    "production_ready": False,
    "notes": [
        "This marker is written by the soak controller on completion, interruption, or error.",
        "Checkpoint 8 cannot be PASS until 24 real hours elapse and final metrics are validated.",
    ],
}
tmp_path.parent.mkdir(parents=True, exist_ok=True)
tmp_path.write_text(json.dumps(manifest, indent=2, sort_keys=True) + "\n", encoding="utf-8")
os.replace(tmp_path, final_path)
PY
  rc=$?
  if [[ "${rc}" -eq 0 ]]; then
    FINAL_WRITTEN=1
  else
    safe_log "FINAL_MANIFEST write_failed rc=${rc} path=${FINAL_MANIFEST}"
  fi
  return "${rc}"
}
# EXIT trap. Preserves the script's exit status, makes sure a final manifest
# exists once a start manifest was written, and removes the PID file when it
# still belongs to this process.
cleanup_on_exit() {
  local rc=$?
  local fallback_status="ERROR"
  local fallback_reason="exited_without_final_marker"
  local recorded_pid=""
  if [[ "${START_WRITTEN}" -eq 1 && "${FINAL_WRITTEN}" -eq 0 ]]; then
    # Choose the reason in priority order: signal, then non-zero exit, then
    # the generic "no marker" case.
    if [[ "${STOP_REQUESTED}" -eq 1 ]]; then
      fallback_status="INTERRUPTED"
      fallback_reason="${STOP_SIGNAL:-signal}"
    elif [[ "${rc}" -ne 0 ]]; then
      fallback_reason="exit_code_${rc}"
    fi
    write_final_manifest "${fallback_status}" "${fallback_status}" "${fallback_reason}"
  fi
  if [[ -f "${PID_FILE}" ]]; then
    recorded_pid="$(cat "${PID_FILE}" 2>/dev/null)"
    if [[ "${recorded_pid}" == "$$" ]]; then
      rm -f "${PID_FILE}"
    fi
  fi
  exit "${rc}"
}
# Run a command with stdout and stderr appended to LOG_FILE and return its
# exit status.
# The command runs in the background so that its PID can be published in
# CURRENT_CHILD_PID, which is what handle_signal forwards signals to.
run_logged() {
"$@" >> "${LOG_FILE}" 2>&1 &
CURRENT_CHILD_PID="$!"
wait "${CURRENT_CHILD_PID}"
local rc=$?
# A trapped signal makes the wait builtin return early while the child may
# still be shutting down; wait once more to collect the child's own status.
if [[ "${STOP_REQUESTED}" -eq 1 ]] && kill -0 "${CURRENT_CHILD_PID}" 2>/dev/null; then
wait "${CURRENT_CHILD_PID}"
rc=$?
fi
CURRENT_CHILD_PID=""
return "${rc}"
}
# Signals only request a stop (and are forwarded to the running child); the
# EXIT trap guarantees a final manifest and PID-file cleanup.
trap 'handle_signal SIGINT' INT
trap 'handle_signal SIGTERM' TERM
trap 'handle_signal SIGHUP' HUP
trap cleanup_on_exit EXIT
echo "$$" > "${PID_FILE}"
write_start_manifest
# Abort with exit code 3 when the start manifest is missing or empty.
test -s "${START_MANIFEST}" || exit 3
log "START soak_id=${SOAK_ID} pid=$$ expected_completion=${EXPECTED_COMPLETION_AT}"
cycle_index=0
error_seen=0
# Main loop: one iteration = discovery, then collection, then upload, then one
# cycle record. Placeholder exit codes used in the records:
#   98 - step skipped because a stop had been requested
#   99 - step skipped because the previous step failed
while true; do
now_epoch="$(date -u +%s)"
remaining="$((END_EPOCH - now_epoch))"
if [[ "${remaining}" -le 0 ]]; then
break
fi
if [[ "${STOP_REQUESTED}" -eq 1 ]]; then
break
fi
# Do not start a cycle when fewer than 30 seconds remain.
if [[ "${remaining}" -lt 30 ]]; then
log "SKIP final tiny remaining window seconds=${remaining}"
break
fi
cycle_index="$((cycle_index + 1))"
cycle_id="$(date -u +%Y%m%dT%H%M%SZ)"
CURRENT_CYCLE_ID="${cycle_id}"
# The last cycle is shortened so collection does not run past END_EPOCH.
run_seconds="${CYCLE_SECONDS}"
if [[ "${remaining}" -lt "${run_seconds}" ]]; then
run_seconds="${remaining}"
fi
discovery_json="${DISCOVERY_DIR}/polymarket_btc_markets_${cycle_id}.json"
discovery_manifest="${DISCOVERY_DIR}/polymarket_btc_markets_manifest_${cycle_id}.json"
discovery_markdown="${DISCOVERY_DIR}/polymarket_btc_markets_${cycle_id}.md"
collector_manifest="${MANIFEST_ROOT}/collector_${cycle_id}.json"
upload_manifest="${MANIFEST_ROOT}/upload_${cycle_id}.json"
cycle_started_at="$(date -u +%Y-%m-%dT%H:%M:%SZ)"
log "CYCLE ${cycle_index} start id=${cycle_id} run_seconds=${run_seconds}"
# Phase 1: market discovery.
discovery_exit=0
CURRENT_PHASE="discovery"
run_logged "${PYTHON_BIN}" scripts/discover_polymarket_btc_markets.py \
--output-json "${discovery_json}" \
--manifest "${discovery_manifest}" \
--markdown "${discovery_markdown}" \
--limit "${DISCOVERY_LIMIT}" \
--max-pages "${DISCOVERY_MAX_PAGES}" \
--timeout "${DISCOVERY_TIMEOUT}" || discovery_exit=$?
# Phase 2: order-book collection, only after a successful discovery.
collector_exit=0
if [[ "${STOP_REQUESTED}" -eq 1 ]]; then
collector_exit=98
elif [[ "${discovery_exit}" -eq 0 ]]; then
CURRENT_PHASE="collector"
run_logged "${PYTHON_BIN}" scripts/collect_polymarket_orderbooks.py \
--config config/polymarket_collector.vps.example.yaml \
--discovery-path "${discovery_json}" \
--output-dir "${LIVE_DIR}" \
--manifest-path "${collector_manifest}" \
--market-limit "${MARKET_LIMIT}" \
--interval-seconds "${INTERVAL_SECONDS}" \
--duration-seconds "${run_seconds}" \
--request-timeout-seconds "${REQUEST_TIMEOUT_SECONDS}" \
--max-retries "${MAX_RETRIES}" \
--backoff-seconds "${BACKOFF_SECONDS}" \
--market-end-safety-seconds "${MARKET_END_SAFETY_SECONDS}" || collector_exit=$?
else
collector_exit=99
fi
# Phase 3: upload, only after a successful collection.
upload_exit=0
if [[ "${STOP_REQUESTED}" -eq 1 ]]; then
upload_exit=98
elif [[ "${collector_exit}" -eq 0 ]]; then
CURRENT_PHASE="upload"
run_logged scripts/upload_archive_rclone.sh \
--execute \
--data-dir "${LOCAL_ROOT}" \
--raw-dir "${LIVE_DIR}" \
--source-manifest-dir "${MANIFEST_ROOT}" \
--manifest-dir "${MANIFEST_ROOT}" \
--manifest-path "${upload_manifest}" \
--dest "${REMOTE_DEST}" \
--min-age-seconds 0 \
--rclone-bin "${RCLONE_BIN}" || upload_exit=$?
else
upload_exit=99
fi
cycle_ended_at="$(date -u +%Y-%m-%dT%H:%M:%SZ)"
# A stop request takes precedence over the step results when classifying.
if [[ "${STOP_REQUESTED}" -eq 1 ]]; then
cycle_status="INTERRUPTED"
elif [[ "${discovery_exit}" -eq 0 && "${collector_exit}" -eq 0 && "${upload_exit}" -eq 0 ]]; then
cycle_status="OK"
else
cycle_status="ERROR"
error_seen=1
fi
# Serialize the cycle record with python3; shell values are interpolated into
# the Python source by the unquoted heredoc.
record="$(python3 - <<PY
import json
print(json.dumps({
"cycle_index": ${cycle_index},
"cycle_id": "${cycle_id}",
"started_at_utc": "${cycle_started_at}",
"ended_at_utc": "${cycle_ended_at}",
"run_seconds": int("${run_seconds}"),
"discovery_manifest": "${discovery_manifest}",
"collector_manifest": "${collector_manifest}",
"upload_manifest": "${upload_manifest}",
"discovery_exit": int("${discovery_exit}"),
"collector_exit": int("${collector_exit}"),
"upload_exit": int("${upload_exit}"),
"status": "${cycle_status}",
"stop_signal": "${STOP_SIGNAL}",
}, sort_keys=True))
PY
)"
write_cycle_record "${record}"
log "CYCLE ${cycle_index} end id=${cycle_id} status=${cycle_status} discovery_exit=${discovery_exit} collector_exit=${collector_exit} upload_exit=${upload_exit}"
CURRENT_PHASE="sleep"
CURRENT_CYCLE_ID=""
if [[ "${STOP_REQUESTED}" -eq 1 ]]; then
break
fi
# Pause 5 seconds between cycles. The sleep runs in the background and is
# waited on so that a trapped signal ends the pause promptly.
sleep 5 &
CURRENT_CHILD_PID="$!"
wait "${CURRENT_CHILD_PID}" || true
CURRENT_CHILD_PID=""
done
# Write the final manifest. Even a run with no errors is recorded only as
# COMPLETED_NEEDS_REVIEW / NEEDS_REVIEW, never as PASS.
CURRENT_PHASE="finalizing"
CURRENT_CYCLE_ID=""
if [[ "${STOP_REQUESTED}" -eq 1 ]]; then
write_final_manifest "INTERRUPTED" "INTERRUPTED" "${STOP_SIGNAL:-signal}"
elif [[ "${error_seen}" -eq 1 ]]; then
write_final_manifest "ERROR" "ERROR" "cycle_error"
else
write_final_manifest "COMPLETED_NEEDS_REVIEW" "NEEDS_REVIEW" "elapsed"
fi
# NOTE(review): "status_written=1" is a fixed string, and the script's exit
# status is that of this log call, so cycle errors and interruptions are
# reported only through the manifest -- confirm that is intended.
log "END soak_id=${SOAK_ID} final_manifest=${FINAL_MANIFEST} status_written=1"

View file

@ -0,0 +1,39 @@
#!/usr/bin/env bash
# One collector cycle: refresh the Polymarket BTC market discovery snapshot,
# then replace this shell with the order-book collector process.
set -euo pipefail

# Every location and tuning knob below can be overridden via ORDERBOOKS_* variables.
APP_DIR="${ORDERBOOKS_APP_DIR:-/opt/orderbooks}"
PYTHON_BIN="${ORDERBOOKS_PYTHON:-${APP_DIR}/.venv/bin/python}"
DATA_DIR="${ORDERBOOKS_DATA_DIR:-/var/lib/orderbooks}"
COLLECTOR_CONFIG="${ORDERBOOKS_COLLECTOR_CONFIG:-/etc/orderbooks/polymarket_collector.vps.yaml}"
DISCOVERY_DIR="${ORDERBOOKS_DISCOVERY_DIR:-${DATA_DIR}/discovery}"
OUTPUT_DIR="${ORDERBOOKS_OUTPUT_DIR:-${DATA_DIR}/raw_orderbooks}"
MANIFEST_DIR="${ORDERBOOKS_MANIFEST_DIR:-${DATA_DIR}/manifests}"
DISCOVERY_JSON="${ORDERBOOKS_DISCOVERY_JSON:-${DISCOVERY_DIR}/polymarket_btc_markets_latest.json}"
DISCOVERY_MANIFEST="${ORDERBOOKS_DISCOVERY_MANIFEST:-${DISCOVERY_DIR}/polymarket_btc_markets_manifest.json}"
DISCOVERY_MARKDOWN="${ORDERBOOKS_DISCOVERY_MARKDOWN:-${DISCOVERY_DIR}/polymarket_btc_markets.md}"
DISCOVERY_LIMIT="${ORDERBOOKS_DISCOVERY_LIMIT:-100}"
DISCOVERY_MAX_PAGES="${ORDERBOOKS_DISCOVERY_MAX_PAGES:-3}"
DISCOVERY_TIMEOUT="${ORDERBOOKS_DISCOVERY_TIMEOUT:-15}"

# UTC stamp (second resolution) embedded in the default collector manifest name.
cycle_id="$(date -u +%Y%m%dT%H%M%SZ)"
COLLECTOR_MANIFEST="${ORDERBOOKS_COLLECTOR_MANIFEST:-${MANIFEST_DIR}/polymarket_orderbook_collector_${cycle_id}.json}"

# Full argv for the discovery step.
discovery_cmd=(
  "${PYTHON_BIN}" scripts/discover_polymarket_btc_markets.py
  --output-json "${DISCOVERY_JSON}"
  --manifest "${DISCOVERY_MANIFEST}"
  --markdown "${DISCOVERY_MARKDOWN}"
  --limit "${DISCOVERY_LIMIT}"
  --max-pages "${DISCOVERY_MAX_PAGES}"
  --timeout "${DISCOVERY_TIMEOUT}"
)

# Full argv for the collector step; it consumes the discovery JSON written above.
collector_cmd=(
  "${PYTHON_BIN}" scripts/collect_polymarket_orderbooks.py
  --config "${COLLECTOR_CONFIG}"
  --discovery-path "${DISCOVERY_JSON}"
  --output-dir "${OUTPUT_DIR}"
  --manifest-path "${COLLECTOR_MANIFEST}"
)

mkdir -p "${DISCOVERY_DIR}" "${OUTPUT_DIR}" "${MANIFEST_DIR}"
cd "${APP_DIR}"

# Because of `set -e`, a failed discovery run ends the cycle before collection starts.
"${discovery_cmd[@]}"

# exec: the collector takes over this process, so its exit status is the cycle's status.
exec "${collector_cmd[@]}"

View file

@ -0,0 +1,90 @@
#!/usr/bin/env bash
# Supervisor loop that repeatedly runs scripts/run_polymarket_collector_cycle.sh
# and records loop-level events as JSON files in MANIFEST_DIR.
# `-e` is deliberately not set: a failing cycle is handled explicitly by the loop.
set -uo pipefail
# Directory the loop changes into before starting cycles.
APP_DIR="${ORDERBOOKS_APP_DIR:-/app}"
# Where loop event files are written; defaults to <data dir>/manifests.
MANIFEST_DIR="${ORDERBOOKS_MANIFEST_DIR:-${ORDERBOOKS_DATA_DIR:-/var/lib/orderbooks}/manifests}"
# Pause between cycles, counted in one-second steps.
LOOP_SLEEP_SECONDS="${ORDERBOOKS_LOOP_SLEEP_SECONDS:-15}"
# Set to 1 by the INT/TERM trap handler.
STOP_REQUESTED=0
# PID of the currently running cycle, or empty when no cycle is running.
CHILD_PID=""
# Print the current UTC time in compact form (YYYYMMDDTHHMMSSZ), suitable for file names.
utc_compact() {
  date -u '+%Y%m%dT%H%M%SZ'
}
# Print the current UTC time as an ISO-8601 timestamp (YYYY-MM-DDTHH:MM:SSZ).
utc_iso() {
  date -u '+%Y-%m-%dT%H:%M:%SZ'
}
# Persist one loop-level event as a timestamped JSON file inside MANIFEST_DIR.
# Arguments: $1 status label, $2 integer exit code, $3 human-readable message.
# The function's exit status is that of the embedded Python writer.
write_loop_event() {
  local status="$1" exit_code="$2" message="$3"
  local path
  path="${MANIFEST_DIR%/}/collector_loop_$(utc_compact).json"
  mkdir -p "${MANIFEST_DIR}"
  PYTHONDONTWRITEBYTECODE=1 python3 - "$path" "$status" "$exit_code" "$message" <<'PY_LOOP_EVENT'
import datetime as dt
import json
import sys
from pathlib import Path

target, status, raw_exit_code, message = sys.argv[1:5]
exit_code = int(raw_exit_code)
written_at = dt.datetime.now(dt.UTC).replace(microsecond=0).isoformat().replace('+00:00', 'Z')

event = {
    'schema_name': 'collector_loop_event',
    'schema_version': 1,
    'written_at_utc': written_at,
    'status': status,
    'exit_code': exit_code,
    'message': message,
}
payload = json.dumps(event, indent=2, sort_keys=True) + '\n'
Path(target).write_text(payload, encoding='utf-8')
PY_LOOP_EVENT
}
# Signal handler: flag the loop to stop and pass SIGTERM on to a still-running cycle.
request_stop() {
  STOP_REQUESTED=1
  # No cycle in flight: nothing to forward.
  [[ -n "${CHILD_PID}" ]] || return 0
  if kill -0 "${CHILD_PID}" >/dev/null 2>&1; then
    kill -TERM "${CHILD_PID}" >/dev/null 2>&1 || true
  fi
}
# Main supervisor loop: run collector cycles back to back until INT/TERM arrives.
trap request_stop INT TERM
mkdir -p "${MANIFEST_DIR}"
cd "${APP_DIR}" || exit 1
echo "collector loop started at $(utc_iso)"
while [[ "${STOP_REQUESTED}" -eq 0 ]]; do
  cycle_started="$(utc_iso)"
  echo "collector cycle starting at ${cycle_started}"
  # Run the cycle in the background so the trap can fire while we wait on it.
  /bin/bash scripts/run_polymarket_collector_cycle.sh &
  CHILD_PID="$!"
  # A signal that landed before CHILD_PID was recorded could not be forwarded
  # by the handler; forward it now so the child does not run a full cycle.
  if [[ "${STOP_REQUESTED}" -ne 0 ]]; then
    request_stop
  fi
  wait "${CHILD_PID}"
  cycle_exit="$?"
  # When a trapped signal arrives, `wait` returns immediately with a status
  # above 128 while the child may still be shutting down. Keep waiting until
  # the child is really gone so it is reaped and its own exit status is recorded.
  while [[ "${STOP_REQUESTED}" -ne 0 ]] && kill -0 "${CHILD_PID}" >/dev/null 2>&1; do
    wait "${CHILD_PID}"
    cycle_exit="$?"
  done
  CHILD_PID=""
  if [[ "${STOP_REQUESTED}" -ne 0 ]]; then
    write_loop_event "INTERRUPTED" "${cycle_exit}" "collector loop received stop request during or after cycle"
    break
  fi
  if [[ "${cycle_exit}" -ne 0 ]]; then
    write_loop_event "CYCLE_FAILED" "${cycle_exit}" "collector cycle exited nonzero; loop will continue after sleep"
    echo "collector cycle failed with exit ${cycle_exit}; continuing after ${LOOP_SLEEP_SECONDS}s" >&2
  else
    echo "collector cycle completed at $(utc_iso)"
  fi
  # Sleep in one-second slices so a stop request is noticed within about a second.
  for ((i = 0; i < LOOP_SLEEP_SECONDS; i++)); do
    if [[ "${STOP_REQUESTED}" -ne 0 ]]; then
      break
    fi
    sleep 1
  done
done
echo "collector loop stopped at $(utc_iso)"

462
scripts/upload_archive_rclone.sh Executable file
View file

@ -0,0 +1,462 @@
#!/usr/bin/env bash
# rclone-based uploader for collector archives and manifests.
# `-e` is not set; the script inspects exit codes itself to derive a status.
set -uo pipefail
# Identity recorded in the upload manifest.
SCRIPT_NAME="orderbooks_rclone_uploader"
SCRIPT_VERSION="0.1.0"
# Safe default: nothing is really uploaded unless --execute is passed.
MODE="dry-run"
# 1 only when --cleanup-after-verify is passed.
CLEANUP_AFTER_VERIFY=0
DATA_DIR="${ORDERBOOKS_UPLOAD_DATA_DIR:-${ORDERBOOKS_DATA_DIR:-/var/lib/orderbooks}}"
# Empty values below are filled from DATA_DIR after argument parsing.
RAW_DIR="${ORDERBOOKS_UPLOAD_RAW_DIR:-}"
SOURCE_MANIFEST_DIR="${ORDERBOOKS_UPLOAD_SOURCE_MANIFEST_DIR:-}"
MANIFEST_DIR="${ORDERBOOKS_UPLOAD_MANIFEST_DIR:-}"
MANIFEST_PATH="${ORDERBOOKS_UPLOAD_MANIFEST_PATH:-}"
# rclone destination (REMOTE:PATH); empty means "not configured".
DEST="${ORDERBOOKS_RCLONE_DEST:-}"
RCLONE_BIN="${ORDERBOOKS_RCLONE_BIN:-rclone}"
# Files modified more recently than this many seconds are skipped.
MIN_AGE_SECONDS="${ORDERBOOKS_UPLOAD_MIN_AGE_SECONDS:-600}"
# Local files younger than this many days are never deleted by cleanup.
RETENTION_DAYS="${ORDERBOOKS_UPLOAD_RETENTION_DAYS:-7}"
# Passed to rclone copy as --transfers / --checkers.
TRANSFERS="${ORDERBOOKS_RCLONE_TRANSFERS:-4}"
CHECKERS="${ORDERBOOKS_RCLONE_CHECKERS:-8}"
# Print the command-line help text to stdout.
usage() {
cat <<'EOF'
Usage: scripts/upload_archive_rclone.sh [options]
Uploads closed raw collector archive files and manifests with rclone.
Default mode is dry-run. Real upload requires --execute and a destination.
Options:
--dry-run Plan and run rclone copy with --dry-run (default).
--execute Run real rclone copy and rclone check.
--cleanup-after-verify Delete uploaded local files older than retention only after verification.
--data-dir DIR Base data directory. Default: /var/lib/orderbooks.
--raw-dir DIR Raw collector output directory. Default: DATA_DIR/raw_orderbooks.
--source-manifest-dir DIR Source collector manifest directory. Default: DATA_DIR/manifests.
--manifest-dir DIR Upload manifest output directory. Default: DATA_DIR/manifests.
--manifest-path PATH Exact upload manifest path.
--dest REMOTE:PATH rclone destination. Or set ORDERBOOKS_RCLONE_DEST.
--min-age-seconds N Skip files modified within N seconds. Default: 600.
--retention-days N Keep at least N days locally. Default: 7.
--rclone-bin PATH rclone binary path. Default: rclone.
--help Show this help.
EOF
}
# Parse command-line options. Flags override the environment-derived defaults.
while [[ $# -gt 0 ]]; do
  case "$1" in
    --dry-run)
      MODE="dry-run"
      shift
      ;;
    --execute)
      MODE="execute"
      shift
      ;;
    --cleanup-after-verify)
      CLEANUP_AFTER_VERIFY=1
      shift
      ;;
    --data-dir|--raw-dir|--source-manifest-dir|--manifest-dir|--manifest-path|--dest|--min-age-seconds|--retention-days|--rclone-bin)
      # Under `set -u` a missing value would otherwise abort with an opaque
      # "unbound variable" error; report it like any other usage mistake.
      if [[ $# -lt 2 ]]; then
        echo "Missing value for argument: $1" >&2
        usage >&2
        exit 2
      fi
      case "$1" in
        --data-dir) DATA_DIR="$2" ;;
        --raw-dir) RAW_DIR="$2" ;;
        --source-manifest-dir) SOURCE_MANIFEST_DIR="$2" ;;
        --manifest-dir) MANIFEST_DIR="$2" ;;
        --manifest-path) MANIFEST_PATH="$2" ;;
        --dest) DEST="$2" ;;
        --min-age-seconds) MIN_AGE_SECONDS="$2" ;;
        --retention-days) RETENTION_DAYS="$2" ;;
        --rclone-bin) RCLONE_BIN="$2" ;;
      esac
      shift 2
      ;;
    --help)
      usage
      exit 0
      ;;
    *)
      echo "Unknown argument: $1" >&2
      usage >&2
      exit 2
      ;;
  esac
done
# Fill in directory defaults that depend on the final DATA_DIR.
if [[ -z "${RAW_DIR}" ]]; then
  RAW_DIR="${DATA_DIR%/}/raw_orderbooks"
fi
if [[ -z "${SOURCE_MANIFEST_DIR}" ]]; then
  SOURCE_MANIFEST_DIR="${DATA_DIR%/}/manifests"
fi
if [[ -z "${MANIFEST_DIR}" ]]; then
  MANIFEST_DIR="${DATA_DIR%/}/manifests"
fi
STARTED_AT="$(date -u +%Y-%m-%dT%H:%M:%SZ)"
RUN_ID="$(date -u +%Y%m%dT%H%M%SZ)"
if [[ -z "${MANIFEST_PATH}" ]]; then
  MANIFEST_PATH="${MANIFEST_DIR%/}/upload_archive_${RUN_ID}.json"
fi
# Private scratch directory for the plan, logs and staged copies. A dedicated
# variable is used instead of overwriting TMPDIR, which child processes
# conventionally read to locate their own temporary files.
WORK_DIR="$(mktemp -d)" || WORK_DIR=""
if [[ -z "${WORK_DIR}" || ! -d "${WORK_DIR}" ]]; then
  # Without this guard every derived path would land in the filesystem root.
  echo "Failed to create temporary working directory" >&2
  exit 1
fi
trap 'rm -rf "${WORK_DIR}"' EXIT
PLAN_PATH="${WORK_DIR}/plan.json"
RCLONE_COPY_LOG="${WORK_DIR}/rclone_copy.log"
RCLONE_CHECK_LOG="${WORK_DIR}/rclone_check.log"
CLEANUP_PATH="${WORK_DIR}/cleanup.json"
STAGING_DIR="${WORK_DIR}/stage"
if ! mkdir -p "$(dirname "${MANIFEST_PATH}")" "${STAGING_DIR}"; then
  echo "Failed to create manifest or staging directory" >&2
  exit 1
fi
# Build the upload plan: pick eligible files, copy them into the staging tree
# and write the plan JSON. A failure here must stop the run; otherwise rclone
# would "successfully" copy an empty staging directory.
if ! python3 - "$DATA_DIR" "$RAW_DIR" "$SOURCE_MANIFEST_DIR" "$MANIFEST_PATH" "$MIN_AGE_SECONDS" "$STAGING_DIR" "$PLAN_PATH" <<'PY'
import datetime as dt
import hashlib
import json
import shutil
import sys
from pathlib import Path

data_dir = Path(sys.argv[1])
raw_dir = Path(sys.argv[2])
source_manifest_dir = Path(sys.argv[3])
manifest_path = Path(sys.argv[4]).resolve()
min_age_seconds = int(sys.argv[5])
staging_dir = Path(sys.argv[6])
plan_path = Path(sys.argv[7])
now = dt.datetime.now(dt.UTC)


def iso_z_from_ts(ts: float) -> str:
    """Format a POSIX timestamp as a second-precision UTC string ending in 'Z'."""
    return dt.datetime.fromtimestamp(ts, dt.UTC).replace(microsecond=0).isoformat().replace("+00:00", "Z")


def sha256_file(path: Path) -> str:
    """Return the hex SHA-256 of a file, read in 1 MiB chunks."""
    digest = hashlib.sha256()
    with path.open("rb") as handle:
        for chunk in iter(lambda: handle.read(1024 * 1024), b""):
            digest.update(chunk)
    return digest.hexdigest()


def rel_for(path: Path) -> str:
    """Path relative to data_dir, or just the file name when it lies outside."""
    resolved = path.resolve()
    try:
        return resolved.relative_to(data_dir.resolve()).as_posix()
    except ValueError:
        return resolved.name


def iter_files(root: Path):
    """Yield every regular file below root in sorted order."""
    if not root.exists():
        return
    for path in sorted(root.rglob("*")):
        if path.is_file():
            yield path


selected = []
skipped = []
warnings = []
seen = set()
staged_rels = set()
for root, kind in [(raw_dir, "raw"), (source_manifest_dir, "manifest")]:
    if not root.exists():
        warnings.append(f"{kind} source directory does not exist: {root}")
        continue
    for path in iter_files(root):
        resolved = path.resolve()
        if resolved in seen:
            continue
        seen.add(resolved)
        rel = rel_for(path)
        try:
            stat = path.stat()
        except OSError as exc:
            # The file disappeared (or became unreadable) after it was listed.
            warnings.append(f"could not stat {path}: {exc}")
            continue
        age_seconds = max(0, int(now.timestamp() - stat.st_mtime))
        base = {
            "local_path": str(path),
            "relative_path": rel,
            "kind": kind,
            "bytes": stat.st_size,
            "mtime_utc": iso_z_from_ts(stat.st_mtime),
            "age_seconds": age_seconds,
        }
        if resolved == manifest_path:
            skipped.append({**base, "reason": "current_upload_manifest"})
            continue
        if age_seconds < min_age_seconds:
            skipped.append({**base, "reason": "modified_within_min_age_seconds"})
            continue
        if rel in staged_rels:
            # Files outside data_dir are flattened to their bare name; two of
            # them can map to the same staged path. Never overwrite silently.
            warnings.append(f"staging path collision, file not staged: {path} -> {rel}")
            skipped.append({**base, "reason": "staging_path_collision"})
            continue
        staged_rels.add(rel)
        staged_path = staging_dir / rel
        staged_path.parent.mkdir(parents=True, exist_ok=True)
        shutil.copy2(path, staged_path)
        # Hash the staged copy: those are the exact bytes rclone will upload.
        checksum = sha256_file(staged_path)
        selected.append({**base, "sha256": checksum, "staged_path": str(staged_path)})

plan = {
    "selected_files": selected,
    "skipped_files": skipped,
    "warnings": warnings,
}
plan_path.write_text(json.dumps(plan, indent=2, sort_keys=True) + "\n", encoding="utf-8")
PY
then
  echo "Failed to build upload plan; aborting before any rclone operation." >&2
  exit 1
fi
# Probe rclone, then run copy (and, for real uploads, check) and translate the
# exit codes into OPERATION_STATUS / GATE_STATUS.
RCLONE_VERSION=""
RCLONE_AVAILABLE=0
if command -v "${RCLONE_BIN}" >/dev/null 2>&1; then
  RCLONE_AVAILABLE=1
  RCLONE_VERSION="$("${RCLONE_BIN}" version 2>/dev/null | head -n 1 || true)"
fi

DEST_CONFIGURED=0
if [[ -n "${DEST}" ]]; then
  DEST_CONFIGURED=1
fi

# Exit codes stay empty when the corresponding step never ran.
COPY_ATTEMPTED=0
CHECK_ATTEMPTED=0
COPY_EXIT_CODE=""
CHECK_EXIT_CODE=""
# The gate only leaves BLOCKED_REAL_UPLOAD after a real copy was attempted.
OPERATION_STATUS="PLANNED"
GATE_STATUS="BLOCKED_REAL_UPLOAD"

if [[ "${DEST_CONFIGURED}" -eq 0 ]]; then
  OPERATION_STATUS="BLOCKED_DEST_MISSING"
elif [[ "${RCLONE_AVAILABLE}" -eq 0 ]]; then
  OPERATION_STATUS="BLOCKED_RCLONE_UNAVAILABLE"
else
  COPY_ATTEMPTED=1
  copy_args=(copy "${STAGING_DIR}/" "${DEST%/}/" --checksum --transfers "${TRANSFERS}" --checkers "${CHECKERS}")
  if [[ "${MODE}" == "dry-run" ]]; then
    copy_args+=(--dry-run)
  fi
  "${RCLONE_BIN}" "${copy_args[@]}" >"${RCLONE_COPY_LOG}" 2>&1
  COPY_EXIT_CODE=$?

  if [[ "${COPY_EXIT_CODE}" -ne 0 ]]; then
    OPERATION_STATUS="COPY_FAILED"
    GATE_STATUS="FAIL"
  elif [[ "${MODE}" == "dry-run" ]]; then
    OPERATION_STATUS="DRY_RUN_PASS"
  else
    # Real upload succeeded: verify the staged tree against the destination.
    CHECK_ATTEMPTED=1
    "${RCLONE_BIN}" check "${STAGING_DIR}/" "${DEST%/}/" --one-way --checksum >"${RCLONE_CHECK_LOG}" 2>&1
    CHECK_EXIT_CODE=$?
    if [[ "${CHECK_EXIT_CODE}" -ne 0 ]]; then
      OPERATION_STATUS="VERIFY_FAILED"
      GATE_STATUS="FAIL"
    else
      OPERATION_STATUS="UPLOAD_VERIFIED"
      GATE_STATUS="PASS"
    fi
  fi
fi
# Optional local cleanup. Files are deleted only in execute mode, only with
# --cleanup-after-verify, only after a verified upload, and only when older
# than the retention window. The outcome is written to CLEANUP_PATH.
python3 - "$PLAN_PATH" "$CLEANUP_PATH" "$MODE" "$CLEANUP_AFTER_VERIFY" "$RETENTION_DAYS" "$OPERATION_STATUS" "$GATE_STATUS" <<'PY'
import datetime as dt
import json
import sys
from pathlib import Path

plan_path = Path(sys.argv[1])
cleanup_path = Path(sys.argv[2])
mode = sys.argv[3]
cleanup_after_verify = sys.argv[4] == "1"
retention_days = int(sys.argv[5])
operation_status = sys.argv[6]
gate_status = sys.argv[7]
plan = json.loads(plan_path.read_text())
now = dt.datetime.now(dt.UTC)
cutoff = now - dt.timedelta(days=retention_days)
retained = []
deleted = []
if mode == "execute" and cleanup_after_verify and operation_status == "UPLOAD_VERIFIED":
    deleted_at = now.replace(microsecond=0).isoformat().replace("+00:00", "Z")
    for item in plan["selected_files"]:
        path = Path(item["local_path"])
        try:
            mtime = dt.datetime.fromtimestamp(path.stat().st_mtime, dt.UTC)
        except FileNotFoundError:
            # Previously a vanished file was reported as "within_retention_window".
            retained.append({**item, "reason": "missing_before_cleanup"})
            continue
        if mtime >= cutoff:
            retained.append({**item, "reason": "within_retention_window"})
            continue
        try:
            path.unlink()
        except FileNotFoundError:
            # Removed by someone else between stat() and unlink().
            retained.append({**item, "reason": "missing_before_cleanup"})
            continue
        deleted.append({**item, "deleted_at_utc": deleted_at})
else:
    reason = "cleanup_not_requested"
    if mode != "execute":
        reason = "dry_run"
    elif operation_status != "UPLOAD_VERIFIED":
        reason = "not_verified"
    for item in plan["selected_files"]:
        retained.append({**item, "reason": reason})
cleanup_path.write_text(
    json.dumps({"retained_local_files": retained, "deleted_local_files": deleted}, indent=2, sort_keys=True) + "\n",
    encoding="utf-8",
)
PY
# Assemble and write the upload manifest, then print a short JSON summary.
# Scalar run state is handed to Python through exported variables.
ENDED_AT="$(date -u +%Y-%m-%dT%H:%M:%SZ)"
export SCRIPT_NAME SCRIPT_VERSION STARTED_AT ENDED_AT
export MODE OPERATION_STATUS GATE_STATUS
export RCLONE_BIN RCLONE_AVAILABLE RCLONE_VERSION DEST
export COPY_ATTEMPTED CHECK_ATTEMPTED COPY_EXIT_CODE CHECK_EXIT_CODE
export DATA_DIR RAW_DIR SOURCE_MANIFEST_DIR MIN_AGE_SECONDS RETENTION_DAYS CLEANUP_AFTER_VERIFY
# The exit status is checked so the script can never report success when the
# manifest (the durable record of this run) could not be written.
if ! python3 - "$PLAN_PATH" "$CLEANUP_PATH" "$MANIFEST_PATH" <<'PY'
import json
import os
import sys
from pathlib import Path

plan = json.loads(Path(sys.argv[1]).read_text())
cleanup = json.loads(Path(sys.argv[2]).read_text())
manifest_path = Path(sys.argv[3])
mode = os.environ["MODE"]
operation_status = os.environ["OPERATION_STATUS"]
gate_status = os.environ["GATE_STATUS"]
copy_attempted = os.environ["COPY_ATTEMPTED"] == "1"
check_attempted = os.environ["CHECK_ATTEMPTED"] == "1"
copy_exit_code = os.environ["COPY_EXIT_CODE"]
check_exit_code = os.environ["CHECK_EXIT_CODE"]
dest = os.environ["DEST"]


def public_item(item):
    """Copy of a plan entry without the temporary staging path."""
    public = dict(item)
    public.pop("staged_path", None)
    return public


selected = [public_item(item) for item in plan["selected_files"]]
skipped = [public_item(item) for item in plan["skipped_files"]]
retained_local = [public_item(item) for item in cleanup["retained_local_files"]]
deleted_local = [public_item(item) for item in cleanup["deleted_local_files"]]
attempted_files = selected if copy_attempted else []
# VERIFY_FAILED still counts as uploaded: the copy itself returned success.
uploaded_files = selected if mode == "execute" and operation_status in {"UPLOAD_VERIFIED", "VERIFY_FAILED"} else []
verified_files = selected if mode == "execute" and operation_status == "UPLOAD_VERIFIED" else []
dry_run_files = selected if mode == "dry-run" and operation_status == "DRY_RUN_PASS" else []
manifest = {
    "schema_name": "upload_archive_manifest",
    "schema_version": 1,
    "checkpoint_id": 7,
    "checkpoint_name": "Google Drive Offload",
    "uploader": {
        "name": os.environ["SCRIPT_NAME"],
        "version": os.environ["SCRIPT_VERSION"],
    },
    "started_at_utc": os.environ["STARTED_AT"],
    "ended_at_utc": os.environ["ENDED_AT"],
    "command_mode": mode,
    "operation_status": operation_status,
    "gate_status": gate_status,
    "rclone": {
        "binary": os.environ["RCLONE_BIN"],
        "available": os.environ["RCLONE_AVAILABLE"] == "1",
        "version": os.environ["RCLONE_VERSION"],
        "destination_configured": bool(dest),
        "destination": dest if dest else None,
        "copy_attempted": copy_attempted,
        # Empty string means the step never ran; recorded as null.
        "copy_exit_code": int(copy_exit_code) if copy_exit_code else None,
        "check_attempted": check_attempted,
        "check_exit_code": int(check_exit_code) if check_exit_code else None,
    },
    "config": {
        "data_dir": os.environ["DATA_DIR"],
        "raw_dir": os.environ["RAW_DIR"],
        "source_manifest_dir": os.environ["SOURCE_MANIFEST_DIR"],
        "manifest_path": str(manifest_path),
        "min_age_seconds": int(os.environ["MIN_AGE_SECONDS"]),
        "retention_days": int(os.environ["RETENTION_DAYS"]),
        "cleanup_after_verify": os.environ["CLEANUP_AFTER_VERIFY"] == "1",
    },
    "planned_files": selected,
    "attempted_files": attempted_files,
    "dry_run_files": dry_run_files,
    "uploaded_files": uploaded_files,
    "verified_files": verified_files,
    "skipped_open_or_recent_files": [
        item for item in skipped if item.get("reason") == "modified_within_min_age_seconds"
    ],
    "skipped_files": skipped,
    "retained_local_files": retained_local,
    "deleted_local_files": deleted_local,
    "counts": {
        "planned": len(selected),
        "attempted": len(attempted_files),
        "dry_run": len(dry_run_files),
        "uploaded": len(uploaded_files),
        "verified": len(verified_files),
        "skipped": len(skipped),
        "retained_local": len(retained_local),
        "deleted_local": len(deleted_local),
    },
    "warnings": plan["warnings"],
    "known_gaps": [
        "A dry-run does not prove remote write access.",
        "Real upload requires a configured rclone remote outside the repository.",
        "Local files are retained unless --cleanup-after-verify is used after successful verification.",
    ],
}
if operation_status == "BLOCKED_RCLONE_UNAVAILABLE":
    manifest["warnings"].append("rclone binary was not available; copy and verification were not attempted.")
if operation_status == "BLOCKED_DEST_MISSING":
    manifest["warnings"].append("No rclone destination was configured; set --dest or ORDERBOOKS_RCLONE_DEST.")
if mode == "dry-run":
    manifest["warnings"].append("Dry-run mode does not perform a real upload; checkpoint real-upload gate remains blocked.")
manifest_path.parent.mkdir(parents=True, exist_ok=True)
manifest_path.write_text(json.dumps(manifest, indent=2, sort_keys=True) + "\n", encoding="utf-8")
print(
    json.dumps(
        {
            "gate_status": gate_status,
            "operation_status": operation_status,
            "manifest_path": str(manifest_path),
            "planned_files": len(selected),
            "attempted_files": len(attempted_files),
            "uploaded_files": len(uploaded_files),
            "verified_files": len(verified_files),
            "skipped_files": len(plan["skipped_files"]),
        },
        indent=2,
        sort_keys=True,
    )
)
PY
then
  echo "Failed to write upload manifest: ${MANIFEST_PATH}" >&2
  exit 1
fi
# Translate the final operation status into the script's exit code:
# 0 success, 2 destination missing, 3 rclone missing, 1 any other outcome.
if [[ "${OPERATION_STATUS}" == "UPLOAD_VERIFIED" || "${OPERATION_STATUS}" == "DRY_RUN_PASS" ]]; then
  exit 0
fi
if [[ "${OPERATION_STATUS}" == "BLOCKED_DEST_MISSING" ]]; then
  echo "No rclone destination configured. Set --dest or ORDERBOOKS_RCLONE_DEST." >&2
  exit 2
fi
if [[ "${OPERATION_STATUS}" == "BLOCKED_RCLONE_UNAVAILABLE" ]]; then
  echo "rclone is not available. Install rclone before running dry-run or execute mode." >&2
  exit 3
fi
echo "Upload operation failed with status: ${OPERATION_STATUS}" >&2
exit 1

285
scripts/vps_preflight_check.sh Executable file
View file

@ -0,0 +1,285 @@
#!/usr/bin/env bash
# Preflight checks for a VPS checkout of the order-book collector.
# `-e` is not set: every check runs and failures are counted instead.
set -uo pipefail
# Repository checkout to inspect; --app-dir overrides the current directory.
APP_DIR="$(pwd)"
PYTHON_BIN="${ORDERBOOKS_PYTHON:-python3}"
RCLONE_BIN="${ORDERBOOKS_RCLONE_BIN:-rclone}"
# Optional remote (REMOTE:PATH) for the read check; empty skips that check.
RCLONE_REMOTE="${ORDERBOOKS_RCLONE_DEST:-}"
# Target directories are only created/checked when supplied via options.
DATA_DIR=""
MANIFEST_DIR=""
LOG_DIR=""
MIN_FREE_GIB="${ORDERBOOKS_PREFLIGHT_MIN_FREE_GIB:-5}"
REMOTE_TIMEOUT_SECONDS="${ORDERBOOKS_PREFLIGHT_REMOTE_TIMEOUT_SECONDS:-30}"
# Counters incremented by log_fail / log_warn and printed in the SUMMARY line.
FAILURES=0
WARNINGS=0
# Print the command-line help text to stdout.
usage() {
cat <<'EOF'
Usage: scripts/vps_preflight_check.sh [options]
Read-only VPS cutover preflight for the Polymarket order-book collector.
Default behavior checks the repository, local tooling, unit syntax, disk space,
and rclone availability. It does not print rclone config and does not require
secrets.
Options:
--app-dir DIR Repository checkout path. Default: current directory.
--python-bin PATH Python interpreter. Default: ORDERBOOKS_PYTHON or python3.
--rclone-bin PATH rclone binary. Default: ORDERBOOKS_RCLONE_BIN or rclone.
--rclone-remote REMOTE Optional remote/path to check read-only, e.g. gdrive:orderbooks/polymarket.
--data-dir DIR Optional target data directory to create/check writable.
--manifest-dir DIR Optional target manifest directory to create/check writable.
--log-dir DIR Optional target log directory to create/check writable.
--min-free-gib N Minimum free GiB for checked filesystems. Default: 5.
--remote-timeout-seconds N Timeout for rclone remote read check. Default: 30.
--help Show this help.
Directory options intentionally create missing directories before checking
writability. Omit them for a repo-only read-only check.
EOF
}
# Reporting helpers: each prints one status-tagged line to stdout.
log_pass() {
  printf 'PASS %s\n' "$*"
}

log_info() {
  printf 'INFO %s\n' "$*"
}

# Counts the warning before printing it.
log_warn() {
  WARNINGS=$((WARNINGS + 1))
  printf 'WARN %s\n' "$*"
}

# Counts the failure before printing it; the final exit code depends on this counter.
log_fail() {
  FAILURES=$((FAILURES + 1))
  printf 'FAIL %s\n' "$*"
}

# Run a command with stdout and stderr discarded; its exit status is passed through.
run_quiet() {
  "$@" >/dev/null 2>&1
}
# Parse command-line options. Flags override the environment-derived defaults.
while [[ $# -gt 0 ]]; do
  case "$1" in
    --app-dir|--python-bin|--rclone-bin|--rclone-remote|--data-dir|--manifest-dir|--log-dir|--min-free-gib|--remote-timeout-seconds)
      # Under `set -u` a missing value would otherwise abort with an opaque
      # "unbound variable" error; report it like any other usage mistake.
      if [[ $# -lt 2 ]]; then
        log_fail "missing value for argument: $1"
        usage >&2
        exit 2
      fi
      case "$1" in
        --app-dir) APP_DIR="$2" ;;
        --python-bin) PYTHON_BIN="$2" ;;
        --rclone-bin) RCLONE_BIN="$2" ;;
        --rclone-remote) RCLONE_REMOTE="$2" ;;
        --data-dir) DATA_DIR="$2" ;;
        --manifest-dir) MANIFEST_DIR="$2" ;;
        --log-dir) LOG_DIR="$2" ;;
        --min-free-gib) MIN_FREE_GIB="$2" ;;
        --remote-timeout-seconds) REMOTE_TIMEOUT_SECONDS="$2" ;;
      esac
      shift 2
      ;;
    --help) usage; exit 0 ;;
    *) log_fail "unknown argument: $1"; usage >&2; exit 2 ;;
  esac
done
# Normalise APP_DIR and enter it; every later check uses repo-relative paths.
# The filesystem root is left untouched: stripping its slash would produce an
# empty path that the directory test below rejects.
if [[ "${APP_DIR}" != "/" ]]; then
  APP_DIR="${APP_DIR%/}"
fi
if [[ ! -d "${APP_DIR}" ]]; then
  log_fail "app directory does not exist: ${APP_DIR}"
  printf 'SUMMARY failures=%s warnings=%s\n' "${FAILURES}" "${WARNINGS}"
  exit 1
fi
cd "${APP_DIR}" || {
  log_fail "could not cd to app directory: ${APP_DIR}"
  printf 'SUMMARY failures=%s warnings=%s\n' "${FAILURES}" "${WARNINGS}"
  exit 1
}
# Report whether the configured Python interpreter can be found, including its version.
check_python() {
  # Declared local so the function does not overwrite a caller's `version` variable.
  local version
  if command -v "${PYTHON_BIN}" >/dev/null 2>&1; then
    version="$("${PYTHON_BIN}" --version 2>&1 || true)"
    log_pass "python available: ${PYTHON_BIN} (${version})"
  else
    log_fail "python not found: ${PYTHON_BIN}"
  fi
}
# Verify that every file the deployment relies on exists in the checkout.
# Logs one PASS/FAIL line per file; returns 1 if any file is missing, else 0.
check_required_files() {
  local -a required=(
    "scripts/discover_polymarket_btc_markets.py"
    "scripts/collect_polymarket_orderbooks.py"
    "scripts/normalize_polymarket_orderbooks.py"
    "scripts/run_polymarket_collector_cycle.sh"
    "scripts/upload_archive_rclone.sh"
    "scripts/vps_runtime_smoke_check.sh"
    "config/polymarket_collector.vps.example.yaml"
    "docs/VPS_DEPLOYMENT.md"
    "docs/GOOGLE_DRIVE_OFFLOAD.md"
    "systemd/polymarket-orderbook-collector.service"
    "systemd/polymarket-orderbook-uploader.service"
    "systemd/polymarket-orderbook-uploader.timer"
  )
  local file status=0
  for file in "${required[@]}"; do
    if [[ ! -f "${file}" ]]; then
      status=1
      log_fail "required file missing: ${file}"
      continue
    fi
    log_pass "required file exists: ${file}"
  done
  return "${status}"
}
# Compile the three Python entry points in memory (builtin compile(), so no
# .pyc files are produced) and report whether all of them parse.
check_python_compile() {
  command -v "${PYTHON_BIN}" >/dev/null 2>&1 || {
    log_fail "cannot compile Python scripts because Python is missing"
    return
  }
  if ! run_quiet "${PYTHON_BIN}" - <<'PY'
from pathlib import Path

for name in (
    "scripts/discover_polymarket_btc_markets.py",
    "scripts/collect_polymarket_orderbooks.py",
    "scripts/normalize_polymarket_orderbooks.py",
):
    script = Path(name)
    compile(script.read_text(encoding="utf-8"), str(script), "exec")
PY
  then
    log_fail "Python no-bytecode compile check failed"
  else
    log_pass "collector/discovery/normalization Python scripts compile without bytecode writes"
  fi
}
# Syntax-check every scripts/*.sh file with `bash -n` (parse only, nothing is run).
# Returns 1 if any script fails to parse, else 0.
check_shell_syntax() {
  local script status=0
  for script in scripts/*.sh; do
    # An unmatched glob stays literal; skip anything that is not a regular file.
    if [[ ! -f "${script}" ]]; then
      continue
    fi
    if ! bash -n "${script}" >/dev/null 2>&1; then
      status=1
      log_fail "bash syntax failed: ${script}"
    else
      log_pass "bash syntax ok: ${script}"
    fi
  done
  return "${status}"
}
# Parse the shipped systemd units with `systemd-analyze verify` when that tool
# exists; otherwise record a warning and skip the check.
check_systemd_units() {
  local -a units=(
    "systemd/polymarket-orderbook-collector.service"
    "systemd/polymarket-orderbook-uploader.service"
    "systemd/polymarket-orderbook-uploader.timer"
  )
  if ! command -v systemd-analyze >/dev/null 2>&1; then
    log_warn "systemd-analyze unavailable; skipped unit parse check"
    return
  fi
  if ! systemd-analyze verify "${units[@]}" >/dev/null 2>&1; then
    log_fail "systemd-analyze verify failed for one or more units"
    return
  fi
  log_pass "systemd units parse with systemd-analyze"
}
# Print the remote name (text before the first ':', with the ':' appended)
# of an rclone destination, or an empty line when the argument has no ':'.
remote_name_from_dest() {
  local dest="$1"
  if [[ "${dest}" == *:* ]]; then
    printf '%s:\n' "${dest%%:*}"
  else
    printf '\n'
  fi
}
# Run a command under `timeout REMOTE_TIMEOUT_SECONDS` when the timeout
# utility exists; otherwise run it directly without a time limit.
run_with_timeout() {
  if ! command -v timeout >/dev/null 2>&1; then
    "$@"
    return
  fi
  timeout "${REMOTE_TIMEOUT_SECONDS}" "$@"
}
# Check that rclone is installed and, when a remote was supplied, that the
# remote is configured and can be listed. Only PASS/FAIL lines are printed;
# rclone's own output is discarded.
check_rclone() {
  # Declared local so the function does not overwrite a caller's `version` variable.
  local version
  if [[ -x "${RCLONE_BIN}" ]] || command -v "${RCLONE_BIN}" >/dev/null 2>&1; then
    version="$("${RCLONE_BIN}" version 2>/dev/null | head -n 1 || true)"
    log_pass "rclone available: ${RCLONE_BIN} (${version})"
  else
    log_fail "rclone not found: ${RCLONE_BIN}"
    return
  fi
  if [[ -z "${RCLONE_REMOTE}" ]]; then
    log_info "no rclone remote provided; skipped remote access check"
    return
  fi
  local remote_name
  remote_name="$(remote_name_from_dest "${RCLONE_REMOTE}")"
  if [[ -z "${remote_name}" ]]; then
    log_fail "rclone remote must include a remote name ending in ':': ${RCLONE_REMOTE}"
    return
  fi
  # Exact, fixed-string line match against the names printed by `listremotes`.
  if "${RCLONE_BIN}" listremotes 2>/dev/null | grep -Fxq "${remote_name}"; then
    log_pass "rclone remote is configured: ${remote_name}"
  else
    log_fail "rclone remote is not configured or not visible to this user: ${remote_name}"
    return
  fi
  if run_with_timeout "${RCLONE_BIN}" lsf --max-depth 1 "${RCLONE_REMOTE}" >/dev/null 2>&1; then
    log_pass "rclone remote read check succeeded without printing config: ${RCLONE_REMOTE}"
  else
    log_fail "rclone remote read check failed or timed out: ${RCLONE_REMOTE}"
  fi
}
# Create (if needed) and verify a writable target directory.
# Arguments: $1 label used in messages, $2 directory path (empty skips the check).
check_target_dir() {
  local label="$1" path="$2"
  if [[ -z "${path}" ]]; then
    log_info "no ${label} directory provided; skipped create/write check"
    return
  fi
  if ! mkdir -p "${path}" >/dev/null 2>&1 || [[ ! -d "${path}" || ! -w "${path}" ]]; then
    log_fail "${label} directory cannot be created or is not writable: ${path}"
    return
  fi
  log_pass "${label} directory exists and is writable: ${path}"
}
# Check that the filesystem holding a path has at least MIN_FREE_GIB free.
# Arguments: $1 path to inspect, $2 label used in messages.
check_disk_free() {
  local target="$1" label="$2" available_kib min_kib
  if [[ ! -e "${target}" ]]; then
    log_warn "disk target does not exist, skipping ${label}: ${target}"
    return
  fi
  # Shell arithmetic is integer-only; reject values such as "0.5" or "five"
  # up front instead of letting the expansion below raise a syntax error.
  if [[ ! "${MIN_FREE_GIB}" =~ ^[0-9]+$ ]]; then
    log_fail "minimum free GiB must be a non-negative integer: ${MIN_FREE_GIB}"
    return
  fi
  # POSIX output format, 1024-byte blocks; column 4 of the data row is "Available".
  available_kib="$(df -Pk "${target}" | awk 'NR==2 {print $4}')"
  # 10# forces base 10 so a value like "08" is not parsed as invalid octal.
  min_kib=$((10#${MIN_FREE_GIB} * 1024 * 1024))
  if [[ "${available_kib}" =~ ^[0-9]+$ && "${available_kib}" -ge "${min_kib}" ]]; then
    log_pass "disk free ok for ${label}: available_kib=${available_kib} min_gib=${MIN_FREE_GIB}"
  else
    log_fail "disk free below threshold for ${label}: available_kib=${available_kib:-unknown} min_gib=${MIN_FREE_GIB}"
  fi
}
# Scan runtime config, units and scripts for secret-like keywords.
# A match is a failure; so is a scan that could not read every listed file,
# because an incomplete scan cannot prove the absence of secrets.
check_secret_requirements() {
  local files=(
    "config/polymarket_collector.vps.example.yaml"
    "systemd/polymarket-orderbook-collector.service"
    "systemd/polymarket-orderbook-uploader.service"
    "systemd/polymarket-orderbook-uploader.timer"
    "scripts/run_polymarket_collector_cycle.sh"
    "scripts/upload_archive_rclone.sh"
  )
  local pattern='(api[_-]?key|private[_-]?key|mnemonic|wallet|password|client[_-]?secret|access[_-]?token|refresh[_-]?token)'
  local scan_status=0
  # -q matters: without it grep exits 2 when any file is unreadable even if a
  # match was found elsewhere, which used to be reported as a PASS.
  grep -E -i -q -- "${pattern}" "${files[@]}" 2>/dev/null || scan_status=$?
  case "${scan_status}" in
    0)
      log_fail "secret-like credential requirement found in runtime config, units, or scripts"
      ;;
    1)
      log_pass "no API keys, private keys, mnemonics, wallets, or passwords are required by runtime files"
      ;;
    *)
      log_fail "secret scan could not read one or more runtime files; result is inconclusive"
      ;;
  esac
  log_info "rclone credentials, if used, must remain machine-local outside the repository"
}
# Run every check in a fixed order, print the summary line, and exit 0 only
# when no failure was recorded.
check_python
check_required_files
check_python_compile
check_shell_syntax
check_systemd_units
check_rclone

# Target directories are only created/checked when the matching option was given.
check_target_dir "data" "${DATA_DIR}"
check_target_dir "manifest" "${MANIFEST_DIR}"
check_target_dir "log" "${LOG_DIR}"

check_disk_free "." "repository"
if [[ -n "${DATA_DIR}" ]] && [[ -d "${DATA_DIR}" ]]; then
  check_disk_free "${DATA_DIR}" "data directory"
fi

check_secret_requirements

printf 'SUMMARY failures=%s warnings=%s\n' "${FAILURES}" "${WARNINGS}"
if [[ "${FAILURES}" -ne 0 ]]; then
  exit 1
fi
exit 0

View file

@ -0,0 +1,279 @@
#!/usr/bin/env bash
# Runtime smoke check for the collector/uploader systemd services.
# `-e` is not set; the embedded Python program does its own error handling.
set -uo pipefail
APP_DIR="${ORDERBOOKS_APP_DIR:-/opt/orderbooks}"
DATA_DIR="${ORDERBOOKS_DATA_DIR:-/var/lib/orderbooks}"
# Raw output and manifest locations default to sub-directories of DATA_DIR.
RAW_DIR="${ORDERBOOKS_OUTPUT_DIR:-${DATA_DIR}/raw_orderbooks}"
MANIFEST_DIR="${ORDERBOOKS_MANIFEST_DIR:-${DATA_DIR}/manifests}"
# systemd unit names queried and restarted by the check.
COLLECTOR_SERVICE="${ORDERBOOKS_COLLECTOR_SERVICE:-polymarket-orderbook-collector.service}"
UPLOADER_SERVICE="${ORDERBOOKS_UPLOADER_SERVICE:-polymarket-orderbook-uploader.service}"
# Upper bound, in seconds, on waiting for a valid collector manifest.
WAIT_SECONDS="${ORDERBOOKS_SMOKE_WAIT_SECONDS:-900}"
# UTC stamp embedded in the default evidence file name.
RUN_ID="$(date -u +%Y%m%dT%H%M%SZ)"
# NOTE(review): this default is computed from MANIFEST_DIR as it is at this
# point, i.e. before command-line options are parsed.
EVIDENCE_PATH="${ORDERBOOKS_SMOKE_EVIDENCE_PATH:-${MANIFEST_DIR}/vps_runtime_smoke_${RUN_ID}.json}"
PYTHON_BIN="${ORDERBOOKS_PYTHON:-python3}"
# Print the command-line help text to stdout.
usage() {
cat <<'EOF'
Usage: scripts/vps_runtime_smoke_check.sh [options]
Run on the VPS after installing collector/uploader systemd units. The check
records durable JSON evidence, forces one collector service restart, verifies
old raw gzip files still parse and keep their checksum, waits for a later valid
collector cycle, then starts the uploader service and records upload evidence.
Options:
--app-dir DIR App checkout. Default: /opt/orderbooks.
--data-dir DIR Data root. Default: /var/lib/orderbooks.
--raw-dir DIR Raw output dir. Default: DATA_DIR/raw_orderbooks.
--manifest-dir DIR Manifest dir. Default: DATA_DIR/manifests.
--collector-service NAME systemd collector service name.
--uploader-service NAME systemd uploader service name.
--wait-seconds N Max wait for valid cycles. Default: 900.
--evidence-path PATH JSON evidence output path.
--help Show this help.
This script does not delete raw files or manifests. Failures are written to the
evidence JSON and should be preserved for review.
EOF
}
# Parse command-line options. Which paths were given explicitly is tracked so
# that derived defaults are applied after parsing, independent of option order.
data_dir_given=0
raw_dir_given=0
manifest_dir_given=0
evidence_path_given=0
while [[ $# -gt 0 ]]; do
  case "$1" in
    --app-dir|--data-dir|--raw-dir|--manifest-dir|--collector-service|--uploader-service|--wait-seconds|--evidence-path)
      # Under `set -u` a missing value would otherwise abort with an opaque
      # "unbound variable" error; report it like any other usage mistake.
      if [[ $# -lt 2 ]]; then
        echo "Missing value for argument: $1" >&2
        usage >&2
        exit 2
      fi
      case "$1" in
        --app-dir) APP_DIR="$2" ;;
        --data-dir) DATA_DIR="$2"; data_dir_given=1 ;;
        --raw-dir) RAW_DIR="$2"; raw_dir_given=1 ;;
        --manifest-dir) MANIFEST_DIR="$2"; manifest_dir_given=1 ;;
        --collector-service) COLLECTOR_SERVICE="$2" ;;
        --uploader-service) UPLOADER_SERVICE="$2" ;;
        --wait-seconds) WAIT_SECONDS="$2" ;;
        --evidence-path) EVIDENCE_PATH="$2"; evidence_path_given=1 ;;
      esac
      shift 2
      ;;
    --help) usage; exit 0 ;;
    *) echo "Unknown argument: $1" >&2; usage >&2; exit 2 ;;
  esac
done
# --data-dir re-derives the raw/manifest locations (environment overrides
# still win), but never clobbers an explicit --raw-dir / --manifest-dir,
# whichever order the options were given in.
if [[ "${data_dir_given}" -eq 1 ]]; then
  if [[ "${raw_dir_given}" -eq 0 ]]; then
    RAW_DIR="${ORDERBOOKS_OUTPUT_DIR:-${DATA_DIR}/raw_orderbooks}"
  fi
  if [[ "${manifest_dir_given}" -eq 0 ]]; then
    MANIFEST_DIR="${ORDERBOOKS_MANIFEST_DIR:-${DATA_DIR}/manifests}"
  fi
fi
# The default evidence file lives in the final manifest directory, not in the
# directory that was the default before the options were parsed.
if [[ "${evidence_path_given}" -eq 0 && -z "${ORDERBOOKS_SMOKE_EVIDENCE_PATH:-}" ]]; then
  EVIDENCE_PATH="${MANIFEST_DIR}/vps_runtime_smoke_${RUN_ID}.json"
fi
mkdir -p "$(dirname "${EVIDENCE_PATH}")"
PYTHONDONTWRITEBYTECODE=1 "${PYTHON_BIN}" - "$APP_DIR" "$DATA_DIR" "$RAW_DIR" "$MANIFEST_DIR" "$COLLECTOR_SERVICE" "$UPLOADER_SERVICE" "$WAIT_SECONDS" "$EVIDENCE_PATH" <<'PY_SMOKE'
import datetime as dt
import gzip
import hashlib
import json
import subprocess
import sys
import time
from pathlib import Path
app_dir = Path(sys.argv[1])
data_dir = Path(sys.argv[2])
raw_dir = Path(sys.argv[3])
manifest_dir = Path(sys.argv[4])
collector_service = sys.argv[5]
uploader_service = sys.argv[6]
wait_seconds = int(sys.argv[7])
evidence_path = Path(sys.argv[8])
started = dt.datetime.now(dt.UTC).replace(microsecond=0)
checks = []
failures = []
def iso_now():
return dt.datetime.now(dt.UTC).replace(microsecond=0).isoformat().replace('+00:00', 'Z')
def run(command):
    """Execute a command, append a record of it to `checks`, and return that record.

    The record holds the command, its exit code, the last 4000 characters of
    stdout and of stderr, and the UTC time at which it was recorded.
    """
    completed = subprocess.run(command, text=True, capture_output=True)
    record = {
        'command': command,
        'exit_code': completed.returncode,
        'stdout_tail': completed.stdout[-4000:],
        'stderr_tail': completed.stderr[-4000:],
        'ran_at_utc': iso_now(),
    }
    checks.append(record)
    return record
def sha256(path):
    """Return the hex SHA-256 digest of the file at `path`, read in 1 MiB chunks."""
    hasher = hashlib.sha256()
    with path.open('rb') as handle:
        while True:
            block = handle.read(1024 * 1024)
            if block == b'':
                break
            hasher.update(block)
    return hasher.hexdigest()
def parse_raw(path):
    """Parse a gzip-compressed JSON-lines file.

    Blank lines are ignored. Returns (row_count, sorted_keys_of_first_row);
    the key list is empty when the file has no rows.
    """
    first_keys = []
    rows = 0
    with gzip.open(path, 'rt', encoding='utf-8') as handle:
        non_blank = (line for line in handle if line.strip())
        for line in non_blank:
            record = json.loads(line)
            if rows == 0:
                first_keys = sorted(record.keys())
            rows += 1
    return rows, first_keys
def collector_manifests():
    """Return collector manifest paths in `manifest_dir`, oldest first by mtime.

    Returns an empty list when the directory does not exist.
    """
    if not manifest_dir.exists():
        return []
    found = list(manifest_dir.glob('polymarket_orderbook_collector_*.json'))
    found.sort(key=lambda candidate: candidate.stat().st_mtime)
    return found
def validate_collector(path):
    """Load one collector manifest and re-verify every raw file it lists.

    Each listed file is parsed, hashed and compared against the manifest's
    row count and checksum. Returns a dict with the manifest path, the parsed
    manifest, the per-file results, and an overall `valid` flag.
    """
    manifest = json.loads(path.read_text(encoding='utf-8'))

    def inspect(entry):
        # Per-file evidence: parse, hash, then compare with the manifest's claims.
        raw_path = Path(entry['path'])
        rows, first_keys = parse_raw(raw_path)
        actual_sha = sha256(raw_path)
        return {
            'path': str(raw_path),
            'bytes': raw_path.stat().st_size,
            'manifest_rows': entry.get('rows'),
            'rows_parsed': rows,
            'row_count_matches_manifest': rows == entry.get('rows'),
            'manifest_sha256': entry.get('sha256'),
            'actual_sha256': actual_sha,
            'sha256_matches_manifest': actual_sha == entry.get('sha256'),
            'first_row_keys': first_keys,
            'under_raw_dir': raw_path.resolve().is_relative_to(raw_dir.resolve()),
            'uses_live_sample_path': 'live_sample' in raw_path.parts,
        }

    def file_ok(info):
        return (
            info['rows_parsed'] > 0
            and info['row_count_matches_manifest']
            and info['sha256_matches_manifest']
            and info['under_raw_dir']
            and not info['uses_live_sample_path']
        )

    output_files = [inspect(entry) for entry in manifest.get('output_files', [])]
    # Valid only if the manifest reports a clean PASS with rows written AND
    # every listed file independently checks out.
    valid = (
        manifest.get('gate_status') == 'PASS'
        and manifest.get('rows_written', 0) > 0
        and manifest.get('failure_count') == 0
        and not manifest.get('failures')
        and bool(output_files)
        and all(file_ok(info) for info in output_files)
    )
    return {
        'path': str(path),
        'manifest': manifest,
        'output_files': output_files,
        'valid': valid,
    }
def latest_valid_after(after_mtime=0):
    """Poll for the newest valid collector manifest modified after `after_mtime`.

    Candidates are tried newest first, with a 10-second pause between polling
    rounds, until `wait_seconds` have elapsed. Raises TimeoutError carrying the
    last problem seen (or a generic message) when nothing valid appears.
    """
    deadline = time.time() + wait_seconds
    last_error = None
    while time.time() <= deadline:
        for candidate in collector_manifests()[::-1]:
            if candidate.stat().st_mtime > after_mtime:
                try:
                    outcome = validate_collector(candidate)
                except Exception as exc:
                    # Unreadable or incomplete candidate: remember why, try the next one.
                    last_error = str(exc)
                else:
                    if outcome['valid']:
                        return outcome
                    last_error = f"latest candidate invalid: {candidate}"
        time.sleep(10)
    raise TimeoutError(last_error or f'no valid collector manifest after mtime {after_mtime}')
def latest_upload_after(after_mtime=0):
    """Load the newest upload_archive_*.json manifest at or after after_mtime.

    Returns a dict with the manifest path, the parsed manifest, the verified
    file count and a boolean 'valid'. The count comes from counts.verified
    when present, otherwise from the length of verified_files.

    Raises:
        FileNotFoundError: no upload manifest has an mtime >= after_mtime.
    """
    def mtime_of(entry):
        return entry.stat().st_mtime

    ordered = sorted(manifest_dir.glob('upload_archive_*.json'), key=mtime_of)
    recent = [entry for entry in ordered if mtime_of(entry) >= after_mtime]
    if not recent:
        raise FileNotFoundError('no upload_archive_*.json manifest found after uploader run')

    newest = recent[-1]
    manifest = json.loads(newest.read_text(encoding='utf-8'))

    counts = manifest.get('counts', {})
    verified_count = counts.get('verified', len(manifest.get('verified_files', [])))

    rclone_info = manifest.get('rclone', {})
    # Valid means: verified upload, passing gate, both rclone steps exited 0
    # and at least one file was verified.
    is_valid = (
        manifest.get('operation_status') == 'UPLOAD_VERIFIED'
        and manifest.get('gate_status') == 'PASS'
        and rclone_info.get('copy_exit_code') == 0
        and rclone_info.get('check_exit_code') == 0
        and verified_count > 0
    )
    return {
        'path': str(newest),
        'manifest': manifest,
        'verified_count': verified_count,
        'valid': is_valid,
    }
# Evidence record for this smoke run. It starts out as a not-ready ERROR
# result; 'failures' is stored by reference, so entries appended to that
# list later are part of the record when it is serialised.
summary = dict(
    schema_name='vps_runtime_smoke_result',
    schema_version=1,
    started_at_utc=started.isoformat().replace('+00:00', 'Z'),
    ended_at_utc=None,
    gate_status='ERROR',
    production_ready=False,
    app_dir=str(app_dir),
    data_dir=str(data_dir),
    raw_dir=str(raw_dir),
    manifest_dir=str(manifest_dir),
    collector_service=collector_service,
    uploader_service=uploader_service,
    wait_seconds=wait_seconds,
    checks=checks,
    failures=failures,
)
# Smoke sequence: confirm the collector is active, restart it, wait for a
# fresh valid manifest, check that the pre-restart raw file is untouched,
# then start the uploader and require a verified upload manifest.
# The result of run(...) is read as a dict with an 'exit_code' entry.
try:
# Step 1: the collector must already be active.
active = run(['systemctl', 'is-active', collector_service])
if active['exit_code'] != 0:
failures.append('collector service is not active under systemd')
raise RuntimeError('collector service not active')
# Step 2: baseline - the newest valid manifest of any age, plus the hash and
# location of its first raw output file.
before = latest_valid_after(0)
before_mtime = Path(before['path']).stat().st_mtime
old_raw = before['output_files'][0]
old_raw_sha = old_raw['actual_sha256']
old_raw_path = Path(old_raw['path'])
# Step 3: restart the collector and make sure it reports active again.
restart = run(['systemctl', 'restart', collector_service])
if restart['exit_code'] != 0:
failures.append('collector service restart command failed')
raise RuntimeError('restart failed')
active_after = run(['systemctl', 'is-active', collector_service])
if active_after['exit_code'] != 0:
failures.append('collector service is not active after restart')
raise RuntimeError('collector inactive after restart')
# Step 4: a valid manifest newer than the baseline must appear; a timeout
# here is handled by the outer except.
after = latest_valid_after(before_mtime)
# Step 5: the baseline raw file must still hash and parse exactly as before.
old_rows_after, _ = parse_raw(old_raw_path)
old_file_unchanged = sha256(old_raw_path) == old_raw_sha and old_rows_after == old_raw['rows_parsed']
if not old_file_unchanged:
failures.append('raw file from before restart changed or stopped parsing')
# Step 6: trigger the uploader. A failed start is recorded, but the upload
# manifest is still looked up afterwards.
# NOTE(review): this relies on 'systemctl start' returning only after the
# upload run has finished (true for Type=oneshot units) - confirm.
upload_start_mtime = time.time()
upload_run = run(['systemctl', 'start', uploader_service])
if upload_run['exit_code'] != 0:
failures.append('uploader service start failed')
try:
# Accept manifests up to 2 seconds older than the recorded start time.
# NOTE(review): presumably slack for timestamp granularity - confirm.
upload = latest_upload_after(upload_start_mtime - 2)
if not upload.get('valid'):
failures.append('uploader did not produce a verified upload manifest with at least one verified file')
except Exception as exc:
# A missing or unreadable upload manifest becomes a failure entry instead
# of aborting the rest of the run.
upload = {'path': None, 'valid': False, 'error': str(exc)}
failures.append(str(exc))
# Step 7: read the last 80 journal lines of both services; only the exit
# codes of these calls are stored in the summary below.
collector_logs = run(['journalctl', '-u', collector_service, '-n', '80', '--no-pager'])
uploader_logs = run(['journalctl', '-u', uploader_service, '-n', '80', '--no-pager'])
summary.update({
'before_restart_collector': before,
'after_restart_collector': after,
'old_raw_file_unchanged_after_restart': old_file_unchanged,
'upload_result': upload,
'collector_log_check_exit_code': collector_logs['exit_code'],
'uploader_log_check_exit_code': uploader_logs['exit_code'],
})
# PASS needs every individual check to hold and no recorded failure.
if after['valid'] and old_file_unchanged and upload.get('valid') and not failures:
summary['gate_status'] = 'PASS'
else:
summary['gate_status'] = 'FAIL'
except Exception as exc:
# gate_status is not assigned on this path, so the summary keeps the value
# it had before the try block.
failures.append(str(exc))
summary['exception'] = repr(exc)
finally:
# Always persist the evidence file, whatever happened above.
summary['ended_at_utc'] = iso_now()
evidence_path.parent.mkdir(parents=True, exist_ok=True)
evidence_path.write_text(json.dumps(summary, indent=2, sort_keys=True) + '\n', encoding='utf-8')
# Machine-readable result lines, then a non-zero exit unless the gate passed.
print(f"SMOKE_EVIDENCE={evidence_path}")
print(f"SMOKE_GATE={summary['gate_status']}")
if summary['gate_status'] != 'PASS':
sys.exit(1)
PY_SMOKE

View file

@ -0,0 +1,38 @@
# Long-running unit for the Polymarket raw order-book collector cycle script.
# It runs as the orderbooks user and is restarted after every exit.
[Unit]
Description=Polymarket raw order-book collector cycle
Documentation=file:/opt/orderbooks/docs/VPS_DEPLOYMENT.md
# Start after, and pull in, network-online.target.
After=network-online.target
Wants=network-online.target
# Start rate limit: at most 20 starts within 10 minutes.
# NOTE(review): with RestartSec=30s only about 20 starts fit into 10 minutes,
# so this limit looks unlikely to ever trip - confirm that is intended.
StartLimitIntervalSec=10min
StartLimitBurst=20
[Service]
Type=simple
User=orderbooks
Group=orderbooks
WorkingDirectory=/opt/orderbooks
# Built-in defaults passed to the cycle script through the environment.
Environment=PYTHONUNBUFFERED=1
Environment=ORDERBOOKS_APP_DIR=/opt/orderbooks
Environment=ORDERBOOKS_DATA_DIR=/var/lib/orderbooks
Environment=ORDERBOOKS_OUTPUT_DIR=/var/lib/orderbooks/raw_orderbooks
Environment=ORDERBOOKS_PYTHON=/opt/orderbooks/.venv/bin/python
Environment=ORDERBOOKS_COLLECTOR_CONFIG=/etc/orderbooks/polymarket_collector.vps.yaml
# Optional host-specific settings; the leading "-" makes a missing file
# non-fatal. Values from this file override the Environment= lines above.
EnvironmentFile=-/etc/orderbooks/polymarket-orderbook-collector.env
ExecStart=/bin/bash /opt/orderbooks/scripts/run_polymarket_collector_cycle.sh
# Restart 30 seconds after any exit, whatever the exit status.
Restart=always
RestartSec=30s
# On stop, SIGTERM goes to every process in the unit's control group, and
# systemd waits up to 90 seconds before escalating.
TimeoutStopSec=90s
KillSignal=SIGTERM
KillMode=control-group
# stdout and stderr go to the journal under the identifier below.
StandardOutput=journal
StandardError=journal
SyslogIdentifier=polymarket-orderbook-collector
# Sandboxing: no privilege escalation, private /tmp, read-only file system
# view and hidden home directories. /var/lib/orderbooks is the only path
# explicitly opened for writing; StateDirectory= names that same directory
# under /var/lib.
NoNewPrivileges=true
PrivateTmp=true
ProtectSystem=strict
ProtectHome=true
ReadWritePaths=/var/lib/orderbooks
StateDirectory=orderbooks
[Install]
WantedBy=multi-user.target

View file

@ -0,0 +1,29 @@
# One-shot unit that runs the rclone archive upload script with --execute.
# It has no [Install] section, so it has to be started by something else.
# NOTE(review): presumably that is the hourly uploader timer - confirm.
[Unit]
Description=Orderbooks archive upload via rclone
Documentation=file:/opt/orderbooks/docs/GOOGLE_DRIVE_OFFLOAD.md
# Start after, and pull in, network-online.target.
After=network-online.target
Wants=network-online.target
[Service]
# oneshot: the unit counts as started only once the script has exited.
# NOTE(review): no TimeoutStartSec= is set, so a hung upload may not be
# stopped by systemd - confirm whether a limit is wanted.
Type=oneshot
User=orderbooks
Group=orderbooks
WorkingDirectory=/opt/orderbooks
# Built-in defaults passed to the upload script through the environment.
# NOTE(review): MIN_AGE_SECONDS and RETENTION_DAYS are interpreted by the
# script, which is not visible here - confirm their exact meaning there.
Environment=ORDERBOOKS_UPLOAD_DATA_DIR=/var/lib/orderbooks
Environment=ORDERBOOKS_UPLOAD_MANIFEST_DIR=/var/lib/orderbooks/manifests
Environment=ORDERBOOKS_UPLOAD_RAW_DIR=/var/lib/orderbooks/raw_orderbooks
Environment=ORDERBOOKS_UPLOAD_MIN_AGE_SECONDS=600
Environment=ORDERBOOKS_UPLOAD_RETENTION_DAYS=7
Environment=ORDERBOOKS_RCLONE_BIN=/usr/bin/rclone
# Optional host-specific settings; the leading "-" makes a missing file
# non-fatal. Values from this file override the Environment= lines above.
EnvironmentFile=-/etc/orderbooks/orderbook-uploader.env
ExecStart=/bin/bash /opt/orderbooks/scripts/upload_archive_rclone.sh --execute
# stdout and stderr go to the journal under the identifier below.
StandardOutput=journal
StandardError=journal
SyslogIdentifier=polymarket-orderbook-uploader
# Sandboxing: no privilege escalation, private /tmp, read-only file system
# view and hidden home directories. /var/lib/orderbooks is the only path
# explicitly opened for writing.
# NOTE(review): with ProtectHome=true an rclone config under the user's home
# would not be readable - confirm the env file points rclone elsewhere.
NoNewPrivileges=true
PrivateTmp=true
ProtectSystem=strict
ProtectHome=true
ReadWritePaths=/var/lib/orderbooks
StateDirectory=orderbooks

View file

@ -0,0 +1,12 @@
# Timer that starts polymarket-orderbook-uploader.service once per hour.
[Unit]
Description=Run orderbooks archive upload periodically
Documentation=file:/opt/orderbooks/docs/GOOGLE_DRIVE_OFFLOAD.md
[Timer]
# Fire at the top of every hour, delayed by a random 0-10 minutes.
OnCalendar=hourly
RandomizedDelaySec=10min
# Persistent: a run missed while the timer was inactive (for example while
# the machine was off) is triggered when the timer is next activated.
Persistent=true
Unit=polymarket-orderbook-uploader.service
[Install]
WantedBy=timers.target