Prepare Kubernetes orderbooks deployment
Some checks failed
deploy / deploy (push) Has been cancelled
Some checks failed
deploy / deploy (push) Has been cancelled
This commit is contained in:
commit
284e465588
42 changed files with 8640 additions and 0 deletions
26
.dockerignore
Normal file
26
.dockerignore
Normal file
|
|
@ -0,0 +1,26 @@
|
|||
# Build-context exclusions for the orderbooks image.
#
# Docker matches every pattern against paths relative to the context root, so
# a bare `*.pem` only excludes root-level files. The `**/` prefix matches zero
# or more directories and is therefore used for everything that must also be
# excluded when nested under config/, docs/ or scripts/, which the Dockerfile
# copies as whole directories.

# VCS metadata and local environments
.git/
.venv/

# Python/cache noise (bytecode and __pycache__ can appear in any package dir)
**/__pycache__/
**/*.pyc
**/*.pyo
.pytest_cache/
.mypy_cache/
.ruff_cache/

# Local runtime data and evidence never belong in the image
artifacts/
data/
reports/
orchestration/

# Environment files
**/.env
**/*.env

# rclone / key / secret material, at any depth
**/rclone.conf
**/*.pem
**/*.key
**/*.p12
**/*.pfx
**/id_rsa*
**/id_ed25519*
**/*mnemonic*
**/*wallet*
**/*credential*
**/*secret*
|
||||
162
.forgejo/workflows/deploy.yml
Normal file
162
.forgejo/workflows/deploy.yml
Normal file
|
|
@ -0,0 +1,162 @@
|
|||
# Deploy workflow: builds the image in-cluster with kaniko, applies the
# kustomize base, then waits for every deployment listed in
# PROJECT_DEPLOYMENTS to roll out.
name: deploy

on:
  push:
    branches:
      - main
  workflow_dispatch:

jobs:
  deploy:
    runs-on: linux-amd64
    env:
      IMAGE_TAG: ${{ github.sha }}
      REGISTRY_HOST: ${{ vars.REGISTRY_HOST }}
      PROJECT_NAME: ${{ vars.PROJECT_NAME || 'orderbooks' }}
      PROJECT_NAMESPACE: ${{ vars.PROJECT_NAMESPACE || 'orderbooks' }}
      # Comma-separated list of deployment names to update and wait on.
      PROJECT_DEPLOYMENTS: ${{ vars.PROJECT_DEPLOYMENTS || 'orderbooks-collector' }}
      PROJECT_REGISTRY_SECRET_NAME: ${{ vars.PROJECT_REGISTRY_SECRET_NAME || 'orderbooks-registry-creds' }}
      REPO_CLONE_URL: ${{ github.server_url }}/${{ github.repository }}.git
    steps:
      - name: Install tooling
        run: |
          if command -v git >/dev/null 2>&1 && command -v kubectl >/dev/null 2>&1 && command -v python3 >/dev/null 2>&1; then
            exit 0
          fi

          if command -v apk >/dev/null 2>&1; then
            apk add --no-cache git kubectl python3
            exit 0
          fi

          if command -v apt-get >/dev/null 2>&1; then
            apt-get update
            apt-get install -y git curl ca-certificates python3
            # Resolve the version in its own statement so a failed lookup
            # aborts the step instead of producing a malformed download URL.
            kubectl_version="$(curl -fsSL https://dl.k8s.io/release/stable.txt)"
            kubectl_base_url="https://dl.k8s.io/release/${kubectl_version}/bin/linux/amd64"
            curl -fsSLo /usr/local/bin/kubectl "${kubectl_base_url}/kubectl"
            curl -fsSLo /tmp/kubectl.sha256 "${kubectl_base_url}/kubectl.sha256"
            # Verify the binary against the published checksum before trusting
            # it with cluster credentials.
            echo "$(cat /tmp/kubectl.sha256)  /usr/local/bin/kubectl" | sha256sum --check
            chmod +x /usr/local/bin/kubectl
            exit 0
          fi

          echo "missing git/kubectl/python3 and no supported package manager found" >&2
          exit 1

      - name: Prepare workspace
        run: |
          workspace_root="${RUNNER_TEMP:-/tmp}"
          workspace_dir="$(mktemp -d "${workspace_root%/}/orderbooks-deploy-XXXXXX")"
          echo "WORKSPACE_DIR=$workspace_dir" >> "$GITHUB_ENV"
          echo "runner workspace: $workspace_dir"

      - name: Load kubeconfig
        env:
          # Handed over through the environment so the secret is never spliced
          # into the script text that the runner writes to disk.
          KUBECONFIG_B64: ${{ secrets.KUBECONFIG_B64 }}
        run: |
          if [ -z "${KUBECONFIG_B64:-}" ]; then
            echo "KUBECONFIG_B64 secret is required" >&2
            exit 1
          fi
          # Cluster credentials: keep the directory and file owner-only.
          umask 077
          mkdir -p "$HOME/.kube"
          printf '%s' "$KUBECONFIG_B64" | base64 -d > "$HOME/.kube/config"
          # Connectivity/credential smoke test.
          kubectl get ns

      - name: Checkout repo
        env:
          REPO_TOKEN: ${{ github.token }}
        run: |
          git -c credential.username=oauth2 -c http.extraHeader="Authorization: Bearer ${REPO_TOKEN}" clone --depth=1 "${REPO_CLONE_URL}" "$WORKSPACE_DIR"
          cd "$WORKSPACE_DIR"
          current_sha="$(git rev-parse HEAD)"
          # The shallow clone lands on the branch tip; pin to the commit that
          # triggered this run if the branch has moved since.
          if [ "$current_sha" != "$GITHUB_SHA" ]; then
            git -c credential.username=oauth2 -c http.extraHeader="Authorization: Bearer ${REPO_TOKEN}" fetch --depth=1 origin "${GITHUB_SHA}"
            git checkout --detach "${GITHUB_SHA}"
          else
            git checkout --detach "$current_sha"
          fi
          git rev-parse HEAD

      - name: Resolve deployment settings
        run: |
          if [ -z "${REGISTRY_HOST:-}" ]; then
            echo "REGISTRY_HOST repo variable is required" >&2
            exit 1
          fi
          IMAGE="$REGISTRY_HOST/$PROJECT_NAME:$IMAGE_TAG"
          BUILD_JOB="image-build-$(printf '%s' "$GITHUB_SHA" | cut -c1-12)"
          {
            echo "IMAGE=$IMAGE"
            echo "BUILD_JOB=$BUILD_JOB"
          } >> "$GITHUB_ENV"

      - name: Ensure namespace exists
        run: |
          kubectl apply -f "$WORKSPACE_DIR/deploy/k8s/base/namespace.yaml"

      - name: Build and push image in-cluster
        env:
          REPO_TOKEN: ${{ github.token }}
        run: |
          set -eu
          token_secret="${BUILD_JOB}-token"

          # The clone token lives in a short-lived Secret instead of the Job
          # spec, so it is not readable via `kubectl get job -o yaml`.
          # NOTE(review): requires create/delete on secrets in the namespace
          # for the deploy identity - confirm RBAC.
          cleanup() {
            kubectl -n "$PROJECT_NAMESPACE" delete secret "$token_secret" --ignore-not-found=true >/dev/null 2>&1 || true
          }
          trap cleanup EXIT

          kubectl -n "$PROJECT_NAMESPACE" delete job "$BUILD_JOB" --ignore-not-found=true

          printf '%s' "$REPO_TOKEN" \
            | kubectl -n "$PROJECT_NAMESPACE" create secret generic "$token_secret" \
                --from-file=token=/dev/stdin --dry-run=client -o yaml \
            | kubectl apply -f -

          # Unquoted heredoc: plain ${VAR} is expanded here on the runner,
          # while \${VAR} is left for the shell inside the checkout container.
          cat <<EOF | kubectl apply -f -
          apiVersion: batch/v1
          kind: Job
          metadata:
            name: ${BUILD_JOB}
            namespace: ${PROJECT_NAMESPACE}
          spec:
            backoffLimit: 0
            ttlSecondsAfterFinished: 3600
            template:
              spec:
                restartPolicy: Never
                volumes:
                  - name: workspace
                    emptyDir: {}
                  - name: registry-creds
                    secret:
                      secretName: ${PROJECT_REGISTRY_SECRET_NAME}
                      items:
                        - key: .dockerconfigjson
                          path: config.json
                initContainers:
                  - name: checkout
                    image: alpine/git:2.47.2
                    env:
                      - name: REPO_TOKEN
                        valueFrom:
                          secretKeyRef:
                            name: ${token_secret}
                            key: token
                      - name: REPO_CLONE_URL
                        value: "${REPO_CLONE_URL}"
                      # Quoted: a SHA made only of digits (or digits plus one
                      # "e") would otherwise parse as a YAML number.
                      - name: GITHUB_SHA
                        value: "${GITHUB_SHA}"
                    command: ["/bin/sh", "-lc"]
                    args:
                      - >-
                        git -c credential.username=oauth2 -c http.extraHeader="Authorization: Bearer \${REPO_TOKEN}" clone --depth=1 "\${REPO_CLONE_URL}" /workspace &&
                        cd /workspace &&
                        git -c credential.username=oauth2 -c http.extraHeader="Authorization: Bearer \${REPO_TOKEN}" fetch --depth=1 origin "\${GITHUB_SHA}" &&
                        git checkout --detach "\${GITHUB_SHA}"
                    volumeMounts:
                      - name: workspace
                        mountPath: /workspace
                containers:
                  - name: kaniko
                    image: gcr.io/kaniko-project/executor:v1.23.2-debug
                    args:
                      - --context=/workspace
                      - --dockerfile=/workspace/Dockerfile
                      - --destination=${IMAGE}
                      - --cache=false
                    volumeMounts:
                      - name: workspace
                        mountPath: /workspace
                      - name: registry-creds
                        mountPath: /kaniko/.docker
          EOF

          # Poll for either terminal condition. Waiting only for Complete would
          # sit out the whole timeout on a failed build and never show logs.
          deadline=$(( $(date +%s) + 1200 ))
          job_state=""
          while [ "$(date +%s)" -lt "$deadline" ]; do
            job_state="$(kubectl -n "$PROJECT_NAMESPACE" get job "$BUILD_JOB" \
              -o jsonpath='{.status.conditions[?(@.status=="True")].type}' 2>/dev/null || true)"
            case "$job_state" in
              *Complete*|*Failed*) break ;;
            esac
            sleep 5
          done

          # Always surface build output, including the checkout init container.
          kubectl -n "$PROJECT_NAMESPACE" logs "job/$BUILD_JOB" --all-containers=true || true

          case "$job_state" in
            *Complete*) ;;
            *)
              echo "image build job $BUILD_JOB did not complete (conditions: ${job_state:-none})" >&2
              exit 1
              ;;
          esac

      - name: Apply release manifests and wait for rollout
        run: |
          # The base manifests carry a fixed bootstrap image reference; swap
          # it for the image built in this run before applying.
          kubectl kustomize "$WORKSPACE_DIR/deploy/k8s/base" \
            | IMAGE="$IMAGE" python3 -c 'import os, sys; sys.stdout.write(sys.stdin.read().replace("registry.doran.133011.xyz/orderbooks:bootstrap", os.environ["IMAGE"]))' \
            | kubectl apply -f -

          # '%s\n' matters: without the trailing newline `read` reports EOF on
          # the last (or only) name and the loop body is skipped for it.
          printf '%s\n' "$PROJECT_DEPLOYMENTS" | tr ',' '\n' \
            | while IFS= read -r deployment; do
                [ -n "$deployment" ] || continue
                kubectl -n "$PROJECT_NAMESPACE" set image "deployment/$deployment" "*=$IMAGE"
                kubectl -n "$PROJECT_NAMESPACE" rollout status "deployment/$deployment" --timeout=300s
              done
|
||||
43
.gitignore
vendored
Normal file
43
.gitignore
vendored
Normal file
|
|
@ -0,0 +1,43 @@
|
|||
# --- Runtime output and evidence: kept on the machine, never committed ---
data/
artifacts/
reports/
orchestration/

# --- Python bytecode, tool caches, packaging output ---
__pycache__/
*.py[cod]
.pytest_cache/
.mypy_cache/
.ruff_cache/
*.egg-info/
build/
dist/

# --- Virtualenvs and machine-local environment files ---
.venv/
.env
*.env
# The ignore files themselves must always stay tracked.
!.dockerignore
!.gitignore

# --- Cluster, rclone, key and wallet material ---
kubeconfig*
*.kubeconfig
rclone.conf
**/rclone.conf
*.pem
*.key
*.p12
*.pfx
id_rsa*
id_ed25519*
*mnemonic*
*wallet*
*credential*
*secret*

# --- Editor and OS droppings ---
.DS_Store
.idea/
.vscode/
|
||||
91
AGENTS.md
Normal file
91
AGENTS.md
Normal file
|
|
@ -0,0 +1,91 @@
|
|||
# Agent Instructions
|
||||
|
||||
Project: Cross-Market Live Orderbook Archive
|
||||
|
||||
This repository exists to preserve live market microstructure data that is usually lost: order books, spreads, liquidity, depth, timestamps, request metadata, and enough raw context to later decide whether a trading idea was observable, fillable, and reproducible at the time.
|
||||
|
||||
The first market is Polymarket. Future markets may include NEAR-related venues and other prediction or crypto markets, but do not build generic multi-market infrastructure before the second market exists.
|
||||
|
||||
## Active Collaboration Model
|
||||
|
||||
This project uses a two-role workflow:
|
||||
|
||||
- `orchestrator`: coordinates checkpoints with the user, keeps scope narrow, records decisions, reviews evidence, states gates, and decides the next smallest step.
|
||||
- `builder`: works in a separate session to implement the active checkpoint artifacts, run commands, collect evidence, and write manifests/reports.
|
||||
|
||||
The current primary chat session is the `orchestrator`. The orchestrator should not silently become the builder unless the user explicitly asks. The builder should treat `AGENTS.md`, `ROADMAP.md`, `docs/METHODOLOGY.md`, and the active checkpoint report as the durable source of instructions.
|
||||
|
||||
Hand-offs between orchestrator and builder must be written to disk under `orchestration/` or `reports/checkpoints/` when they contain decisions, scope changes, endpoint findings, or validation results. Chat-only instructions are not enough for project-critical state.
|
||||
|
||||
## Non-Negotiable Rules
|
||||
|
||||
1. Preserve raw data first. Raw API and websocket payloads are the source of truth. Derived datasets are secondary and must reference raw files.
|
||||
2. No trading. Do not add order placement, signing, private-key handling, wallet logic, strategy execution, or bot behavior.
|
||||
3. No secrets in the repo. Never commit API keys, rclone credentials, wallet material, cookies, or private endpoints.
|
||||
4. Every checkpoint needs durable evidence on disk: code or docs, config or run instructions, manifest/report, and validation evidence.
|
||||
5. Do not claim success without commands, outputs, files, checksums, or real collected data to support the claim.
|
||||
6. Do not delete mistakes. If an artifact is wrong, misleading, partial, or deprecated, preserve it and label it with a reason and replacement.
|
||||
7. Keep the scope narrow. No dashboard, database, ML, strategy, backtest, or generic framework until the roadmap gate allows it.
|
||||
8. Public data only unless a later checkpoint explicitly documents why authenticated public-data access is required.
|
||||
9. "Production-ready" is forbidden until the collector has completed a documented 24h soak test with acceptable quality.
|
||||
|
||||
## Expected Workflow
|
||||
|
||||
For each checkpoint:
|
||||
|
||||
1. Define the smallest useful checkpoint.
|
||||
2. Build only what is needed for that checkpoint.
|
||||
3. Validate with real commands and, when applicable, real public data.
|
||||
4. Write a machine-readable manifest and a short markdown note.
|
||||
5. State PASS, FAIL, or BLOCKED.
|
||||
6. Identify the strongest fake-progress risk.
|
||||
7. Recommend the next smallest step.
|
||||
8. Stop only when a real user or orchestrator decision is needed.
|
||||
|
||||
## Repository Conventions
|
||||
|
||||
- `scripts/`: executable probes, discovery scripts, collectors, normalizers, and upload helpers.
|
||||
- `config/`: example configuration only. Real secrets and machine-local config stay outside git.
|
||||
- `docs/`: durable methodology, data contracts, operational runbooks, and endpoint notes.
|
||||
- `orchestration/prompts/`: prompts and templates used by future agents.
|
||||
- `data/probes/`: bounded endpoint probe outputs and probe notes.
|
||||
- `data/discovery/`: market discovery outputs and manifests.
|
||||
- `data/live_sample/`: short sample collector runs.
|
||||
- `data/normalized_sample/`: derived sample outputs generated from raw samples.
|
||||
- `data/manifests/`: machine-readable manifests for probes, collectors, normalization, uploads, and checkpoints.
|
||||
- `reports/`: human-readable checkpoint, soak test, and incident reports.
|
||||
- `systemd/`: VPS runtime units when added.
|
||||
|
||||
The initial Polymarket implementation should remain simple scripts until the collector works. Introduce `collectors/<market_name>/` only when adding a second market or when duplication proves painful.
|
||||
|
||||
## Artifact Status Labels
|
||||
|
||||
Every durable artifact should be treated as one of:
|
||||
|
||||
- `valid`: current and usable.
|
||||
- `partial`: useful but incomplete.
|
||||
- `deprecated`: superseded by a newer artifact.
|
||||
- `invalid`: known to be wrong or misleading.
|
||||
|
||||
When marking an artifact `deprecated` or `invalid`, write a sibling markdown note or manifest entry with:
|
||||
|
||||
- original artifact path
|
||||
- status
|
||||
- reason
|
||||
- replacement path, if any
|
||||
- labeled_at_utc
|
||||
- labeled_by
|
||||
|
||||
Do not remove the original artifact unless the user explicitly asks and there is a written reason.
|
||||
|
||||
## Adding New Market Connectors Later
|
||||
|
||||
Before adding a second market, Polymarket must have working discovery, raw order-book collection, Google Drive offload, and a 24h soak test.
|
||||
|
||||
When the gate is met:
|
||||
|
||||
1. Create `collectors/<market_name>/` for market-specific code.
|
||||
2. Keep shared code minimal and concrete.
|
||||
3. Reuse the same raw-first file layout and manifest format.
|
||||
4. Document endpoint quirks, timestamp semantics, rate limits, and schema differences in `docs/`.
|
||||
5. Avoid abstract base classes until at least two real collectors expose repeated code that is painful to maintain.
|
||||
28
Dockerfile
Normal file
28
Dockerfile
Normal file
|
|
@ -0,0 +1,28 @@
|
|||
# Image for the orderbooks scripts; the default command runs the Polymarket
# collector loop.
FROM python:3.12-slim

# Python runtime behaviour plus the paths exported to the container as
# ORDERBOOKS_* variables.
ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    ORDERBOOKS_APP_DIR=/app \
    ORDERBOOKS_DATA_DIR=/var/lib/orderbooks \
    ORDERBOOKS_PYTHON=python3

# OS packages and a fixed-id system account. UID/GID 10001 is referenced
# numerically by USER below, so the ids must stay in sync.
RUN apt-get update \
    && apt-get install -y --no-install-recommends bash ca-certificates rclone \
    && rm -rf /var/lib/apt/lists/* \
    && groupadd --system --gid 10001 orderbooks \
    && useradd --system --uid 10001 --gid 10001 --home-dir /var/lib/orderbooks --shell /usr/sbin/nologin orderbooks

WORKDIR /app

# Project rules, example config, docs and the scripts themselves.
COPY AGENTS.md ROADMAP.md ./
COPY config/ config/
COPY docs/ docs/
COPY scripts/ scripts/

# Make the shell entry points executable and pre-create the data layout so it
# is owned by the unprivileged account.
RUN chmod +x scripts/*.sh \
    && mkdir -p /var/lib/orderbooks/discovery /var/lib/orderbooks/raw_orderbooks /var/lib/orderbooks/manifests \
    && chown -R orderbooks:orderbooks /var/lib/orderbooks /app

# Numeric form of the user and group created above.
USER 10001:10001

CMD ["/bin/bash", "/app/scripts/run_polymarket_collector_loop.sh"]
|
||||
212
ROADMAP.md
Normal file
212
ROADMAP.md
Normal file
|
|
@ -0,0 +1,212 @@
|
|||
# Roadmap
|
||||
|
||||
Project: Cross-Market Live Orderbook Archive
|
||||
|
||||
Goal: build a reliable, minimal, always-on archive of live market microstructure data so future research agents can test whether strategies were actually observable, fillable, and reproducible in real time.
|
||||
|
||||
The roadmap is checkpoint-driven. Each checkpoint must leave durable artifacts, validation evidence, and an explicit gate result.
|
||||
|
||||
## Current Status
|
||||
|
||||
- Latest completed checkpoint: Checkpoint 7, Google Drive Offload
|
||||
- Latest gate: PASS
|
||||
- Next checkpoint: Checkpoint 8, 24h Soak Test Plan
|
||||
- Initial market: Polymarket
|
||||
- Future market work: gated until Polymarket is stable
|
||||
|
||||
## Checkpoint 1: Project Scaffold And Methodology
|
||||
|
||||
Goal: create the minimum repository structure and rules that keep future agents on track.
|
||||
|
||||
Artifacts:
|
||||
|
||||
- `AGENTS.md`
|
||||
- `ROADMAP.md`
|
||||
- `docs/METHODOLOGY.md`
|
||||
- `docs/DATA_CONTRACT.md`
|
||||
- `docs/OPERATIONS.md`
|
||||
- `orchestration/prompts/`
|
||||
|
||||
Requirements:
|
||||
|
||||
- Define project goal.
|
||||
- Define anti-fake-progress rules.
|
||||
- Define raw-first storage policy.
|
||||
- Define checkpoint reporting format.
|
||||
- Define no-trading/no-private-key policy.
|
||||
- Define how to label deprecated or misleading artifacts instead of deleting them.
|
||||
- Define how new market connectors should be added later.
|
||||
|
||||
Pass condition: the repo contains durable project rules and the next checkpoint is specific enough to execute.
|
||||
|
||||
## Checkpoint 2: Polymarket Public Data Source Probe
|
||||
|
||||
Goal: determine exactly which public Polymarket endpoints can support live collection.
|
||||
|
||||
Questions:
|
||||
|
||||
- How to discover active Polymarket markets?
|
||||
- How to filter BTC up/down markets?
|
||||
- How to resolve conditionId and token IDs?
|
||||
- How to fetch current order book for one token?
|
||||
- Is there a batch order-book endpoint?
|
||||
- Is there a market websocket for order-book updates?
|
||||
- Is there a trade websocket or recent trades endpoint?
|
||||
- What rate limits are documented or observed?
|
||||
- What fields are returned?
|
||||
- What timestamps exist?
|
||||
|
||||
Artifacts:
|
||||
|
||||
- `scripts/probe_polymarket_public_sources.py`
|
||||
- `data/probes/polymarket_public_sources_probe_v1.json`
|
||||
- `data/probes/polymarket_public_sources_probe_v1.md`
|
||||
|
||||
Pass condition: we know the exact endpoint set and can fetch at least one active market metadata record and one current order book.
|
||||
|
||||
## Checkpoint 3: Minimal BTC Market Discovery
|
||||
|
||||
Goal: build a small script that finds active BTC up/down Polymarket markets and resolves both outcome token IDs.
|
||||
|
||||
Artifacts:
|
||||
|
||||
- `scripts/discover_polymarket_btc_markets.py`
|
||||
- `data/discovery/polymarket_btc_markets_latest.json`
|
||||
- `data/discovery/polymarket_btc_markets_manifest.json`
|
||||
- `data/discovery/polymarket_btc_markets.md`
|
||||
|
||||
Requirements:
|
||||
|
||||
- Public endpoints only.
|
||||
- No trading.
|
||||
- No API keys unless strictly needed for public data.
|
||||
- Never store secrets in the repo.
|
||||
- Preserve raw metadata responses.
|
||||
- Write normalized market records with slug, question, conditionId, token IDs, outcomes, times, status, source, and `fetched_at_utc`.
|
||||
|
||||
Pass condition: the script reliably outputs currently active BTC markets with token IDs.
|
||||
|
||||
## Checkpoint 4: Minimal Orderbook Snapshot Collector
|
||||
|
||||
Goal: collect raw order-book snapshots for active BTC markets at a fixed interval.
|
||||
|
||||
Artifacts:
|
||||
|
||||
- `scripts/collect_polymarket_orderbooks.py`
|
||||
- `config/polymarket_collector.example.yaml`
|
||||
- `data/live_sample/...`
|
||||
- `data/manifests/orderbook_collector_sample_manifest.json`
|
||||
- `docs/POLYMARKET_COLLECTOR.md`
|
||||
|
||||
Requirements:
|
||||
|
||||
- Collect active BTC markets only.
|
||||
- Fetch order books for both outcome tokens.
|
||||
- Store raw API responses as gzip JSONL.
|
||||
- Add local `collected_at_utc`, collector version, endpoint URL, and request params.
|
||||
- Rotate files by hour or run.
|
||||
- Include a manifest with timing, markets, request counts, status codes, rows, output files, and checksums.
|
||||
- Handle graceful shutdown and rate limits.
|
||||
- Do not add a database.
|
||||
|
||||
Pass condition: a 5-10 minute sample run creates valid compressed raw snapshots and a manifest.
|
||||
|
||||
## Checkpoint 5: Normalized Snapshot Extract
|
||||
|
||||
Goal: create a derived normalized dataset from raw snapshots while preserving raw files as source of truth.
|
||||
|
||||
Artifacts:
|
||||
|
||||
- `scripts/normalize_polymarket_orderbooks.py`
|
||||
- `data/normalized_sample/...`
|
||||
- `data/manifests/orderbook_normalization_sample_manifest.json`
|
||||
- `docs/ORDERBOOK_SCHEMA.md`
|
||||
|
||||
Pass condition: a sample raw file can be normalized and basic sanity checks pass.
|
||||
|
||||
## Checkpoint 6: VPS Runtime Package
|
||||
|
||||
Goal: make the collector deployable on a small VPS.
|
||||
|
||||
Artifacts:
|
||||
|
||||
- `systemd/polymarket-orderbook-collector.service`
|
||||
- `config/polymarket_collector.vps.example.yaml`
|
||||
- `scripts/run_polymarket_collector_cycle.sh`
|
||||
- `docs/VPS_DEPLOYMENT.md`
|
||||
|
||||
Uploader service and timer units are deferred to Checkpoint 7 with Google Drive
|
||||
offload. Creating empty uploader units in Checkpoint 6 would be fake progress.
|
||||
|
||||
Pass condition: a user can follow docs on a VPS and run the collector.
|
||||
|
||||
## Checkpoint 7: Google Drive Offload
|
||||
|
||||
Goal: add periodic upload to Google Drive using `rclone`.
|
||||
|
||||
Artifacts:
|
||||
|
||||
- `scripts/upload_archive_rclone.sh`
|
||||
- `config/rclone.example.md`
|
||||
- `docs/GOOGLE_DRIVE_OFFLOAD.md`
|
||||
- sample upload manifest format
|
||||
|
||||
Pass condition: a dry-run and a real small test upload succeed and are documented.
|
||||
|
||||
## Checkpoint 8: 24h Soak Test Plan
|
||||
|
||||
Goal: run the collector for a real 24h period and validate reliability.
|
||||
|
||||
Artifacts:
|
||||
|
||||
- `reports/soak_test_YYYY-MM-DD.md`
|
||||
- `data/manifests/...`
|
||||
|
||||
Metrics:
|
||||
|
||||
- uptime
|
||||
- markets tracked
|
||||
- total snapshots
|
||||
- missed interval estimate
|
||||
- API errors
|
||||
- rate limits
|
||||
- file sizes
|
||||
- compression ratio
|
||||
- Google Drive upload status
|
||||
- restart behavior
|
||||
- disk usage
|
||||
- data quality checks
|
||||
|
||||
Pass condition: a 24h run completes with acceptable data quality and documented issues.
|
||||
|
||||
## Checkpoint 9: Add Second Market Only After Polymarket Is Stable
|
||||
|
||||
Goal: prepare for NEAR or another market only after Polymarket collector reliability is proven.
|
||||
|
||||
Do not start this checkpoint until:
|
||||
|
||||
- Polymarket discovery works.
|
||||
- Polymarket order-book collection works.
|
||||
- Google Drive offload works.
|
||||
- The 24h soak test is complete.
|
||||
|
||||
Architecture principles:
|
||||
|
||||
- Use `collectors/<market_name>/` only when adding the second market.
|
||||
- Keep shared code minimal.
|
||||
- Avoid abstract base classes until duplication is painful.
|
||||
- Keep raw-first, normalized-second, manifest-always file format consistent across markets.
|
||||
|
||||
## Anti-Fake-Progress Gates
|
||||
|
||||
- No dashboard before 24h data reliability.
|
||||
- No database before the file archive becomes painful.
|
||||
- No strategy or backtest code in this project.
|
||||
- No live trading.
|
||||
- No generic multi-market abstraction before the second market exists.
|
||||
- No claiming "production-ready" before a 24h soak test.
|
||||
- No deleting bad artifacts; label them deprecated or invalid and write why.
|
||||
|
||||
## Next Smallest Step
|
||||
|
||||
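Checkpoint 8 is next, as recorded under Current Status: run the collector for a real 24h period and validate reliability against the Checkpoint 8 metrics. (This section previously named Checkpoint 2 as next — inspecting official Polymarket docs and performing bounded public endpoint probes to determine the exact live collection sources, schemas, timestamps, and rate-limit behavior — which predates the completion of Checkpoints 2 through 7.)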
|
||||
20
config/polymarket_collector.example.yaml
Normal file
20
config/polymarket_collector.example.yaml
Normal file
|
|
@ -0,0 +1,20 @@
|
|||
# Checkpoint 4 example: a bounded Polymarket order-book sample run.
# Nothing in here is secret; only public endpoints are read.

# Input discovery file and output locations.
discovery_path: data/discovery/polymarket_btc_markets_latest.json
output_dir: data/live_sample
manifest_path: data/manifests/orderbook_collector_sample_manifest.json

# Public CLOB endpoint and HTTP retry behaviour.
clob_books_url: https://clob.polymarket.com/books
request_timeout_seconds: 15
max_retries: 2
backoff_seconds: 2

# The default sample is intentionally tiny so it puts little load on the
# endpoint.
market_limit: 2
interval_seconds: 30
duration_seconds: 300

# Skip markets that are about to end. The default is the 5-minute sample
# duration plus a 2-minute buffer.
market_end_safety_seconds: 420
|
||||
17
config/polymarket_collector.vps.example.yaml
Normal file
17
config/polymarket_collector.vps.example.yaml
Normal file
|
|
@ -0,0 +1,17 @@
|
|||
# VPS example (Checkpoint 6) for the raw Polymarket order-book collector.
# Install as /etc/orderbooks/polymarket_collector.vps.yaml on the VPS; adjust
# the paths below when the service keeps its data somewhere else.

# Input discovery file and output locations.
discovery_path: /var/lib/orderbooks/discovery/polymarket_btc_markets_latest.json
output_dir: /var/lib/orderbooks/raw_orderbooks
manifest_path: /var/lib/orderbooks/manifests/polymarket_orderbook_collector_latest.json

# Public CLOB endpoint and HTTP retry behaviour.
clob_books_url: https://clob.polymarket.com/books
request_timeout_seconds: 15
max_retries: 2
backoff_seconds: 2

# Per-run sampling window and market selection.
market_limit: 2
interval_seconds: 30
duration_seconds: 300
market_end_safety_seconds: 420
|
||||
76
config/rclone.example.md
Normal file
76
config/rclone.example.md
Normal file
|
|
@ -0,0 +1,76 @@
|
|||
# rclone Configuration Example
|
||||
|
||||
Status: valid
|
||||
|
||||
This file documents the expected `rclone` setup for Checkpoint 7. It is not an
|
||||
`rclone.conf` file and must not be copied into the repository with private auth
|
||||
material.
|
||||
|
||||
## Remote Name
|
||||
|
||||
The examples use this remote path:
|
||||
|
||||
```text
|
||||
gdrive:orderbooks/polymarket
|
||||
```
|
||||
|
||||
You may choose another remote name or folder. The uploader reads the destination
|
||||
from:
|
||||
|
||||
```text
|
||||
ORDERBOOKS_RCLONE_DEST
|
||||
```
|
||||
|
||||
For the systemd service, set it in:
|
||||
|
||||
```text
|
||||
/etc/orderbooks/orderbook-uploader.env
|
||||
```
|
||||
|
||||
Example:
|
||||
|
||||
```text
|
||||
ORDERBOOKS_RCLONE_DEST=gdrive:orderbooks/polymarket
|
||||
```
|
||||
|
||||
Do not place private auth files, browser tokens, API keys, wallet material, or
|
||||
session material in this repository.
|
||||
|
||||
## Configure Google Drive Outside The Repo
|
||||
|
||||
Install `rclone` on the VPS, then configure the remote as the service user or
|
||||
with a root-managed config path that the service can read:
|
||||
|
||||
```sh
|
||||
sudo apt-get install -y rclone
|
||||
sudo -u orderbooks rclone config
|
||||
sudo -u orderbooks rclone lsd gdrive:
|
||||
```
|
||||
|
||||
If the service user uses the default rclone config path, keep that file outside
|
||||
the repository under the service user's home/config directory.
|
||||
|
||||
## Uploader Environment File
|
||||
|
||||
Create:
|
||||
|
||||
```text
|
||||
/etc/orderbooks/orderbook-uploader.env
|
||||
```
|
||||
|
||||
Minimal example:
|
||||
|
||||
```text
|
||||
ORDERBOOKS_RCLONE_DEST=gdrive:orderbooks/polymarket
|
||||
```
|
||||
|
||||
Optional overrides:
|
||||
|
||||
```text
|
||||
ORDERBOOKS_UPLOAD_DATA_DIR=/var/lib/orderbooks
|
||||
ORDERBOOKS_UPLOAD_MIN_AGE_SECONDS=600
|
||||
ORDERBOOKS_UPLOAD_RETENTION_DAYS=7
|
||||
ORDERBOOKS_RCLONE_BIN=/usr/bin/rclone
|
||||
```
|
||||
|
||||
The environment file belongs on the VPS. Do not commit a machine-local version.
|
||||
25
deploy/k8s/base/configmap.yaml
Normal file
25
deploy/k8s/base/configmap.yaml
Normal file
|
|
@ -0,0 +1,25 @@
|
|||
# Collector configuration, exposed as the single key polymarket_collector.yaml.
apiVersion: v1
kind: ConfigMap
metadata:
  namespace: orderbooks
  name: orderbooks-collector-config
  labels:
    app.kubernetes.io/component: collector
    app.kubernetes.io/managed-by: kustomize
    app.kubernetes.io/name: orderbooks
    app.kubernetes.io/part-of: orderbooks
data:
  # Literal block: the text below is the config file content as-is.
  polymarket_collector.yaml: |
    discovery_path: /var/lib/orderbooks/discovery/polymarket_btc_markets_latest.json
    output_dir: /var/lib/orderbooks/raw_orderbooks
    manifest_path: /var/lib/orderbooks/manifests/polymarket_orderbook_collector_latest.json

    market_limit: 2
    interval_seconds: 30
    duration_seconds: 300
    market_end_safety_seconds: 420

    clob_books_url: https://clob.polymarket.com/books
    request_timeout_seconds: 15
    max_retries: 2
    backoff_seconds: 2
|
||||
92
deploy/k8s/base/cronjob-uploader.yaml
Normal file
92
deploy/k8s/base/cronjob-uploader.yaml
Normal file
|
|
@ -0,0 +1,92 @@
|
|||
# Scheduled rclone upload of the archive stored on the orderbooks-data volume.
apiVersion: batch/v1
kind: CronJob
metadata:
  namespace: orderbooks
  name: orderbooks-uploader
  labels:
    app.kubernetes.io/component: uploader
    app.kubernetes.io/name: orderbooks
    app.kubernetes.io/part-of: orderbooks
spec:
  # Every 15 minutes; a run is skipped while the previous one is still active.
  schedule: "*/15 * * * *"
  concurrencyPolicy: Forbid
  failedJobsHistoryLimit: 3
  successfulJobsHistoryLimit: 3
  jobTemplate:
    spec:
      # No retries within a run.
      backoffLimit: 0
      ttlSecondsAfterFinished: 86400
      template:
        metadata:
          labels:
            app.kubernetes.io/component: uploader
            app.kubernetes.io/name: orderbooks
            app.kubernetes.io/part-of: orderbooks
        spec:
          restartPolicy: Never
          imagePullSecrets:
            - name: orderbooks-registry-creds
          # Pod runs as the fixed non-root uid/gid 10001.
          securityContext:
            fsGroup: 10001
            fsGroupChangePolicy: OnRootMismatch
            runAsGroup: 10001
            runAsNonRoot: true
            runAsUser: 10001
          volumes:
            - name: orderbooks-data
              persistentVolumeClaim:
                claimName: orderbooks-data
            - name: rclone-config
              secret:
                secretName: orderbooks-rclone-config
                items:
                  - key: rclone.conf
                    path: rclone.conf
          containers:
            - name: uploader
              image: registry.doran.133011.xyz/orderbooks:bootstrap
              imagePullPolicy: IfNotPresent
              command:
                - /bin/bash
                - /app/scripts/upload_archive_rclone.sh
                - --execute
              env:
                - name: ORDERBOOKS_DATA_DIR
                  value: /var/lib/orderbooks
                - name: ORDERBOOKS_UPLOAD_DATA_DIR
                  value: /var/lib/orderbooks
                - name: ORDERBOOKS_UPLOAD_RAW_DIR
                  value: /var/lib/orderbooks/raw_orderbooks
                - name: ORDERBOOKS_UPLOAD_SOURCE_MANIFEST_DIR
                  value: /var/lib/orderbooks/manifests
                - name: ORDERBOOKS_UPLOAD_MANIFEST_DIR
                  value: /var/lib/orderbooks/manifests
                # Env values must be strings, hence the quoted numbers.
                - name: ORDERBOOKS_UPLOAD_MIN_AGE_SECONDS
                  value: "600"
                - name: ORDERBOOKS_UPLOAD_RETENTION_DAYS
                  value: "7"
                - name: ORDERBOOKS_RCLONE_BIN
                  value: /usr/bin/rclone
                - name: ORDERBOOKS_RCLONE_DEST
                  value: gdrive:orderbooks/polymarket
                # Points at the Secret-backed file mounted below.
                - name: RCLONE_CONFIG
                  value: /etc/rclone/rclone.conf
              volumeMounts:
                - name: orderbooks-data
                  mountPath: /var/lib/orderbooks
                # Single-file, read-only mount of the rclone.conf Secret key.
                - name: rclone-config
                  mountPath: /etc/rclone/rclone.conf
                  subPath: rclone.conf
                  readOnly: true
              resources:
                limits:
                  cpu: 500m
                  memory: 512Mi
                requests:
                  cpu: 50m
                  memory: 128Mi
              securityContext:
                allowPrivilegeEscalation: false
                capabilities:
                  drop:
                    - ALL
|
||||
86
deploy/k8s/base/deployment-collector.yaml
Normal file
86
deploy/k8s/base/deployment-collector.yaml
Normal file
|
|
@ -0,0 +1,86 @@
|
|||
---
# Deployment for the Polymarket raw order-book collector.
# A single pod runs the collector loop script and writes raw files, manifests,
# and discovery output under the shared `orderbooks-data` volume.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: orderbooks-collector
  namespace: orderbooks
  labels:
    app.kubernetes.io/name: orderbooks
    app.kubernetes.io/part-of: orderbooks
    app.kubernetes.io/component: collector
spec:
  replicas: 1
  # Recreate stops the old pod before the new one starts.
  # NOTE(review): presumably chosen because the data PVC is ReadWriteOnce and
  # two collectors must not write the same directory tree - confirm.
  strategy:
    type: Recreate
  selector:
    matchLabels:
      app.kubernetes.io/name: orderbooks
      app.kubernetes.io/component: collector
  template:
    metadata:
      labels:
        app.kubernetes.io/name: orderbooks
        app.kubernetes.io/part-of: orderbooks
        app.kubernetes.io/component: collector
    spec:
      # Allow up to two minutes for the collector to stop after SIGTERM.
      terminationGracePeriodSeconds: 120
      imagePullSecrets:
        - name: orderbooks-registry-creds
      # Pod-level hardening: run as the unprivileged uid/gid 10001 and give
      # that group ownership of mounted volumes.
      securityContext:
        runAsNonRoot: true
        runAsUser: 10001
        runAsGroup: 10001
        fsGroup: 10001
        fsGroupChangePolicy: OnRootMismatch
        # Use the container runtime's default seccomp filter. Together with
        # the settings above and the container-level securityContext below,
        # this is the remaining control required by the "restricted" Pod
        # Security Standard.
        seccompProfile:
          type: RuntimeDefault
      containers:
        - name: collector
          # NOTE(review): `bootstrap` looks like a placeholder tag;
          # docs/KUBERNETES_DEPLOYMENT.md says the deploy workflow applies
          # this base with the image it built - confirm.
          image: registry.doran.133011.xyz/orderbooks:bootstrap
          imagePullPolicy: IfNotPresent
          command:
            - /bin/bash
            - /app/scripts/run_polymarket_collector_loop.sh
          # Every data path below lives under the /var/lib/orderbooks mount.
          env:
            - name: ORDERBOOKS_APP_DIR
              value: /app
            - name: ORDERBOOKS_PYTHON
              value: python3
            - name: ORDERBOOKS_DATA_DIR
              value: /var/lib/orderbooks
            - name: ORDERBOOKS_COLLECTOR_CONFIG
              value: /etc/orderbooks/polymarket_collector.yaml
            - name: ORDERBOOKS_DISCOVERY_DIR
              value: /var/lib/orderbooks/discovery
            - name: ORDERBOOKS_OUTPUT_DIR
              value: /var/lib/orderbooks/raw_orderbooks
            - name: ORDERBOOKS_MANIFEST_DIR
              value: /var/lib/orderbooks/manifests
            # Quoted so the value stays a string, as env values must be.
            - name: ORDERBOOKS_LOOP_SLEEP_SECONDS
              value: "15"
          volumeMounts:
            - name: orderbooks-data
              mountPath: /var/lib/orderbooks
            # subPath mounts only the single config file, not a directory.
            - name: collector-config
              mountPath: /etc/orderbooks/polymarket_collector.yaml
              subPath: polymarket_collector.yaml
              readOnly: true
          resources:
            requests:
              cpu: 50m
              memory: 128Mi
            limits:
              cpu: 500m
              memory: 512Mi
          # Container-level hardening: no privilege escalation, no Linux
          # capabilities.
          securityContext:
            allowPrivilegeEscalation: false
            capabilities:
              drop:
                - ALL
      volumes:
        - name: orderbooks-data
          persistentVolumeClaim:
            claimName: orderbooks-data
        - name: collector-config
          configMap:
            name: orderbooks-collector-config
            items:
              - key: polymarket_collector.yaml
                path: polymarket_collector.yaml
|
||||
9
deploy/k8s/base/kustomization.yaml
Normal file
9
deploy/k8s/base/kustomization.yaml
Normal file
|
|
@ -0,0 +1,9 @@
|
|||
---
# Kustomize base for the orderbooks workloads.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization

# Namespace applied by kustomize to the resources listed below.
namespace: orderbooks

# Manifests that make up the base, in the order they are listed in this file.
resources:
  - namespace.yaml
  - configmap.yaml
  - pvc.yaml
  - deployment-collector.yaml
  - cronjob-uploader.yaml
|
||||
7
deploy/k8s/base/namespace.yaml
Normal file
7
deploy/k8s/base/namespace.yaml
Normal file
|
|
@ -0,0 +1,7 @@
|
|||
---
# Namespace that holds every orderbooks resource in this base.
apiVersion: v1
kind: Namespace
metadata:
  name: orderbooks
  # Same name/part-of labels as the other manifests in this base.
  labels:
    app.kubernetes.io/name: orderbooks
    app.kubernetes.io/part-of: orderbooks
|
||||
15
deploy/k8s/base/pvc.yaml
Normal file
15
deploy/k8s/base/pvc.yaml
Normal file
|
|
@ -0,0 +1,15 @@
|
|||
---
# Persistent volume claim for collector output. Both the collector Deployment
# and the uploader CronJob reference it by the claim name `orderbooks-data`.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: orderbooks-data
  namespace: orderbooks
  labels:
    app.kubernetes.io/name: orderbooks
    app.kubernetes.io/part-of: orderbooks
spec:
  # ReadWriteOnce: the volume can be mounted read-write by a single node.
  accessModes:
    - ReadWriteOnce
  # NOTE(review): presumably the k3s local-path provisioner - confirm the
  # storage class exists on the target cluster.
  storageClassName: local-path
  resources:
    requests:
      storage: 10Gi
|
||||
168
docs/DATA_CONTRACT.md
Normal file
168
docs/DATA_CONTRACT.md
Normal file
|
|
@ -0,0 +1,168 @@
|
|||
# Data Contract
|
||||
|
||||
The archive is raw-first. Raw market data must be preserved before normalization, aggregation, upload, or analysis.
|
||||
|
||||
## Storage Principles
|
||||
|
||||
- Store the raw response payload exactly as received whenever practical.
|
||||
- Add collector metadata beside the raw payload, not inside it.
|
||||
- Use UTC timestamps in ISO 8601 format with a `Z` suffix.
|
||||
- Use gzip JSONL for high-frequency snapshot data.
|
||||
- Rotate live collection files by hour or run.
|
||||
- Include checksums in manifests for all closed files.
|
||||
- Keep normalized files derived and traceable back to raw files.
|
||||
- Never store secrets, cookies, private keys, wallet material, or authenticated session state.
|
||||
|
||||
## Directory Layout
|
||||
|
||||
Initial expected layout:
|
||||
|
||||
```text
|
||||
data/
|
||||
probes/
|
||||
discovery/
|
||||
live_sample/
|
||||
normalized_sample/
|
||||
manifests/
|
||||
reports/
|
||||
checkpoints/
|
||||
```
|
||||
|
||||
Future sustained collection layout:
|
||||
|
||||
```text
|
||||
data/
|
||||
raw/
|
||||
polymarket/
|
||||
orderbooks/
|
||||
YYYY/
|
||||
MM/
|
||||
DD/
|
||||
HH/
|
||||
polymarket_orderbooks_YYYYMMDDTHHMMSSZ.jsonl.gz
|
||||
normalized/
|
||||
polymarket/
|
||||
orderbooks/
|
||||
YYYY/
|
||||
MM/
|
||||
DD/
|
||||
polymarket_orderbooks_normalized_YYYYMMDD.jsonl.gz
|
||||
manifests/
|
||||
```
|
||||
|
||||
Do not create a database until compressed file archives are proven painful.
|
||||
|
||||
## Raw Orderbook Snapshot Envelope
|
||||
|
||||
Checkpoint 4 should store one JSON object per line using this envelope or a documented successor:
|
||||
|
||||
```json
|
||||
{
|
||||
"schema_name": "raw_orderbook_snapshot",
|
||||
"schema_version": 1,
|
||||
"collector": {
|
||||
"name": "polymarket_orderbook_collector",
|
||||
"version": "0.1.0"
|
||||
},
|
||||
"market": {
|
||||
"market_name": "polymarket",
|
||||
"market_slug": "example-slug",
|
||||
"condition_id": "0x...",
|
||||
"token_id": "123",
|
||||
"outcome": "Yes"
|
||||
},
|
||||
"collection": {
|
||||
"collected_at_utc": "2026-04-14T20:53:49Z",
|
||||
"sequence": 1
|
||||
},
|
||||
"request": {
|
||||
"method": "GET",
|
||||
"url": "https://example.invalid/orderbook",
|
||||
"params": {
|
||||
"token_id": "123"
|
||||
},
|
||||
"status_code": 200,
|
||||
"duration_ms": 123
|
||||
},
|
||||
"raw": {}
|
||||
}
|
||||
```
|
||||
|
||||
`raw` is the unmodified response payload. If the endpoint returns text or bytes, record the encoding and store a lossless representation.
|
||||
|
||||
## Discovery Record Fields
|
||||
|
||||
Checkpoint 3 normalized market records should include:
|
||||
|
||||
- `market_name`
|
||||
- `market_slug`
|
||||
- `title` or `question`
|
||||
- `condition_id`
|
||||
- `tokens`
|
||||
- `outcomes`
|
||||
- `start_time_utc`, if available
|
||||
- `end_time_utc`, if available
|
||||
- `active`
|
||||
- `closed`
|
||||
- `endpoint_source`
|
||||
- `fetched_at_utc`
|
||||
- `raw_ref`
|
||||
|
||||
`tokens` should preserve the mapping between outcome labels and token IDs.
|
||||
|
||||
## Normalized Snapshot Fields
|
||||
|
||||
Checkpoint 5 normalized records should include:
|
||||
|
||||
- `market_name`
|
||||
- `market_slug`
|
||||
- `condition_id`
|
||||
- `token_id`
|
||||
- `outcome`
|
||||
- `collected_at_utc`
|
||||
- `best_bid`
|
||||
- `best_ask`
|
||||
- `spread`
|
||||
- `midpoint`
|
||||
- `bid_depth_total`
|
||||
- `ask_depth_total`
|
||||
- `bid_depth_within_1c`
|
||||
- `ask_depth_within_1c`
|
||||
- `bid_depth_within_2c`
|
||||
- `ask_depth_within_2c`
|
||||
- `bid_depth_within_5c`
|
||||
- `ask_depth_within_5c`
|
||||
- `raw_file`
|
||||
- `raw_line_number`, when feasible
|
||||
|
||||
Normalized data is invalid if it cannot reference the raw source record.
|
||||
|
||||
## Manifest Requirements
|
||||
|
||||
Collection and transformation manifests should include:
|
||||
|
||||
- manifest schema name and version
|
||||
- checkpoint or process name
|
||||
- start and end timestamps
|
||||
- market names and market IDs tracked
|
||||
- input files
|
||||
- output files
|
||||
- request counts
|
||||
- success and failure counts
|
||||
- status-code counts
|
||||
- row counts
|
||||
- checksums for closed files
|
||||
- command used
|
||||
- config path or config digest
|
||||
- warnings and known gaps
|
||||
- gate status
|
||||
|
||||
Checksums should use SHA-256 unless a later report explains why another hash is used.
|
||||
|
||||
## Timestamp Policy
|
||||
|
||||
- `collected_at_utc`: collector-side UTC timestamp taken as close as possible to receipt of data.
|
||||
- `fetched_at_utc`: timestamp for metadata or discovery fetches.
|
||||
- Endpoint-provided timestamps must be preserved under their original field names in `raw`.
|
||||
- If endpoint timestamp semantics are unclear, write the ambiguity into the probe report.
|
||||
|
||||
294
docs/GOOGLE_DRIVE_OFFLOAD.md
Normal file
294
docs/GOOGLE_DRIVE_OFFLOAD.md
Normal file
|
|
@ -0,0 +1,294 @@
|
|||
# Google Drive Offload
|
||||
|
||||
Status: valid
|
||||
|
||||
This document covers Checkpoint 7: offloading closed raw collector files and
|
||||
manifests to Google Drive with `rclone`.
|
||||
|
||||
This checkpoint does not prove production readiness or 24/7 reliability. A real
|
||||
small upload must be run with a configured remote, and the later 24h soak test
|
||||
must still pass.
|
||||
|
||||
## Scope
|
||||
|
||||
Included:
|
||||
|
||||
- `scripts/upload_archive_rclone.sh`
|
||||
- `systemd/polymarket-orderbook-uploader.service`
|
||||
- `systemd/polymarket-orderbook-uploader.timer`
|
||||
- dry-run mode by default
|
||||
- real upload only with `--execute`
|
||||
- rclone verification with `rclone check`
|
||||
- per-run upload manifests
|
||||
- optional local cleanup only after successful verification
|
||||
|
||||
Excluded:
|
||||
|
||||
- dashboards
|
||||
- databases
|
||||
- strategies or backtests
|
||||
- trading, signing, order placement, or wallet logic
|
||||
- hardcoded private auth material
|
||||
|
||||
## Install rclone
|
||||
|
||||
On Ubuntu or Debian:
|
||||
|
||||
```sh
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y rclone
|
||||
```
|
||||
|
||||
Confirm:
|
||||
|
||||
```sh
|
||||
rclone version
|
||||
```
|
||||
|
||||
## Configure A Google Drive Remote
|
||||
|
||||
Configure the remote outside this repository. For a service-user setup:
|
||||
|
||||
```sh
|
||||
sudo -u orderbooks rclone config
|
||||
sudo -u orderbooks rclone lsd gdrive:
|
||||
```
|
||||
|
||||
The example remote path is:
|
||||
|
||||
```text
|
||||
gdrive:orderbooks/polymarket
|
||||
```
|
||||
|
||||
Any valid `rclone` destination may be used. The uploader reads it from:
|
||||
|
||||
```text
|
||||
ORDERBOOKS_RCLONE_DEST
|
||||
```
|
||||
|
||||
For systemd, create:
|
||||
|
||||
```text
|
||||
/etc/orderbooks/orderbook-uploader.env
|
||||
```
|
||||
|
||||
Example:
|
||||
|
||||
```text
|
||||
ORDERBOOKS_RCLONE_DEST=gdrive:orderbooks/polymarket
|
||||
```
|
||||
|
||||
Do not commit the machine-local rclone config or any private auth material.
|
||||
|
||||
## What Gets Uploaded
|
||||
|
||||
By default the script targets:
|
||||
|
||||
| Source | Default path |
|
||||
| --- | --- |
|
||||
| raw collector files | `/var/lib/orderbooks/raw_orderbooks` |
|
||||
| collector manifests | `/var/lib/orderbooks/manifests` |
|
||||
|
||||
It does not target normalized sample files by default.
|
||||
|
||||
Files modified within the last 10 minutes are skipped to avoid active collector
|
||||
files:
|
||||
|
||||
```text
|
||||
ORDERBOOKS_UPLOAD_MIN_AGE_SECONDS=600
|
||||
```
|
||||
|
||||
The script preserves each file's path relative to the data directory on the remote. For
|
||||
example:
|
||||
|
||||
```text
|
||||
/var/lib/orderbooks/raw_orderbooks/polymarket/orderbooks/<run_id>/file.jsonl.gz
|
||||
```
|
||||
|
||||
uploads to:
|
||||
|
||||
```text
|
||||
<remote>/raw_orderbooks/polymarket/orderbooks/<run_id>/file.jsonl.gz
|
||||
```
|
||||
|
||||
## Dry Run
|
||||
|
||||
Dry-run is the default. It plans files, stages a temporary copy, invokes
|
||||
`rclone copy --dry-run`, and writes an upload manifest.
|
||||
|
||||
Example for a VPS:
|
||||
|
||||
```sh
|
||||
/opt/orderbooks/scripts/upload_archive_rclone.sh \
|
||||
--data-dir /var/lib/orderbooks \
|
||||
--dest "$ORDERBOOKS_RCLONE_DEST"
|
||||
```
|
||||
|
||||
Example against the repository sample data:
|
||||
|
||||
```sh
|
||||
scripts/upload_archive_rclone.sh \
|
||||
--data-dir data \
|
||||
--dest gdrive:orderbooks/polymarket/checkpoint7-test \
|
||||
--manifest-path data/manifests/upload_archive_real_test_dry_run_manifest.json \
|
||||
--min-age-seconds 0 \
|
||||
--rclone-bin /usr/bin/rclone
|
||||
```
|
||||
|
||||
Dry-run does not prove remote write access.
|
||||
|
||||
## Execute Upload
|
||||
|
||||
Run a real upload only after the remote is configured and the dry-run plan looks
|
||||
right:
|
||||
|
||||
```sh
|
||||
/opt/orderbooks/scripts/upload_archive_rclone.sh \
|
||||
--execute \
|
||||
--data-dir /var/lib/orderbooks \
|
||||
--dest "$ORDERBOOKS_RCLONE_DEST"
|
||||
```
|
||||
|
||||
The script runs:
|
||||
|
||||
```text
|
||||
rclone copy <staged files> <remote> --checksum
|
||||
rclone check <staged files> <remote> --one-way --checksum
|
||||
```
|
||||
|
||||
The upload gate is `PASS` only when the copy succeeds and verification succeeds.
|
||||
|
||||
## Retention And Cleanup
|
||||
|
||||
Local files are kept by default, even after upload verification.
|
||||
|
||||
Cleanup requires an explicit flag:
|
||||
|
||||
```sh
|
||||
/opt/orderbooks/scripts/upload_archive_rclone.sh \
|
||||
--execute \
|
||||
--cleanup-after-verify \
|
||||
--retention-days 7 \
|
||||
--data-dir /var/lib/orderbooks \
|
||||
--dest "$ORDERBOOKS_RCLONE_DEST"
|
||||
```
|
||||
|
||||
Cleanup deletes only files that were selected for upload, uploaded, verified, and
|
||||
older than the retention window. The default retention window is 7 days.
|
||||
|
||||
## Upload Manifest
|
||||
|
||||
Each run writes a manifest such as:
|
||||
|
||||
```text
|
||||
/var/lib/orderbooks/manifests/upload_archive_YYYYMMDDTHHMMSSZ.json
|
||||
```
|
||||
|
||||
The manifest records:
|
||||
|
||||
- planned files
|
||||
- attempted files
|
||||
- dry-run files
|
||||
- uploaded files
|
||||
- verified files
|
||||
- skipped open or recent files
|
||||
- retained local files
|
||||
- deleted local files
|
||||
- SHA-256 checksums
|
||||
- command mode
|
||||
- start/end time
|
||||
- rclone copy/check exit codes
|
||||
- gate status
|
||||
|
||||
For this repository, the sample manifest path is:
|
||||
|
||||
```text
|
||||
data/manifests/upload_archive_sample_manifest.json
|
||||
```
|
||||
|
||||
The verified Checkpoint 7 real-test manifest is:
|
||||
|
||||
```text
|
||||
data/manifests/upload_archive_real_test_manifest.json
|
||||
```
|
||||
|
||||
## systemd Timer
|
||||
|
||||
Install the unit files:
|
||||
|
||||
```sh
|
||||
sudo install -o root -g root -m 0644 /opt/orderbooks/systemd/polymarket-orderbook-uploader.service /etc/systemd/system/polymarket-orderbook-uploader.service
|
||||
sudo install -o root -g root -m 0644 /opt/orderbooks/systemd/polymarket-orderbook-uploader.timer /etc/systemd/system/polymarket-orderbook-uploader.timer
|
||||
sudo systemctl daemon-reload
|
||||
```
|
||||
|
||||
Create the environment file:
|
||||
|
||||
```sh
|
||||
sudo install -o root -g orderbooks -m 0640 /dev/null /etc/orderbooks/orderbook-uploader.env
|
||||
sudo editor /etc/orderbooks/orderbook-uploader.env
|
||||
```
|
||||
|
||||
At minimum, set:
|
||||
|
||||
```text
|
||||
ORDERBOOKS_RCLONE_DEST=gdrive:orderbooks/polymarket
|
||||
```
|
||||
|
||||
Enable the timer:
|
||||
|
||||
```sh
|
||||
sudo systemctl enable --now polymarket-orderbook-uploader.timer
|
||||
```
|
||||
|
||||
Run one upload immediately:
|
||||
|
||||
```sh
|
||||
sudo systemctl start polymarket-orderbook-uploader.service
|
||||
```
|
||||
|
||||
## Logs
|
||||
|
||||
Use the systemd journal:
|
||||
|
||||
```sh
|
||||
sudo systemctl status polymarket-orderbook-uploader.service
|
||||
sudo journalctl -u polymarket-orderbook-uploader.service -f
|
||||
sudo systemctl list-timers polymarket-orderbook-uploader.timer
|
||||
```
|
||||
|
||||
## Current Checkpoint 7 Result
|
||||
|
||||
Initial local validation was blocked when `rclone` was unavailable. That blocked
|
||||
manifest remains at:
|
||||
|
||||
```text
|
||||
data/manifests/upload_archive_sample_manifest.json
|
||||
```
|
||||
|
||||
After `rclone` was configured as `/usr/bin/rclone` with remote `gdrive:`, a dry
|
||||
run and one tiny real upload were run against:
|
||||
|
||||
```text
|
||||
gdrive:orderbooks/polymarket/checkpoint7-test
|
||||
```
|
||||
|
||||
The real upload manifest records `rclone copy` exit code 0 and `rclone check`
|
||||
exit code 0:
|
||||
|
||||
```text
|
||||
data/manifests/upload_archive_real_test_manifest.json
|
||||
```
|
||||
|
||||
Current gate:
|
||||
|
||||
```text
|
||||
PASS
|
||||
```
|
||||
|
||||
## What Remains Unproven
|
||||
|
||||
- Long-run upload reliability.
|
||||
- Interaction between hourly uploads and a 24h collector soak test.
|
||||
- Retention cleanup after verified upload.
|
||||
- Production readiness.
|
||||
148
docs/KUBERNETES_DEPLOYMENT.md
Normal file
148
docs/KUBERNETES_DEPLOYMENT.md
Normal file
|
|
@ -0,0 +1,148 @@
|
|||
# Kubernetes Deployment
|
||||
|
||||
Status: draft runtime package for Checkpoint 8G
|
||||
|
||||
This document describes the Kubernetes package for the Polymarket raw
|
||||
order-book collector. It follows the shared Hetzner k3s cluster model from
|
||||
`../nuri/unrip3`: application code, Dockerfile, manifests, and Forgejo workflow
|
||||
live in this repository; platform services, the shared registry, and the shared
|
||||
Forgejo runner remain platform-owned.
|
||||
|
||||
This package does not claim production readiness. Production readiness still
|
||||
requires a real Kubernetes runtime smoke run with preserved evidence.
|
||||
|
||||
## Cluster Decisions
|
||||
|
||||
- Namespace: `orderbooks`
|
||||
- Workstation kubeconfig for validation: `../nuri/unrip3/.state/hetzner/kubeconfig.yaml`
|
||||
- Shared registry and shared Forgejo runner
|
||||
- Existing rclone Secret: `orderbooks/orderbooks-rclone-config`
|
||||
- Secret key mounted by the uploader: `rclone.conf`
|
||||
|
||||
Do not commit or print rclone config contents.
|
||||
|
||||
## Runtime Layout
|
||||
|
||||
The collector and uploader share one PVC:
|
||||
|
||||
```text
|
||||
PVC: orderbooks-data
|
||||
mount: /var/lib/orderbooks
|
||||
raw files: /var/lib/orderbooks/raw_orderbooks
|
||||
manifests: /var/lib/orderbooks/manifests
|
||||
discovery: /var/lib/orderbooks/discovery
|
||||
```
|
||||
|
||||
The collector uses one Deployment with one replica. The container runs
|
||||
`/app/scripts/run_polymarket_collector_loop.sh`, which repeatedly executes the
|
||||
existing bounded collector cycle and records loop failure/interruption manifests
|
||||
instead of relying on Kubernetes crash loops for normal operation.
|
||||
|
||||
The uploader uses one CronJob. It runs the existing rclone uploader in execute
|
||||
mode, mounts the same PVC, mounts `orderbooks-rclone-config` read-only at
|
||||
`/etc/rclone/rclone.conf`, sets `RCLONE_CONFIG` to that file, and uploads only
|
||||
closed/aged files.
|
||||
|
||||
|
||||
## Bootstrap This App Repo
|
||||
|
||||
Run the orderbooks-specific bootstrap from this repository:
|
||||
|
||||
```sh
|
||||
scripts/deploy/bootstrap_orderbooks_k8s.sh
|
||||
```
|
||||
|
||||
The bootstrap loads platform defaults and resolved secrets from the local
|
||||
platform state without printing secret values. It ensures namespace `orderbooks`,
|
||||
creates or updates `orderbooks-registry-creds`, verifies the existing
|
||||
`orderbooks-rclone-config` secret has key `rclone.conf`, creates or updates the
|
||||
Forgejo repo `philipp/orderbooks`, and upserts the required Actions secret and
|
||||
variables.
|
||||
|
||||
After bootstrap, push a clean source tree to Forgejo `main`. Do not push local
|
||||
`data/`, `artifacts/`, `reports/`, `orchestration/`, kubeconfigs, rclone config,
|
||||
`.env`, private keys, or other local evidence/secrets.
|
||||
|
||||
## Image Build And Deploy
|
||||
|
||||
The Forgejo workflow is `.forgejo/workflows/deploy.yml`. It follows the shared
|
||||
runner pattern:
|
||||
|
||||
1. load `KUBECONFIG_B64` from Forgejo secrets;
|
||||
2. clone this repo inside the runner;
|
||||
3. create an in-cluster Kaniko Job;
|
||||
4. build and push `REGISTRY_HOST/orderbooks:<git-sha>`;
|
||||
5. apply `deploy/k8s/base` with the built image;
|
||||
6. wait for `deployment/orderbooks-collector` rollout.
|
||||
|
||||
Required Forgejo repo secret:
|
||||
|
||||
```text
|
||||
KUBECONFIG_B64
|
||||
```
|
||||
|
||||
Required Forgejo repo variable:
|
||||
|
||||
```text
|
||||
REGISTRY_HOST
|
||||
```
|
||||
|
||||
Project defaults used by the workflow:
|
||||
|
||||
```text
|
||||
PROJECT_NAME=orderbooks
|
||||
PROJECT_NAMESPACE=orderbooks
|
||||
PROJECT_DEPLOYMENTS=orderbooks-collector
|
||||
PROJECT_REGISTRY_SECRET_NAME=orderbooks-registry-creds
|
||||
```
|
||||
|
||||
The registry pull/build secret `orderbooks-registry-creds` must exist in the
|
||||
`orderbooks` namespace before the workflow builds and deploys.
|
||||
|
||||
## Pre-Deploy Validation
|
||||
|
||||
From this repository:
|
||||
|
||||
```sh
|
||||
bash -n scripts/run_polymarket_collector_loop.sh
|
||||
bash -n scripts/k8s_runtime_smoke_check.sh
|
||||
kubectl kustomize deploy/k8s/base
|
||||
KUBECONFIG=../nuri/unrip3/.state/hetzner/kubeconfig.yaml kubectl apply -k deploy/k8s/base --dry-run=server
|
||||
KUBECONFIG=../nuri/unrip3/.state/hetzner/kubeconfig.yaml kubectl -n orderbooks get secret orderbooks-rclone-config -o go-template='{{if index .data "rclone.conf"}}rclone_secret_key_present{{else}}rclone_secret_key_missing{{end}}{{"\n"}}'
|
||||
```
|
||||
|
||||
The last command checks only whether the key exists. It must not print secret
|
||||
data.
|
||||
|
||||
## Runtime Smoke Gate
|
||||
|
||||
After the image is built and the workload is actually deployed, run:
|
||||
|
||||
```sh
|
||||
KUBECONFIG=../nuri/unrip3/.state/hetzner/kubeconfig.yaml scripts/k8s_runtime_smoke_check.sh --namespace orderbooks --deployment orderbooks-collector --cronjob orderbooks-uploader --raw-dir /var/lib/orderbooks/raw_orderbooks --manifest-dir /var/lib/orderbooks/manifests --wait-seconds 1800 \
|
||||
--upload-min-age-seconds 600
|
||||
```
|
||||
|
||||
The smoke gate uses `kubectl`, not systemd. It writes local JSON evidence under
|
||||
`data/manifests/k8s_runtime_smoke_<UTC_TIMESTAMP>.json` by default. It verifies:
|
||||
|
||||
- collector pod is running;
|
||||
- latest collector manifest has `gate_status: PASS`, `rows_written > 0`, and
|
||||
`failure_count: 0`;
|
||||
- raw gzip JSONL parses and is under `/var/lib/orderbooks/raw_orderbooks`;
|
||||
- deleting the collector pod does not corrupt the old raw file checksum or row
|
||||
count;
|
||||
- a later post-restart collector cycle writes valid rows;
|
||||
- an uploader Job created from the CronJob completes;
|
||||
- the latest upload manifest records a verified rclone upload with at least one
|
||||
verified file.
|
||||
|
||||
A failed smoke run still writes JSON evidence and exits nonzero. Preserve failed
|
||||
manifests, raw files, upload manifests, and pod logs for review.
|
||||
|
||||
## Not Included
|
||||
|
||||
- No trading, signing, wallets, private keys, or API keys.
|
||||
- No dashboard, database, strategy, backtest, or second-market connector.
|
||||
- No websocket rewrite.
|
||||
- No rclone config contents in this repository.
|
||||
104
docs/METHODOLOGY.md
Normal file
104
docs/METHODOLOGY.md
Normal file
|
|
@ -0,0 +1,104 @@
|
|||
# Methodology
|
||||
|
||||
This project uses checkpoint-driven compound engineering. The point is to preserve useful data and operational learning, not to accumulate scaffolding.
|
||||
|
||||
## Checkpoint Cycle
|
||||
|
||||
Every checkpoint follows the same loop:
|
||||
|
||||
1. Define the smallest useful checkpoint.
|
||||
2. Build only what is required for that checkpoint.
|
||||
3. Validate with real commands and real data when applicable.
|
||||
4. Write durable artifacts: code or docs, config or run instructions, manifest/report, and validation evidence.
|
||||
5. State the gate: `PASS`, `FAIL`, `BLOCKED`, or `PARTIAL`.
|
||||
6. Identify the strongest fake-progress risk.
|
||||
7. Recommend the next smallest step.
|
||||
8. Stop and ask only when a real decision is needed.
|
||||
|
||||
## Gate States
|
||||
|
||||
- `PASS`: the checkpoint pass condition is met and evidence is on disk.
|
||||
- `FAIL`: the checkpoint was attempted but did not meet its pass condition.
|
||||
- `BLOCKED`: work cannot continue without a decision, credential, service, or unavailable dependency.
|
||||
- `PARTIAL`: useful artifacts exist, but the checkpoint should not be treated as passed.
|
||||
|
||||
## Evidence Rules
|
||||
|
||||
- Evidence must be reproducible from files and commands, not just chat.
|
||||
- If a command was used to validate behavior, record the command and summarize the result in a report or manifest.
|
||||
- If data was collected, preserve raw data and include checksums.
|
||||
- If synthetic or sample data is used, label it explicitly.
|
||||
- If a claim depends on a public endpoint, record the endpoint, request parameters, response fields, status codes, timestamps, and fetch time.
|
||||
- Do not claim reliability from a short sample run. Reliability requires the roadmap soak test.
|
||||
|
||||
## Machine-Readable Manifest Format
|
||||
|
||||
Checkpoint manifests should be JSON and stored under `data/manifests/`. Use this shape unless a later checkpoint documents a better schema:
|
||||
|
||||
```json
|
||||
{
|
||||
"checkpoint_id": 1,
|
||||
"checkpoint_name": "Project Scaffold And Methodology",
|
||||
"status": "PASS",
|
||||
"started_at_utc": "2026-04-14T20:53:49Z",
|
||||
"ended_at_utc": "2026-04-14T20:53:49Z",
|
||||
"scope": "Durable project rules and roadmap only; no collector implementation.",
|
||||
"artifacts": [
|
||||
{
|
||||
"path": "AGENTS.md",
|
||||
"kind": "project_rules",
|
||||
"status": "valid"
|
||||
}
|
||||
],
|
||||
"validation": {
|
||||
"commands": [
|
||||
{
|
||||
"command": "git status --short",
|
||||
"result": "completed"
|
||||
}
|
||||
],
|
||||
"summary": "Required files exist and contain checkpoint rules."
|
||||
},
|
||||
"decisions": [],
|
||||
"assumptions": [],
|
||||
"fake_progress_risk": "Most progress is documentation until public Polymarket endpoint behavior is proven.",
|
||||
"next_step": "Run Checkpoint 2 public source probe."
|
||||
}
|
||||
```
|
||||
|
||||
## Markdown Checkpoint Report Format
|
||||
|
||||
Checkpoint reports should be stored under `reports/checkpoints/` and include:
|
||||
|
||||
- active checkpoint
|
||||
- scope
|
||||
- files created or changed
|
||||
- validation commands and results
|
||||
- project rules or operational lessons added
|
||||
- pass/fail/gate
|
||||
- strongest fake-progress risk
|
||||
- next smallest step
|
||||
|
||||
## Deprecated Or Misleading Artifacts
|
||||
|
||||
Do not delete mistakes. Preserve the original artifact and label it.
|
||||
|
||||
Preferred labels:
|
||||
|
||||
- Add a manifest entry with `status: "deprecated"` or `status: "invalid"`.
|
||||
- Add a sibling note named `<artifact>.deprecated.md` or `<artifact>.invalid.md` when a human explanation is useful.
|
||||
- Include why the artifact is wrong, when it was labeled, who labeled it, and what replaces it.
|
||||
|
||||
If an artifact is dangerous because it contains secrets, stop and ask the user. Do not spread or copy the secret into reports.
|
||||
|
||||
## Anti-Fake-Progress Rules
|
||||
|
||||
- No dashboard before 24h data reliability.
|
||||
- No database before plain compressed files become painful.
|
||||
- No strategy, backtest, optimizer, or trading bot code.
|
||||
- No private-key or signing code.
|
||||
- No generic multi-market abstraction before a second market exists.
|
||||
- No "production-ready" claim before a 24h soak test.
|
||||
- No endpoint assumptions without probe evidence.
|
||||
- No normalized dataset that cannot trace back to raw records.
|
||||
|
||||
93
docs/OPERATIONS.md
Normal file
93
docs/OPERATIONS.md
Normal file
|
|
@ -0,0 +1,93 @@
|
|||
# Operations
|
||||
|
||||
This document defines operational rules before the collector exists. It should be updated with exact commands as checkpoints add scripts, services, and upload jobs.
|
||||
|
||||
## Current Operational Status
|
||||
|
||||
- Collector implementation: not started.
|
||||
- Supported market: none yet; Polymarket is the first planned market.
|
||||
- Deployment target: small VPS.
|
||||
- Offload target: Google Drive through `rclone`.
|
||||
- Reliability status: not production-ready until a documented 24h soak test passes.
|
||||
|
||||
## Safety Rules
|
||||
|
||||
- No trading.
|
||||
- No order placement.
|
||||
- No wallet signing.
|
||||
- No private keys.
|
||||
- No secrets in git.
|
||||
- No dashboards, databases, ML, or strategy code before the roadmap gate allows them.
|
||||
|
||||
## Local Runtime Principles
|
||||
|
||||
Future scripts should:
|
||||
|
||||
- accept a configurable data directory
|
||||
- write logs to a predictable location
|
||||
- write raw gzip JSONL snapshots
|
||||
- rotate files by hour or run
|
||||
- close files cleanly on shutdown
|
||||
- write manifests after runs
|
||||
- avoid corrupting closed files on restart
|
||||
- handle public endpoint errors and rate limits conservatively
|
||||
|
||||
## VPS Deployment Principles
|
||||
|
||||
Checkpoint 6 should document:
|
||||
|
||||
- Python version and virtualenv setup
|
||||
- package installation
|
||||
- environment variables
|
||||
- systemd or Docker Compose runtime
|
||||
- service user and file permissions
|
||||
- data directory ownership
|
||||
- log locations
|
||||
- restart policy
|
||||
- disk usage checks
|
||||
- safe upgrade and rollback steps
|
||||
|
||||
## Google Drive Offload Principles
|
||||
|
||||
Checkpoint 7 should use `rclone` and must:
|
||||
|
||||
- avoid hardcoded credentials
|
||||
- upload only closed or rotated files
|
||||
- support dry-run mode
|
||||
- verify upload success
|
||||
- preserve local files until upload is verified
|
||||
- maintain checksums
|
||||
- keep the last N days locally
|
||||
- write an upload manifest
|
||||
|
||||
## Incident And Bad-Data Handling
|
||||
|
||||
If data looks wrong:
|
||||
|
||||
1. Preserve the raw files.
|
||||
2. Stop relying on the affected derived files.
|
||||
3. Label the artifact `invalid` or `deprecated`.
|
||||
4. Write a short note explaining the issue and replacement, if any.
|
||||
5. Keep the learning in docs or reports.
|
||||
|
||||
Examples of bad-data conditions:
|
||||
|
||||
- endpoint returned a schema different from expected
|
||||
- token/outcome mapping was wrong
|
||||
- timestamps were misunderstood
|
||||
- rate limits caused large gaps
|
||||
- gzip file was not closed cleanly
|
||||
- upload succeeded but checksum did not match
|
||||
|
||||
## Minimum Reliability Claim
|
||||
|
||||
A short sample run can prove that code writes files. It cannot prove 24/7 reliability.
|
||||
|
||||
The project may only claim production readiness after:
|
||||
|
||||
- discovery works
|
||||
- raw order-book collection works
|
||||
- offload works
|
||||
- 24h soak test completes
|
||||
- data quality and gap metrics are documented
|
||||
|
||||
102
docs/ORDERBOOK_SCHEMA.md
Normal file
102
docs/ORDERBOOK_SCHEMA.md
Normal file
|
|
@ -0,0 +1,102 @@
|
|||
# Orderbook Snapshot Schema
|
||||
|
||||
Status: valid
|
||||
|
||||
This document covers the Checkpoint 5 normalized order-book sample. The raw
|
||||
gzip JSONL files remain the source of truth. Normalized rows are derived records
|
||||
for quick inspection and later quality checks.
|
||||
|
||||
## Normalized Snapshot
|
||||
|
||||
Schema name: `normalized_orderbook_snapshot`
|
||||
|
||||
Schema version: `1`
|
||||
|
||||
File format: gzip JSONL, one JSON object per line.
|
||||
|
||||
Sample location:
|
||||
|
||||
```text
|
||||
data/normalized_sample/polymarket/orderbooks/<run_id>/polymarket_orderbooks_normalized_<run_id>.jsonl.gz
|
||||
```
|
||||
|
||||
Every normalized row must reference exactly one raw gzip JSONL source row:
|
||||
|
||||
- `raw_file`: repository-relative path to the raw gzip JSONL file.
|
||||
- `raw_line_number`: 1-based line number inside that raw gzip JSONL file.
|
||||
|
||||
Derived data is invalid if either lineage field is missing or points to a
|
||||
missing raw file.
|
||||
|
||||
## Field Contract
|
||||
|
||||
| Field | Type | Meaning |
|
||||
| --- | --- | --- |
|
||||
| `schema_name` | string | Always `normalized_orderbook_snapshot`. |
|
||||
| `schema_version` | number | Schema version, currently `1`. |
|
||||
| `market_name` | string | Market source name from the raw envelope. |
|
||||
| `market_slug` | string | Polymarket market slug from the raw envelope. |
|
||||
| `condition_id` | string | Polymarket condition ID from the raw envelope. |
|
||||
| `token_id` | string | Polymarket CLOB token ID from the raw envelope. |
|
||||
| `outcome` | string | Outcome label associated with `token_id`. |
|
||||
| `collected_at_utc` | string | Collector timestamp from the raw envelope. |
|
||||
| `best_bid` | string or null | Maximum bid price, or null when no bids exist. |
|
||||
| `best_ask` | string or null | Minimum ask price, or null when no asks exist. |
|
||||
| `spread` | string or null | `best_ask - best_bid` when both sides exist. |
|
||||
| `midpoint` | string or null | `(best_bid + best_ask) / 2` when both sides exist. |
|
||||
| `bid_depth_total` | string | Sum of all bid sizes. |
|
||||
| `ask_depth_total` | string | Sum of all ask sizes. |
|
||||
| `bid_depth_within_1c` | string | Sum of bid sizes priced at least `best_bid - 0.01`. |
|
||||
| `ask_depth_within_1c` | string | Sum of ask sizes priced at most `best_ask + 0.01`. |
|
||||
| `bid_depth_within_2c` | string | Sum of bid sizes priced at least `best_bid - 0.02`. |
|
||||
| `ask_depth_within_2c` | string | Sum of ask sizes priced at most `best_ask + 0.02`. |
|
||||
| `bid_depth_within_5c` | string | Sum of bid sizes priced at least `best_bid - 0.05`. |
|
||||
| `ask_depth_within_5c` | string | Sum of ask sizes priced at most `best_ask + 0.05`. |
|
||||
| `raw_file` | string | Repository-relative raw gzip JSONL path. |
|
||||
| `raw_line_number` | number | 1-based source line number in `raw_file`. |
|
||||
|
||||
## Numeric Encoding
|
||||
|
||||
Prices and sizes are parsed with Python `Decimal`. Derived numeric values are
|
||||
emitted as exact decimal strings rather than JSON numbers. This keeps precision
|
||||
visible and avoids binary floating-point rounding.
|
||||
|
||||
Missing price-derived values are emitted as `null`. Depth totals and depth bands
|
||||
are emitted as decimal strings and use `"0"` when the relevant side is empty.
|
||||
|
||||
## Calculation Rules
|
||||
|
||||
- `best_bid`: maximum bid price.
|
||||
- `best_ask`: minimum ask price.
|
||||
- `spread`: `best_ask - best_bid` when both sides exist.
|
||||
- `midpoint`: `(best_bid + best_ask) / 2` when both sides exist.
|
||||
- `bid_depth_total`: sum of all bid sizes.
|
||||
- `ask_depth_total`: sum of all ask sizes.
|
||||
- `bid_depth_within_1c`: sum bid sizes with price greater than or equal to
|
||||
`best_bid - 0.01`.
|
||||
- `ask_depth_within_1c`: sum ask sizes with price less than or equal to
|
||||
`best_ask + 0.01`.
|
||||
- The same band rule is used for `0.02` and `0.05`.
|
||||
|
||||
## Sanity Rules
|
||||
|
||||
A normalized file should pass these checks:
|
||||
|
||||
- Output row count equals raw input row count unless skipped rows are recorded.
|
||||
- Every row has `raw_file` and `raw_line_number`.
|
||||
- Every referenced raw file exists.
|
||||
- `spread` is non-negative whenever both sides exist.
|
||||
- `midpoint` is between `best_bid` and `best_ask` whenever both sides exist.
|
||||
- Depth totals and band depths are non-negative.
|
||||
- At least one `Up` row and one `Down` row exist in the sample.
|
||||
- The gzip JSONL file decompresses and every line parses as JSON.
|
||||
- The manifest checksum matches the normalized output file.
|
||||
|
||||
## Current Known Gaps
|
||||
|
||||
- This schema covers a derived sample extract only.
|
||||
- It does not define sustained daily normalized partitions.
|
||||
- It does not include upload, daemon runtime, dashboards, databases, strategy
|
||||
code, backtests, trading behavior, or wallet behavior.
|
||||
- Long-run schema stability still depends on future collection and soak-test
|
||||
evidence.
|
||||
149
docs/POLYMARKET_COLLECTOR.md
Normal file
149
docs/POLYMARKET_COLLECTOR.md
Normal file
|
|
@ -0,0 +1,149 @@
|
|||
# Polymarket Collector
|
||||
|
||||
Artifact status: `valid`
|
||||
|
||||
## Scope
|
||||
|
||||
This document covers the Checkpoint 4 bounded raw order-book sample collector.
|
||||
|
||||
It does not describe a production service. It does not include normalization, upload, systemd, dashboards, databases, strategies, trading, wallet logic, private keys, API keys, or private endpoints.
|
||||
|
||||
## Inputs
|
||||
|
||||
The collector reads active BTC markets from:
|
||||
|
||||
```text
|
||||
data/discovery/polymarket_btc_markets_latest.json
|
||||
```
|
||||
|
||||
Checkpoint 3 writes normalized market records with `condition_id` and `tokens` preserving the `Up` and `Down` outcome-token mapping. The collector uses only those records and does not perform market discovery itself.
|
||||
|
||||
If the discovery file is stale or contains no usable active markets, run:
|
||||
|
||||
```sh
|
||||
python3 scripts/discover_polymarket_btc_markets.py
|
||||
```
|
||||
|
||||
## Endpoint
|
||||
|
||||
The sample uses the public CLOB batch order-book endpoint:
|
||||
|
||||
```text
|
||||
POST https://clob.polymarket.com/books
|
||||
```
|
||||
|
||||
Request body shape:
|
||||
|
||||
```json
|
||||
[
|
||||
{"token_id": "<up_token_id>"},
|
||||
{"token_id": "<down_token_id>"}
|
||||
]
|
||||
```
|
||||
|
||||
No authentication is used.
|
||||
|
||||
## Running A Bounded Sample
|
||||
|
||||
Default sample command:
|
||||
|
||||
```sh
|
||||
python3 scripts/collect_polymarket_orderbooks.py
|
||||
```
|
||||
|
||||
The default config is:
|
||||
|
||||
```text
|
||||
config/polymarket_collector.example.yaml
|
||||
```
|
||||
|
||||
The example config is deliberately small:
|
||||
|
||||
- `market_limit: 2`
|
||||
- `interval_seconds: 30`
|
||||
- `duration_seconds: 300`
|
||||
- `market_end_safety_seconds: 420`
|
||||
|
||||
This produces a 5-minute sample for at most 2 markets, fetching both `Up` and `Down` outcome tokens by batch request.
|
||||
|
||||
## Outputs
|
||||
|
||||
Raw gzip JSONL snapshots are written under:
|
||||
|
||||
```text
|
||||
data/live_sample/polymarket/orderbooks/<run_id>/
|
||||
```
|
||||
|
||||
The sample manifest is written to:
|
||||
|
||||
```text
|
||||
data/manifests/orderbook_collector_sample_manifest.json
|
||||
```
|
||||
|
||||
Files rotate by run for this checkpoint. Hourly rotation is intentionally left for a later sustained runtime checkpoint.
|
||||
|
||||
## Raw JSONL Envelope
|
||||
|
||||
Each gzip JSONL line is a raw-first envelope:
|
||||
|
||||
```json
|
||||
{
|
||||
"schema_name": "raw_orderbook_snapshot",
|
||||
"schema_version": 1,
|
||||
"collector": {
|
||||
"name": "polymarket_orderbook_collector",
|
||||
"version": "0.1.0"
|
||||
},
|
||||
"market": {
|
||||
"market_name": "polymarket",
|
||||
"market_slug": "example",
|
||||
"condition_id": "0x...",
|
||||
"token_id": "123",
|
||||
"outcome": "Up",
|
||||
"market_end_time_utc": "2026-04-14T22:00:00Z"
|
||||
},
|
||||
"collection": {
|
||||
"collected_at_utc": "2026-04-14T21:00:00Z",
|
||||
"sequence": 1,
|
||||
"response_index": 0
|
||||
},
|
||||
"request": {
|
||||
"method": "POST",
|
||||
"url": "https://clob.polymarket.com/books",
|
||||
"params": null,
|
||||
"json_body": [{"token_id": "123"}],
|
||||
"status_code": 200,
|
||||
"duration_ms": 123,
|
||||
"attempts": []
|
||||
},
|
||||
"raw": {}
|
||||
}
|
||||
```
|
||||
|
||||
The `raw` object is the unmodified order-book object returned by CLOB for that token.
|
||||
|
||||
## Rate-Limit Handling
|
||||
|
||||
The sample is conservative:
|
||||
|
||||
- Uses a small market cap by default.
|
||||
- Uses a fixed interval between batch requests.
|
||||
- Applies request timeout.
|
||||
- Retries `429` and `5xx` responses with exponential backoff.
|
||||
- Does not use concurrent requests.
|
||||
|
||||
## Shutdown
|
||||
|
||||
`SIGINT` and `SIGTERM` set a stop flag. The current request, if any, finishes or times out, the gzip file closes, and the manifest is written with a shutdown warning.
|
||||
|
||||
## Known Gaps
|
||||
|
||||
- This is a short run-rotated sample, not a daemon.
|
||||
- It does not prove 24/7 reliability.
|
||||
- It does not implement hourly rotation.
|
||||
- It does not refresh discovery during a run.
|
||||
- It does not normalize snapshots.
|
||||
- It does not upload files.
|
||||
- It does not use websockets.
|
||||
|
||||
The project must not claim production readiness until the later 24h soak test passes with documented quality metrics.
|
||||
54
docs/PRODUCTION_DEFINITION_OF_DONE.md
Normal file
54
docs/PRODUCTION_DEFINITION_OF_DONE.md
Normal file
|
|
@ -0,0 +1,54 @@
|
|||
# Production Definition Of Done
|
||||
|
||||
Status: ACTIVE
|
||||
|
||||
Defined at UTC: 2026-04-17T09:12:02Z
|
||||
|
||||
This project is done for the first production milestone only when it is reliably
|
||||
collecting Polymarket BTC order-book data on a small VPS with evidence on disk.
|
||||
Packaging, docs, local samples, and local soak tests are useful evidence, but
|
||||
not the finish line.
|
||||
|
||||
## Done Means
|
||||
|
||||
1. The collector runs on the VPS under systemd using `/opt/orderbooks` for code
|
||||
and `/var/lib/orderbooks` for data.
|
||||
2. Raw gzip JSONL order-book snapshots are written for active BTC up/down
|
||||
markets, with manifests beside them.
|
||||
3. The service survives a forced restart: after restart, a later collection
|
||||
cycle writes valid raw rows without corrupting prior files.
|
||||
4. Temporary network/API failure is handled as an operational failure, not data
|
||||
loss: failures are visible in logs/manifests, and the next successful cycle
|
||||
resumes writing new files.
|
||||
5. Google Drive upload runs from the VPS through `rclone`, verifies success, and
|
||||
leaves local files in place until upload is confirmed.
|
||||
6. A final production report and machine-readable manifest record exact commands,
|
||||
timestamps, files, checksums, restart result, upload result, and remaining
|
||||
risks.
|
||||
|
||||
## Not Required For This Milestone
|
||||
|
||||
- No second market.
|
||||
- No dashboard.
|
||||
- No database.
|
||||
- No strategy or backtest code.
|
||||
- No websocket rewrite unless polling proves insufficient.
|
||||
- No generic multi-market abstraction.
|
||||
|
||||
## Maximum Remaining Builder Turns
|
||||
|
||||
The remaining work is capped at three builder turns:
|
||||
|
||||
1. Accept deploy bundle and prepare the minimal VPS reliability gate.
|
||||
2. Execute or guide the VPS cutover and collect runtime evidence.
|
||||
3. Fix only blocking production issues found by the VPS gate, then write the
|
||||
final pass/fail report.
|
||||
|
||||
If actual VPS access is unavailable, the gate must be `BLOCKED_NEEDS_VPS_ACCESS`,
|
||||
not production ready.
|
||||
|
||||
## Current Evidence
|
||||
|
||||
- Deploy bundle gate: `DEPLOY_BUNDLE_READY`.
|
||||
- Local 24h soak final manifest exists but remains `NEEDS_REVIEW`.
|
||||
- Production readiness remains false until VPS runtime evidence exists.
|
||||
341
docs/VPS_CUTOVER_RUNBOOK.md
Normal file
341
docs/VPS_CUTOVER_RUNBOOK.md
Normal file
|
|
@ -0,0 +1,341 @@
|
|||
# VPS Cutover Runbook
|
||||
|
||||
Status: valid
|
||||
|
||||
Checkpoint 8 status is `WAIVED_BY_USER`, not `PASS`. This runbook prepares a
|
||||
VPS cutover for the existing Polymarket raw order-book collector only. It does
|
||||
not claim production readiness, second-market support, dashboards, databases,
|
||||
strategies, or trading.
|
||||
|
||||
## Scope
|
||||
|
||||
Included:
|
||||
|
||||
- VPS prerequisite checks.
|
||||
- Repository copy/update steps.
|
||||
- Public Polymarket collector service install.
|
||||
- Google Drive offload timer install with rclone.
|
||||
- Liveness, cycle health, and upload verification commands.
|
||||
- Rollback and stop commands.
|
||||
|
||||
Excluded:
|
||||
|
||||
- Private API access.
|
||||
- Wallets, keys, mnemonics, signing, order placement, or trading.
|
||||
- Database, dashboard, strategy, or second-market work.
|
||||
|
||||
## Recommended VPS Layout
|
||||
|
||||
Use the existing package paths unless the VPS has a reason to differ:
|
||||
|
||||
```text
|
||||
repository: /opt/orderbooks
|
||||
python virtualenv: /opt/orderbooks/.venv
|
||||
config: /etc/orderbooks/polymarket_collector.vps.yaml
|
||||
collector env: /etc/orderbooks/polymarket-orderbook-collector.env
|
||||
uploader env: /etc/orderbooks/orderbook-uploader.env
|
||||
data root: /var/lib/orderbooks
|
||||
raw files: /var/lib/orderbooks/raw_orderbooks
|
||||
manifests: /var/lib/orderbooks/manifests
|
||||
discovery: /var/lib/orderbooks/discovery
|
||||
```
|
||||
|
||||
The `orderbooks` system user should own `/var/lib/orderbooks`. The repository
|
||||
under `/opt/orderbooks` can be root-owned and world-readable.
|
||||
|
||||
## VPS Prerequisites
|
||||
|
||||
On Ubuntu or Debian:
|
||||
|
||||
```sh
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y git python3 python3-venv rclone
|
||||
sudo useradd --system --home /var/lib/orderbooks --shell /usr/sbin/nologin orderbooks || true
|
||||
sudo mkdir -p /opt /etc/orderbooks /var/lib/orderbooks/discovery /var/lib/orderbooks/raw_orderbooks /var/lib/orderbooks/manifests /var/log/orderbooks
|
||||
sudo chown -R orderbooks:orderbooks /var/lib/orderbooks /var/log/orderbooks
|
||||
```
|
||||
|
||||
No API keys, private keys, mnemonics, wallets, or trading credentials are
|
||||
required by this project. rclone credentials are the only machine-local
|
||||
credential material expected for Google Drive offload, and they must stay
|
||||
outside the repository.
|
||||
|
||||
## Copy Or Update The Repository
|
||||
|
||||
First install:
|
||||
|
||||
```sh
|
||||
cd /opt
|
||||
sudo git clone <repo-url> orderbooks
|
||||
```
|
||||
|
||||
Update an existing checkout:
|
||||
|
||||
```sh
|
||||
cd /opt/orderbooks
|
||||
sudo git fetch --all --prune
|
||||
sudo git pull --ff-only
|
||||
```
|
||||
|
||||
Prepare repository permissions and the Python virtualenv:
|
||||
|
||||
```sh
|
||||
cd /opt/orderbooks
|
||||
sudo chmod +x scripts/run_polymarket_collector_cycle.sh scripts/upload_archive_rclone.sh scripts/vps_preflight_check.sh scripts/vps_runtime_smoke_check.sh
|
||||
sudo python3 -m venv .venv
|
||||
sudo .venv/bin/python -m pip install --upgrade pip
|
||||
sudo chown -R root:root /opt/orderbooks
|
||||
sudo chmod -R a+rX /opt/orderbooks
|
||||
```
|
||||
|
||||
The current collector scripts use only the Python standard library, so no additional packages need to be installed into the virtualenv.
|
||||
|
||||
## Configure Public Collector Runtime
|
||||
|
||||
Install the example config, then review it:
|
||||
|
||||
```sh
|
||||
sudo install -o root -g root -m 0644 /opt/orderbooks/config/polymarket_collector.vps.example.yaml /etc/orderbooks/polymarket_collector.vps.yaml
|
||||
sudo editor /etc/orderbooks/polymarket_collector.vps.yaml
|
||||
```
|
||||
|
||||
Optional collector env overrides:
|
||||
|
||||
```sh
|
||||
sudo install -o root -g orderbooks -m 0640 /dev/null /etc/orderbooks/polymarket-orderbook-collector.env
|
||||
sudo editor /etc/orderbooks/polymarket-orderbook-collector.env
|
||||
```
|
||||
|
||||
Example values:
|
||||
|
||||
```text
|
||||
ORDERBOOKS_DATA_DIR=/var/lib/orderbooks
|
||||
ORDERBOOKS_OUTPUT_DIR=/var/lib/orderbooks/raw_orderbooks
|
||||
ORDERBOOKS_DISCOVERY_MAX_PAGES=3
|
||||
```
|
||||
|
||||
## Configure Rclone
|
||||
|
||||
Configure rclone as the `orderbooks` user. Do not print or commit
|
||||
`rclone.conf`.
|
||||
|
||||
```sh
|
||||
sudo -u orderbooks rclone config
|
||||
sudo -u orderbooks rclone listremotes
|
||||
sudo -u orderbooks rclone lsf gdrive: --max-depth 1
|
||||
```
|
||||
|
||||
Create the uploader env file:
|
||||
|
||||
```sh
|
||||
sudo install -o root -g orderbooks -m 0640 /dev/null /etc/orderbooks/orderbook-uploader.env
|
||||
sudo editor /etc/orderbooks/orderbook-uploader.env
|
||||
```
|
||||
|
||||
Example:
|
||||
|
||||
```text
|
||||
ORDERBOOKS_RCLONE_DEST=gdrive:orderbooks/polymarket
|
||||
ORDERBOOKS_RCLONE_BIN=/usr/bin/rclone
|
||||
ORDERBOOKS_UPLOAD_MIN_AGE_SECONDS=600
|
||||
```
|
||||
|
||||
The uploader verifies uploads with `rclone check`. Dry runs do not prove remote
|
||||
write access.
|
||||
|
||||
## Run VPS Preflight
|
||||
|
||||
Run the preflight before installing or starting services:
|
||||
|
||||
```sh
|
||||
cd /opt/orderbooks
|
||||
sudo -u orderbooks /opt/orderbooks/scripts/vps_preflight_check.sh \
|
||||
--app-dir /opt/orderbooks \
|
||||
--python-bin /opt/orderbooks/.venv/bin/python \
|
||||
--rclone-bin /usr/bin/rclone \
|
||||
--rclone-remote gdrive:orderbooks/polymarket \
|
||||
--data-dir /var/lib/orderbooks \
|
||||
--manifest-dir /var/lib/orderbooks/manifests \
|
||||
--log-dir /var/log/orderbooks \
|
||||
--min-free-gib 5
|
||||
```
|
||||
|
||||
The preflight does not print rclone configuration. It checks repository files,
|
||||
Python compilation, shell syntax, systemd unit parsing when available, rclone
|
||||
availability, optional remote readability, target directory writability, disk
|
||||
space, and the absence of required project secrets.
|
||||
|
||||
## Install Systemd Units
|
||||
|
||||
Install collector and uploader units:
|
||||
|
||||
```sh
|
||||
sudo install -o root -g root -m 0644 /opt/orderbooks/systemd/polymarket-orderbook-collector.service /etc/systemd/system/polymarket-orderbook-collector.service
|
||||
sudo install -o root -g root -m 0644 /opt/orderbooks/systemd/polymarket-orderbook-uploader.service /etc/systemd/system/polymarket-orderbook-uploader.service
|
||||
sudo install -o root -g root -m 0644 /opt/orderbooks/systemd/polymarket-orderbook-uploader.timer /etc/systemd/system/polymarket-orderbook-uploader.timer
|
||||
sudo systemctl daemon-reload
|
||||
sudo systemd-analyze verify /etc/systemd/system/polymarket-orderbook-collector.service /etc/systemd/system/polymarket-orderbook-uploader.service /etc/systemd/system/polymarket-orderbook-uploader.timer
|
||||
```
|
||||
|
||||
Enable and start:
|
||||
|
||||
```sh
|
||||
sudo systemctl enable --now polymarket-orderbook-collector.service
|
||||
sudo systemctl enable --now polymarket-orderbook-uploader.timer
|
||||
```
|
||||
|
||||
Run one uploader cycle immediately after the collector has produced closed raw
|
||||
files:
|
||||
|
||||
```sh
|
||||
sudo systemctl start polymarket-orderbook-uploader.service
|
||||
```
|
||||
|
||||
Run the minimal runtime reliability smoke gate after both units are installed,
|
||||
rclone is configured, and at least one closed raw file is older than the
|
||||
uploader minimum age (default: 600 seconds):
|
||||
|
||||
```sh
|
||||
sudo /opt/orderbooks/scripts/vps_runtime_smoke_check.sh \
|
||||
--app-dir /opt/orderbooks \
|
||||
--data-dir /var/lib/orderbooks \
|
||||
--raw-dir /var/lib/orderbooks/raw_orderbooks \
|
||||
--manifest-dir /var/lib/orderbooks/manifests \
|
||||
--collector-service polymarket-orderbook-collector.service \
|
||||
--uploader-service polymarket-orderbook-uploader.service \
|
||||
--wait-seconds 900
|
||||
```
|
||||
|
||||
This command is the minimal production reliability gate. It records a JSON
|
||||
evidence manifest under `/var/lib/orderbooks/manifests/`, verifies a valid
|
||||
collector cycle, forces one collector service restart, verifies the prior raw
|
||||
gzip file still parses with the same checksum, waits for a later valid cycle,
|
||||
starts the uploader, and records upload success or failure evidence. Preserve
|
||||
failed smoke manifests and journal logs for review.
|
||||
|
||||
## Check Liveness
|
||||
|
||||
Collector service:
|
||||
|
||||
```sh
|
||||
sudo systemctl status polymarket-orderbook-collector.service
|
||||
sudo journalctl -u polymarket-orderbook-collector.service --since "30 minutes ago"
|
||||
```
|
||||
|
||||
Uploader timer and service:
|
||||
|
||||
```sh
|
||||
sudo systemctl list-timers polymarket-orderbook-uploader.timer
|
||||
sudo systemctl status polymarket-orderbook-uploader.service
|
||||
sudo journalctl -u polymarket-orderbook-uploader.service --since "2 hours ago"
|
||||
```
|
||||
|
||||
Recent artifacts:
|
||||
|
||||
```sh
|
||||
find /var/lib/orderbooks/raw_orderbooks -type f -name '*.jsonl.gz' -printf '%TY-%Tm-%TdT%TH:%TM:%TS %s %p\n' | sort | tail
|
||||
find /var/lib/orderbooks/manifests -type f -name '*.json' -printf '%TY-%Tm-%TdT%TH:%TM:%TS %s %p\n' | sort | tail
|
||||
```
|
||||
|
||||
## Check Latest Cycle Health
|
||||
|
||||
Inspect the newest collector manifest:
|
||||
|
||||
```sh
|
||||
latest_collector="$(find /var/lib/orderbooks/manifests -type f -name 'polymarket_orderbook_collector_*.json' | sort | tail -n 1)"
|
||||
python3 -m json.tool "$latest_collector" | sed -n '1,180p'
|
||||
```
|
||||
|
||||
Minimum healthy signs:
|
||||
|
||||
```text
|
||||
gate_status: PASS
|
||||
rows_written: greater than 0
|
||||
failure_count: 0
|
||||
failures: []
|
||||
```
|
||||
|
||||
Verify that each raw gzip file listed in the latest collector manifest parses and that its row count matches the manifest:
|
||||
|
||||
```sh
|
||||
python3 - "$latest_collector" <<'PY'
|
||||
import gzip
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
manifest = json.loads(Path(sys.argv[1]).read_text())
|
||||
for item in manifest.get("output_files", []):
|
||||
    path = Path(item["path"])
|
||||
    rows = 0
|
||||
    with gzip.open(path, "rt", encoding="utf-8") as handle:
|
||||
        for line in handle:
|
||||
            if line.strip():
|
||||
                json.loads(line)
|
||||
                rows += 1
|
||||
    print({"path": str(path), "rows": rows, "manifest_rows": item.get("rows"), "matches": rows == item.get("rows")})
|
||||
PY
|
||||
```
|
||||
|
||||
## Verify Uploads
|
||||
|
||||
Inspect the newest upload manifest:
|
||||
|
||||
```sh
|
||||
latest_upload="$(find /var/lib/orderbooks/manifests -type f -name 'upload_archive_*.json' | sort | tail -n 1)"
|
||||
python3 -m json.tool "$latest_upload" | sed -n '1,220p'
|
||||
```
|
||||
|
||||
Minimum healthy signs:
|
||||
|
||||
```text
|
||||
operation_status: UPLOAD_VERIFIED
|
||||
gate_status: PASS
|
||||
rclone.copy_exit_code: 0
|
||||
rclone.check_exit_code: 0
|
||||
counts.uploaded equals counts.verified
|
||||
```
|
||||
|
||||
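Manual remote spot-check without printing config (first set `ORDERBOOKS_RCLONE_DEST` in your shell to the value used in `/etc/orderbooks/orderbook-uploader.env`, for example `gdrive:orderbooks/polymarket`; otherwise the variable expands to an empty string):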
|
||||
|
||||
```sh
|
||||
sudo -u orderbooks rclone lsf "$ORDERBOOKS_RCLONE_DEST" --max-depth 2 | head
|
||||
```
|
||||
|
||||
## Rollback Or Stop
|
||||
|
||||
Stop uploader timer first:
|
||||
|
||||
```sh
|
||||
sudo systemctl disable --now polymarket-orderbook-uploader.timer
|
||||
sudo systemctl stop polymarket-orderbook-uploader.service
|
||||
```
|
||||
|
||||
Stop collector:
|
||||
|
||||
```sh
|
||||
sudo systemctl stop polymarket-orderbook-collector.service
|
||||
```
|
||||
|
||||
Disable collector if needed:
|
||||
|
||||
```sh
|
||||
sudo systemctl disable polymarket-orderbook-collector.service
|
||||
```
|
||||
|
||||
Preserve `/var/lib/orderbooks`, including `/var/lib/orderbooks/manifests`, for evidence.
|
||||
If an artifact is wrong, label it as invalid or deprecated in a sibling note
|
||||
rather than deleting it.
|
||||
|
||||
## Still Not Production Proven
|
||||
|
||||
Because the local 24h soak wait was waived by the user, the following remain
|
||||
unproven:
|
||||
|
||||
- A completed 24h collector run with reviewed final metrics.
|
||||
- 24h interaction between collector rotation and uploader timer.
|
||||
- VPS-specific long-run disk, network, rclone, and systemd behavior.
|
||||
- Retention cleanup behavior under verified upload load.
|
||||
|
||||
Treat this as cutover preparation. The VPS is not deployed until the commands
|
||||
are run on the VPS and evidence is written.
|
||||
298
docs/VPS_DEPLOYMENT.md
Normal file
298
docs/VPS_DEPLOYMENT.md
Normal file
|
|
@ -0,0 +1,298 @@
|
|||
# VPS Deployment
|
||||
|
||||
Status: valid
|
||||
|
||||
This document covers the Checkpoint 6 systemd runtime package for the raw
|
||||
Polymarket order-book collector.
|
||||
|
||||
It does not claim production readiness or 24/7 reliability. That remains gated
|
||||
on the later 24h soak test.
|
||||
|
||||
## Scope
|
||||
|
||||
Included:
|
||||
|
||||
- systemd service for the raw collector cycle
|
||||
- Python virtualenv setup
|
||||
- service user and directory permissions
|
||||
- configurable data directory
|
||||
- discovery refresh before each collector cycle
|
||||
- journal-based logs
|
||||
- safe restart model for finite collector runs
|
||||
|
||||
Excluded:
|
||||
|
||||
- Google Drive offload
|
||||
- `rclone`
|
||||
- uploader scripts, services, or timers
|
||||
- normalization changes
|
||||
- dashboards
|
||||
- databases
|
||||
- strategies or backtests
|
||||
- trading, order placement, signing, or wallet logic
|
||||
|
||||
Uploader service and timer units are intentionally deferred to Checkpoint 7.
|
||||
|
||||
## Runtime Model
|
||||
|
||||
The systemd service runs:
|
||||
|
||||
```text
|
||||
/opt/orderbooks/scripts/run_polymarket_collector_cycle.sh
|
||||
```
|
||||
|
||||
Each cycle:
|
||||
|
||||
1. Refreshes BTC market discovery into the configured data directory.
|
||||
2. Runs `scripts/collect_polymarket_orderbooks.py` once.
|
||||
3. Writes run-rotated raw gzip JSONL files.
|
||||
4. Writes a per-cycle collector manifest.
|
||||
5. Exits after the configured finite duration.
|
||||
|
||||
The unit uses `Restart=always`, so systemd starts the next cycle after the prior
|
||||
cycle exits or fails.
|
||||
|
||||
The example config uses a 300 second collection cycle. This is deliberately
|
||||
short because current BTC up/down markets are short-lived and the collector
|
||||
refreshes discovery only before a cycle starts. Do not increase the cycle beyond
|
||||
the practical market horizon unless the collector later learns to refresh market
|
||||
selection during a run.
|
||||
|
||||
## Paths
|
||||
|
||||
Default VPS paths:
|
||||
|
||||
| Purpose | Path |
|
||||
| --- | --- |
|
||||
| Application checkout | `/opt/orderbooks` |
|
||||
| Python virtualenv | `/opt/orderbooks/.venv` |
|
||||
| Service config | `/etc/orderbooks/polymarket_collector.vps.yaml` |
|
||||
| Optional env override file | `/etc/orderbooks/polymarket-orderbook-collector.env` |
|
||||
| Data directory | `/var/lib/orderbooks` |
|
||||
| Discovery artifacts | `/var/lib/orderbooks/discovery` |
|
||||
| Raw order-book output base | `/var/lib/orderbooks/raw_orderbooks` |
|
||||
| Per-cycle manifests | `/var/lib/orderbooks/manifests` |
|
||||
|
||||
Adjust these paths if the repository is installed somewhere other than
|
||||
`/opt/orderbooks`.
|
||||
|
||||
## Environment Variables
|
||||
|
||||
The service defines safe defaults and can load overrides from:
|
||||
|
||||
```text
|
||||
/etc/orderbooks/polymarket-orderbook-collector.env
|
||||
```
|
||||
|
||||
Supported variables:
|
||||
|
||||
| Variable | Default | Meaning |
|
||||
| --- | --- | --- |
|
||||
| `ORDERBOOKS_APP_DIR` | `/opt/orderbooks` | Repository checkout path. |
|
||||
| `ORDERBOOKS_DATA_DIR` | `/var/lib/orderbooks` | Base directory for data files. |
|
||||
| `ORDERBOOKS_PYTHON` | `/opt/orderbooks/.venv/bin/python` | Python interpreter. |
|
||||
| `ORDERBOOKS_COLLECTOR_CONFIG` | `/etc/orderbooks/polymarket_collector.vps.yaml` | Collector config path. |
|
||||
| `ORDERBOOKS_DISCOVERY_DIR` | `$ORDERBOOKS_DATA_DIR/discovery` | Discovery artifact directory. |
|
||||
| `ORDERBOOKS_OUTPUT_DIR` | `$ORDERBOOKS_DATA_DIR/raw_orderbooks` | Collector output base directory. |
|
||||
| `ORDERBOOKS_MANIFEST_DIR` | `$ORDERBOOKS_DATA_DIR/manifests` | Per-cycle manifest directory. |
|
||||
| `ORDERBOOKS_DISCOVERY_LIMIT` | `100` | Maximum number of Gamma events requested per discovery page. |
|
||||
| `ORDERBOOKS_DISCOVERY_MAX_PAGES` | `3` | Discovery page cap per cycle. |
|
||||
| `ORDERBOOKS_DISCOVERY_TIMEOUT` | `15` | Discovery request timeout in seconds. |
|
||||
|
||||
Example override file:
|
||||
|
||||
```text
|
||||
ORDERBOOKS_DATA_DIR=/var/lib/orderbooks
|
||||
ORDERBOOKS_DISCOVERY_MAX_PAGES=3
|
||||
```
|
||||
|
||||
No API keys are required for this checkpoint.
|
||||
|
||||
## Install On Ubuntu Or Debian
|
||||
|
||||
Run package and account setup as root or with `sudo`:
|
||||
|
||||
```sh
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y git python3 python3-venv
|
||||
sudo useradd --system --home /var/lib/orderbooks --shell /usr/sbin/nologin orderbooks
|
||||
sudo mkdir -p /opt /etc/orderbooks /var/lib/orderbooks/discovery /var/lib/orderbooks/raw_orderbooks /var/lib/orderbooks/manifests
|
||||
```
|
||||
|
||||
Install or update the repository under `/opt/orderbooks`. One option is:
|
||||
|
||||
```sh
|
||||
cd /opt
|
||||
sudo git clone <repo-url> orderbooks
|
||||
```
|
||||
|
||||
If the checkout already exists:
|
||||
|
||||
```sh
|
||||
cd /opt/orderbooks
|
||||
sudo git pull --ff-only
|
||||
```
|
||||
|
||||
Prepare permissions:
|
||||
|
||||
```sh
|
||||
sudo chown -R root:root /opt/orderbooks
|
||||
sudo chmod -R a+rX /opt/orderbooks
|
||||
sudo chmod +x /opt/orderbooks/scripts/run_polymarket_collector_cycle.sh
|
||||
sudo chown -R orderbooks:orderbooks /var/lib/orderbooks
|
||||
```
|
||||
|
||||
Create the virtualenv:
|
||||
|
||||
```sh
|
||||
cd /opt/orderbooks
|
||||
sudo python3 -m venv .venv
|
||||
sudo .venv/bin/python -m pip install --upgrade pip
|
||||
sudo chown -R root:root .venv
|
||||
sudo chmod -R a+rX .venv
|
||||
```
|
||||
|
||||
The current Checkpoint 6 scripts use only the Python standard library.
|
||||
|
||||
Install the VPS config and service unit:
|
||||
|
||||
```sh
|
||||
sudo install -o root -g root -m 0644 /opt/orderbooks/config/polymarket_collector.vps.example.yaml /etc/orderbooks/polymarket_collector.vps.yaml
|
||||
sudo install -o root -g root -m 0644 /opt/orderbooks/systemd/polymarket-orderbook-collector.service /etc/systemd/system/polymarket-orderbook-collector.service
|
||||
```
|
||||
|
||||
Review `/etc/orderbooks/polymarket_collector.vps.yaml` before starting the
|
||||
service. The example writes under `/var/lib/orderbooks`.
|
||||
|
||||
Enable and start:
|
||||
|
||||
```sh
|
||||
sudo systemctl daemon-reload
|
||||
sudo systemctl enable --now polymarket-orderbook-collector.service
|
||||
```
|
||||
|
||||
## Logs And Status
|
||||
|
||||
Use the systemd journal:
|
||||
|
||||
```sh
|
||||
sudo systemctl status polymarket-orderbook-collector.service
|
||||
sudo journalctl -u polymarket-orderbook-collector.service -f
|
||||
```
|
||||
|
||||
Recent logs without following:
|
||||
|
||||
```sh
|
||||
sudo journalctl -u polymarket-orderbook-collector.service --since "1 hour ago"
|
||||
```
|
||||
|
||||
## Output Files
|
||||
|
||||
Raw gzip JSONL files are written under:
|
||||
|
||||
```text
|
||||
/var/lib/orderbooks/raw_orderbooks/polymarket/orderbooks/<run_id>/
|
||||
```
|
||||
|
||||
Per-cycle manifests are written under:
|
||||
|
||||
```text
|
||||
/var/lib/orderbooks/manifests/polymarket_orderbook_collector_<cycle_id>.json
|
||||
```
|
||||
|
||||
Discovery artifacts are refreshed under:
|
||||
|
||||
```text
|
||||
/var/lib/orderbooks/discovery/
|
||||
```
|
||||
|
||||
## Restart And Stop Behavior
|
||||
|
||||
The unit uses:
|
||||
|
||||
```text
|
||||
Restart=always
|
||||
RestartSec=30s
|
||||
TimeoutStopSec=90s
|
||||
KillSignal=SIGTERM
|
||||
KillMode=control-group
|
||||
```
|
||||
|
||||
The collector handles `SIGTERM` by finishing or timing out the current request,
|
||||
closing the gzip output, and writing the manifest. Every cycle writes to a new
|
||||
run directory, so closed files are not reopened by the next cycle.
|
||||
|
||||
Stop the service with:
|
||||
|
||||
```sh
|
||||
sudo systemctl stop polymarket-orderbook-collector.service
|
||||
```
|
||||
|
||||
Start it again with:
|
||||
|
||||
```sh
|
||||
sudo systemctl start polymarket-orderbook-collector.service
|
||||
```
|
||||
|
||||
## Local Validation Without Starting The Service
|
||||
|
||||
These checks do not require root:
|
||||
|
||||
```sh
|
||||
python3 -m py_compile scripts/discover_polymarket_btc_markets.py scripts/collect_polymarket_orderbooks.py
|
||||
bash -n scripts/run_polymarket_collector_cycle.sh
|
||||
python3 - <<'PY'
|
||||
from pathlib import Path
|
||||
from scripts.collect_polymarket_orderbooks import load_flat_yaml
|
||||
cfg = load_flat_yaml(Path('config/polymarket_collector.vps.example.yaml'))
|
||||
required = {
|
||||
'discovery_path',
|
||||
'output_dir',
|
||||
'manifest_path',
|
||||
'market_limit',
|
||||
'interval_seconds',
|
||||
'duration_seconds',
|
||||
}
|
||||
missing = sorted(required - set(cfg))
|
||||
assert not missing, missing
|
||||
assert cfg['duration_seconds'] > 0
|
||||
print('config parse ok')
|
||||
PY
|
||||
```
|
||||
|
||||
If systemd tools are available locally:
|
||||
|
||||
```sh
|
||||
systemd-analyze verify systemd/polymarket-orderbook-collector.service
|
||||
```
|
||||
|
||||
The local machine may not have `/opt/orderbooks` or the `orderbooks` service
|
||||
user. Treat missing VPS path or user messages as deployment-environment warnings,
|
||||
not collector syntax failures.
|
||||
|
||||
## Safe Upgrade
|
||||
|
||||
Stop the service, update files, rerun validation, then start the service:
|
||||
|
||||
```sh
|
||||
sudo systemctl stop polymarket-orderbook-collector.service
|
||||
cd /opt/orderbooks
|
||||
sudo git pull --ff-only
|
||||
sudo .venv/bin/python -m py_compile scripts/discover_polymarket_btc_markets.py scripts/collect_polymarket_orderbooks.py
|
||||
sudo systemctl daemon-reload
|
||||
sudo systemctl start polymarket-orderbook-collector.service
|
||||
```
|
||||
|
||||
Do not remove existing data files during an upgrade. If a bad artifact is found,
|
||||
preserve it and label it invalid or deprecated with a replacement path when one
|
||||
exists.
|
||||
|
||||
## Current Limits
|
||||
|
||||
- This package runs the existing raw collector; it does not add a daemon inside
|
||||
Python.
|
||||
- The systemd loop is a restart model around finite collector cycles.
|
||||
- It does not upload files.
|
||||
- It does not prove long-run reliability.
|
||||
- Production readiness remains blocked until discovery, raw collection, offload,
|
||||
and a documented 24h soak test all pass.
|
||||
366
scripts/build_vps_deploy_bundle.sh
Executable file
366
scripts/build_vps_deploy_bundle.sh
Executable file
|
|
@ -0,0 +1,366 @@
|
|||
#!/usr/bin/env bash
# Build a VPS deploy bundle (tarball plus JSON manifest) from a working tree.
set -o errexit -o nounset -o pipefail

# Defaults. Each value may be supplied through the environment variable named
# on the right-hand side; unset or empty variables fall back to the default.
APP_DIR="${ORDERBOOKS_APP_DIR:-$(pwd)}"
OUTPUT_DIR="${ORDERBOOKS_VPS_BUNDLE_OUTPUT_DIR:-artifacts/vps}"
TIMESTAMP="${ORDERBOOKS_VPS_BUNDLE_TIMESTAMP:-$(date -u +%Y%m%dT%H%M%SZ)}"

# Artifact names derived from the values above. A single trailing slash on
# OUTPUT_DIR is dropped so the joined paths never contain "//".
BUNDLE_BASENAME="orderbooks_vps_deploy_${TIMESTAMP}"
TARBALL="${OUTPUT_DIR%/}/${BUNDLE_BASENAME}.tar.gz"
MANIFEST="${OUTPUT_DIR%/}/${BUNDLE_BASENAME}_manifest.json"
|
||||
|
||||
# Print the command-line help text to stdout.
usage() {
  cat <<'USAGE_TEXT'
Usage: scripts/build_vps_deploy_bundle.sh [options]

Build a deployable VPS bundle from the current working tree. The bundle is
intended to be copied to a VPS and unpacked under /opt/orderbooks.

Options:
  --app-dir DIR      Source working tree. Default: ORDERBOOKS_APP_DIR or current directory.
  --output-dir DIR   Bundle output directory. Default: artifacts/vps.
  --timestamp TS     Override UTC timestamp used in artifact names.
  --help             Show this help.

The bundle uses a narrow allowlist and excludes live data, caches, git metadata,
virtualenvs, rclone config, private keys, wallets, mnemonics, and generated
artifacts. It does not print secrets and does not write Python bytecode.
USAGE_TEXT
}
|
||||
|
||||
# Abort with a usage error when a value-taking option is the last argument.
# Without this check, "$2" is unbound under `set -u` and the script dies with
# an opaque "unbound variable" message instead of pointing at the option.
#   $1 = option name, $2 = number of arguments left (option included)
_require_option_value() {
  if [[ "$2" -lt 2 ]]; then
    echo "Missing value for option: $1" >&2
    usage >&2
    exit 2
  fi
}

# Parse command-line options; they override the environment-derived defaults.
while [[ $# -gt 0 ]]; do
  case "$1" in
    --app-dir)
      _require_option_value "$1" "$#"
      APP_DIR="$2"
      shift 2
      ;;
    --output-dir)
      _require_option_value "$1" "$#"
      OUTPUT_DIR="$2"
      shift 2
      ;;
    --timestamp)
      _require_option_value "$1" "$#"
      TIMESTAMP="$2"
      shift 2
      ;;
    --help)
      usage
      exit 0
      ;;
    *)
      echo "Unknown argument: $1" >&2
      usage >&2
      exit 2
      ;;
  esac
done

# Re-derive the artifact names once, after all options are known, so that
# --output-dir and --timestamp take effect regardless of their order.
BUNDLE_BASENAME="orderbooks_vps_deploy_${TIMESTAMP}"
TARBALL="${OUTPUT_DIR%/}/${BUNDLE_BASENAME}.tar.gz"
MANIFEST="${OUTPUT_DIR%/}/${BUNDLE_BASENAME}_manifest.json"
|
||||
|
||||
# Drop one trailing slash from the source directory, but keep a bare "/"
# intact (stripping it would leave an empty path that fails the -d test).
if [[ "${APP_DIR}" != "/" ]]; then
  APP_DIR="${APP_DIR%/}"
fi
if [[ ! -d "${APP_DIR}" ]]; then
  echo "Source app directory does not exist: ${APP_DIR}" >&2
  exit 1
fi

# Enter the source tree BEFORE creating the output directory. TARBALL and
# MANIFEST are resolved relative to the working directory from here on, so a
# relative OUTPUT_DIR must be created inside APP_DIR; creating it in the
# caller's directory would leave tar and the manifest writer without a target.
cd "${APP_DIR}"
mkdir -p "${OUTPUT_DIR}"

# Never clobber an earlier bundle that used the same timestamp.
if [[ -e "${TARBALL}" || -e "${MANIFEST}" ]]; then
  echo "Refusing to overwrite existing bundle artifact: ${TARBALL} or ${MANIFEST}" >&2
  exit 1
fi

# Scratch file holding the NUL-separated list of paths handed to tar; it is
# removed on every exit path.
FILELIST="$(mktemp)"
trap 'rm -f "${FILELIST}"' EXIT
|
||||
|
||||
PYTHONDONTWRITEBYTECODE=1 python3 - "${FILELIST}" "${MANIFEST}" "${TARBALL}" "${TIMESTAMP}" <<'PY_BUNDLE_SELECT'
|
||||
import datetime as dt
|
||||
import fnmatch
|
||||
import hashlib
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Positional arguments passed in by the wrapping shell script.
cli_args = sys.argv[1:]
filelist_path = Path(cli_args[0])
manifest_path = Path(cli_args[1])
tarball_path = Path(cli_args[2])
timestamp = cli_args[3]
root = Path.cwd()

# --- Allowlist: only these files, directory trees and globs are considered. ---
allowed_files = [Path(name) for name in ("AGENTS.md", "ROADMAP.md")]
allowed_dirs = [
    Path(name)
    for name in ("config", "docs", "scripts", "systemd", "reports/checkpoints")
]
allowed_globs = ["data/manifests/checkpoint_*.json"]

# Recorded verbatim in the manifest as the advertised exclusion policy.
excluded_patterns = [
    ".git/",
    ".venv/",
    "artifacts/",
    "data/soak_test/",
    "data/live_sample/",
    "data/normalized_sample/",
    "**/__pycache__/",
    "**/*.pyc",
    "**/*.pyo",
    "**/.pytest_cache/",
    "**/.mypy_cache/",
    "**/.ruff_cache/",
    "**/rclone.conf",
    "**/.env",
    "**/*.pem",
    "**/*.key",
    "**/*.p12",
    "**/*.pfx",
    "**/id_rsa*",
    "**/id_ed25519*",
    "**/*mnemonic*",
    "**/*wallet*",
    "**/*credential*",
    "**/*secret*",
]

# The bundle is rejected unless every one of these paths ends up included.
required_files = [
    "AGENTS.md",
    "ROADMAP.md",
    "config/polymarket_collector.vps.example.yaml",
    "config/rclone.example.md",
    "docs/VPS_CUTOVER_RUNBOOK.md",
    "docs/VPS_DEPLOYMENT.md",
    "docs/GOOGLE_DRIVE_OFFLOAD.md",
    "scripts/build_vps_deploy_bundle.sh",
    "scripts/vps_preflight_check.sh",
    "scripts/vps_runtime_smoke_check.sh",
    "scripts/run_polymarket_collector_cycle.sh",
    "scripts/upload_archive_rclone.sh",
    "scripts/discover_polymarket_btc_markets.py",
    "scripts/collect_polymarket_orderbooks.py",
    "scripts/normalize_polymarket_orderbooks.py",
    "systemd/polymarket-orderbook-collector.service",
    "systemd/polymarket-orderbook-uploader.service",
    "systemd/polymarket-orderbook-uploader.timer",
]

# --- Deny rules consulted for each candidate path. ---
forbidden_path_fragments = [
    "/.git/",
    "/.venv/",
    "/__pycache__/",
    "/data/soak_test/",
    "/data/live_sample/",
    "/data/normalized_sample/",
    "/artifacts/",
]
forbidden_names = {"rclone.conf", ".env", "id_rsa", "id_ed25519"}
forbidden_suffixes = {".pyc", ".pyo", ".pem", ".key", ".p12", ".pfx"}
secretish_name_tokens = ["mnemonic", "wallet", "credential", "secret"]
|
||||
|
||||
def as_posix(path: Path) -> str:
    """Return ``path`` rendered with forward slashes."""
    rendered = path.as_posix()
    return rendered
|
||||
|
||||
def is_forbidden(path: Path) -> tuple[bool, str | None]:
    """Decide whether a candidate path must be kept out of the bundle.

    Args:
        path: Candidate path, expected to be relative to the source root.

    Returns:
        ``(True, reason)`` when a deny rule matches, with ``reason`` naming
        the rule; ``(False, None)`` otherwise.
    """
    if path.is_absolute() or ".." in path.parts:
        return True, "absolute_or_parent_path"

    rel = path.as_posix()
    # Wrap in slashes so a fragment such as "/.git/" matches at any depth,
    # including at the very start of the relative path.
    wrapped = f"/{rel}/" if path.is_dir() else f"/{rel}"
    for fragment in forbidden_path_fragments:
        if fragment in wrapped:
            return True, f"forbidden_fragment:{fragment}"

    metadata_dirs = {".git", ".venv", "__pycache__", ".pytest_cache", ".mypy_cache", ".ruff_cache"}
    if any(part in metadata_dirs for part in path.parts):
        return True, "forbidden_cache_or_metadata_dir"

    lower_name = path.name.lower()
    if lower_name in forbidden_names:
        return True, f"forbidden_name:{path.name}"
    # The exclusion policy recorded in the manifest is "id_rsa*" and
    # "id_ed25519*", so match by prefix: this also catches "id_rsa.pub",
    # "id_rsa_backup" and similar, which the exact-name set lets through.
    if lower_name.startswith(("id_rsa", "id_ed25519")):
        return True, f"forbidden_name:{path.name}"
    if path.suffix.lower() in forbidden_suffixes:
        return True, f"forbidden_suffix:{path.suffix}"
    if any(token in lower_name for token in secretish_name_tokens):
        return True, f"secretish_name:{path.name}"
    if rel.startswith(("data/soak_test/", "data/live_sample/", "data/normalized_sample/", "artifacts/")):
        return True, "forbidden_prefix"
    return False, None
|
||||
|
||||
def iter_allowed_files():
    """Yield every allow-listed regular file once, relative to ``root``.

    Sources are visited in order: ``allowed_files``, then each existing tree
    in ``allowed_dirs`` (sorted), then each pattern in ``allowed_globs``
    (sorted). A path already yielded is not yielded again.

    Glob matches are converted back to root-relative paths. ``root`` is an
    absolute directory, so ``root.glob()`` produces absolute paths; yielding
    those unchanged would make ``is_forbidden`` reject every match as an
    absolute path and the globbed files would never reach the bundle.
    """
    seen = set()

    def unseen_files(candidates):
        # Keep regular files only, dropping anything yielded earlier.
        for candidate in candidates:
            if candidate.is_file() and candidate not in seen:
                seen.add(candidate)
                yield candidate

    yield from unseen_files(allowed_files)
    for directory in allowed_dirs:
        if directory.exists():
            yield from unseen_files(sorted(directory.rglob("*")))
    for pattern in allowed_globs:
        matches = sorted(root.glob(pattern))
        yield from unseen_files(match.relative_to(root) for match in matches)
|
||||
|
||||
def sha256_file(path: Path) -> str:
    """Return the hex SHA-256 digest of the file at ``path``.

    The file is read in 1 MiB pieces so it is never held in memory whole.
    """
    hasher = hashlib.sha256()
    piece_size = 1024 * 1024
    with path.open("rb") as stream:
        while piece := stream.read(piece_size):
            hasher.update(piece)
    return hasher.hexdigest()
|
||||
|
||||
# Split the allow-listed candidates into bundle members and rejected paths.
included = []
excluded = []
for candidate in iter_allowed_files():
    rejected, why = is_forbidden(candidate)
    rel_path = as_posix(candidate)
    if rejected:
        excluded.append({"path": rel_path, "reason": why})
    else:
        included.append(
            {
                "path": rel_path,
                "bytes": candidate.stat().st_size,
                "sha256": sha256_file(candidate),
            }
        )

included_paths = sorted(entry["path"] for entry in included)
missing_required = sorted(name for name in required_files if name not in included_paths)
if missing_required:
    raise SystemExit(f"missing required bundle files: {missing_required}")
if not included:
    raise SystemExit("bundle file list is empty")

# tar reads this list with --null, so every entry is NUL-terminated.
filelist_path.write_bytes(b"".join(name.encode("utf-8") + b"\0" for name in included_paths))

now_utc = dt.datetime.now(dt.UTC).replace(microsecond=0)
created_at = now_utc.isoformat().replace("+00:00", "Z")

# Pre-tar validation state; the tarball fields are filled in after tar runs.
validation = {
    "required_files_present_before_tar": not missing_required,
    "forbidden_paths_absent_before_tar": True,
    "tarball_validation_completed": False,
}
manifest = {
    "schema_name": "vps_deploy_bundle_manifest",
    "schema_version": 1,
    "created_at_utc": created_at,
    "timestamp": timestamp,
    "tarball_path": as_posix(tarball_path),
    "manifest_path": as_posix(manifest_path),
    "source_root": str(root),
    "bundle_intent": "Copy to a VPS and unpack under /opt/orderbooks; VPS execution remains pending.",
    "production_ready": False,
    "vps_deployed": False,
    "included_roots": [str(path) for path in allowed_files + allowed_dirs] + allowed_globs,
    "excluded_patterns": excluded_patterns,
    "required_files": required_files,
    "included_file_count": len(included),
    "included_files": included,
    "excluded_selected_files": excluded,
    "missing_required_files": missing_required,
    "validation": validation,
}
manifest_text = json.dumps(manifest, indent=2, sort_keys=True) + "\n"
manifest_path.write_text(manifest_text, encoding="utf-8")
|
||||
PY_BUNDLE_SELECT
|
||||
|
||||
# Pack exactly the NUL-separated paths selected above. Ownership is recorded
# as numeric uid/gid 0 rather than the building user's identity.
tar --create --gzip \
  --file "${TARBALL}" \
  --null --files-from "${FILELIST}" \
  --owner=0 --group=0 --numeric-owner
|
||||
|
||||
PYTHONDONTWRITEBYTECODE=1 python3 - "${TARBALL}" "${MANIFEST}" <<'PY_BUNDLE_VALIDATE'
|
||||
import hashlib
|
||||
import json
|
||||
import sys
|
||||
import tarfile
|
||||
from pathlib import Path
|
||||
|
||||
# Arguments from the shell wrapper: the freshly written tarball and the
# manifest produced by the selection step.
tarball_path, manifest_path = Path(sys.argv[1]), Path(sys.argv[2])
manifest_text = manifest_path.read_text(encoding="utf-8")
manifest = json.loads(manifest_text)
required_files = {name for name in manifest["required_files"]}
|
||||
|
||||
def sha256_file(path: Path) -> str:
    """Hash the file at ``path`` with SHA-256 and return the hex digest.

    Data is streamed through a reusable 1 MiB buffer.
    """
    hasher = hashlib.sha256()
    window = memoryview(bytearray(1024 * 1024))
    with path.open("rb") as stream:
        while True:
            filled = stream.readinto(window)
            if not filled:
                break
            hasher.update(window[:filled])
    return hasher.hexdigest()
|
||||
|
||||
def forbidden_reason(name: str) -> str | None:
    """Return why a tar member name is not allowed, or ``None`` if it is fine.

    Args:
        name: Member name as stored in the archive, using "/" separators.

    Returns:
        A short reason code for the first deny rule that matches, else ``None``.
    """
    parts = name.split("/")
    lower_name = parts[-1].lower()
    if name.startswith("/") or ".." in parts:
        return "absolute_or_parent_path"
    if parts[0] in {".git", ".venv", "artifacts"}:
        return f"forbidden_top_level:{parts[0]}"
    if len(parts) >= 2 and parts[0] == "data" and parts[1] in {"soak_test", "live_sample", "normalized_sample"}:
        return f"forbidden_data_dir:data/{parts[1]}"
    if any(part in {".git", ".venv", "__pycache__", ".pytest_cache", ".mypy_cache", ".ruff_cache"} for part in parts):
        return "forbidden_cache_or_metadata_dir"
    # Private-key names are matched by prefix so that "id_rsa.pub",
    # "id_ed25519_old" and similar are caught too, in line with the
    # "id_rsa*" / "id_ed25519*" exclusion patterns recorded in the manifest.
    if lower_name in {"rclone.conf", ".env"} or lower_name.startswith(("id_rsa", "id_ed25519")):
        return f"forbidden_name:{lower_name}"
    if lower_name.endswith((".pyc", ".pyo", ".pem", ".key", ".p12", ".pfx")):
        return "forbidden_suffix"
    if any(token in lower_name for token in ("mnemonic", "wallet", "credential", "secret")):
        return "secretish_name"
    return None
|
||||
|
||||
# Collect the names of the regular-file members of the archive.
with tarfile.open(tarball_path, "r:gz") as archive:
    names = sorted(member.name for member in archive.getmembers() if member.isfile())

# Re-check the deny rules and the required-file list against the archive.
forbidden = []
for name in names:
    reason = forbidden_reason(name)
    if reason:
        forbidden.append({"path": name, "reason": reason})
missing_required = sorted(required_files - set(names))


def _save_manifest() -> None:
    # Persist the manifest in the same layout the selection step used.
    manifest_path.write_text(json.dumps(manifest, indent=2, sort_keys=True) + "\n", encoding="utf-8")


if forbidden or missing_required:
    # Record the failure details in the manifest before aborting.
    manifest["validation"].update(
        {
            "tarball_validation_completed": True,
            "forbidden_paths_absent_in_tarball": not forbidden,
            "required_files_present_in_tarball": not missing_required,
            "forbidden_paths_in_tarball": forbidden,
            "missing_required_files_in_tarball": missing_required,
        }
    )
    _save_manifest()
    raise SystemExit(f"bundle validation failed forbidden={forbidden} missing_required={missing_required}")

# Success: add the tarball facts and mark the gate as passed.
manifest.update(
    {
        "tarball_bytes": tarball_path.stat().st_size,
        "tarball_sha256": sha256_file(tarball_path),
        "tarball_content_count": len(names),
        "tarball_contents": names,
    }
)
manifest["validation"].update(
    {
        "tarball_validation_completed": True,
        "forbidden_paths_absent_in_tarball": True,
        "required_files_present_in_tarball": True,
        "forbidden_paths_in_tarball": [],
        "missing_required_files_in_tarball": [],
    }
)
manifest["gate_status"] = "PASS"
_save_manifest()
|
||||
PY_BUNDLE_VALIDATE
|
||||
|
||||
# Emit KEY=value summary lines: the two artifact paths, then the tarball
# facts read back from the validated manifest.
printf 'BUNDLE_TARBALL=%s\n' "${TARBALL}"
printf 'BUNDLE_MANIFEST=%s\n' "${MANIFEST}"
python3 - "${MANIFEST}" <<'PY_PRINT'
import json
import sys

with open(sys.argv[1], encoding="utf-8") as handle:
    summary = json.load(handle)

for label, key in (
    ("BUNDLE_SHA256", "tarball_sha256"),
    ("BUNDLE_BYTES", "tarball_bytes"),
    ("BUNDLE_FILE_COUNT", "tarball_content_count"),
):
    print(f"{label}={summary[key]}")
PY_PRINT
|
||||
668
scripts/collect_polymarket_orderbooks.py
Executable file
668
scripts/collect_polymarket_orderbooks.py
Executable file
|
|
@ -0,0 +1,668 @@
|
|||
#!/usr/bin/env python3
|
||||
"""Minimal raw Polymarket order-book snapshot sample collector.
|
||||
|
||||
Checkpoint 4 scope: finite sample run only. This script reads the BTC discovery
|
||||
artifact, fetches public CLOB batch order books for a small market set, writes
|
||||
raw gzip JSONL envelopes, and closes with a manifest. It is not a daemon and it
|
||||
does not trade.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import datetime as dt
|
||||
import gzip
|
||||
import hashlib
|
||||
import json
|
||||
import signal
|
||||
import sys
|
||||
import time
|
||||
import urllib.error
|
||||
import urllib.request
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
|
||||
# Identity and schema tags written into every snapshot envelope.
COLLECTOR_NAME = "polymarket_orderbook_collector"
COLLECTOR_VERSION = "0.1.0"
SCHEMA_NAME = "raw_orderbook_snapshot"
SCHEMA_VERSION = 1

# Default endpoint for the batch order-book request.
CLOB_BOOKS_URL = "https://clob.polymarket.com/books"

# Fallback locations used when neither the CLI nor the config file sets them.
DEFAULT_CONFIG_PATH = Path("config/polymarket_collector.example.yaml")
DEFAULT_DISCOVERY_PATH = Path("data/discovery/polymarket_btc_markets_latest.json")
DEFAULT_OUTPUT_DIR = Path("data/live_sample")
DEFAULT_MANIFEST_PATH = Path("data/manifests/orderbook_collector_sample_manifest.json")

# Lower-cased response header names that may be copied into stored records;
# any header not listed here is dropped.
SAFE_RESPONSE_HEADERS = {
    # caching / CDN
    "cache-control",
    "cf-cache-status",
    "cf-ray",
    # payload description
    "content-length",
    "content-type",
    "date",
    "retry-after",
    "server",
    # rate limiting
    "x-ratelimit-limit",
    "x-ratelimit-remaining",
    "x-ratelimit-reset",
    "ratelimit-limit",
    "ratelimit-remaining",
    "ratelimit-reset",
}

# Stop state: both values are updated by the signal handler.
STOP_REQUESTED = False
STOP_SIGNAL: str | None = None
|
||||
|
||||
|
||||
def handle_stop(signum: int, _frame: Any) -> None:
    """Signal handler that records a stop request.

    Sets the module flag ``STOP_REQUESTED`` and stores the signal's symbolic
    name (for example ``"SIGTERM"``) in ``STOP_SIGNAL``. The frame argument
    is ignored.
    """
    global STOP_REQUESTED, STOP_SIGNAL
    STOP_REQUESTED = True
    received = signal.Signals(signum)
    STOP_SIGNAL = received.name
|
||||
|
||||
|
||||
def utc_now() -> dt.datetime:
|
||||
return dt.datetime.now(dt.UTC)
|
||||
|
||||
|
||||
def iso_z(value: dt.datetime | None = None) -> str:
|
||||
value = value or utc_now()
|
||||
return value.astimezone(dt.UTC).replace(microsecond=0).isoformat().replace("+00:00", "Z")
|
||||
|
||||
|
||||
def compact_timestamp(value: dt.datetime | None = None) -> str:
|
||||
value = value or utc_now()
|
||||
return value.astimezone(dt.UTC).strftime("%Y%m%dT%H%M%SZ")
|
||||
|
||||
|
||||
def parse_iso(value: Any) -> dt.datetime | None:
|
||||
if not isinstance(value, str) or not value.strip():
|
||||
return None
|
||||
text = value.strip()
|
||||
if text.endswith("Z"):
|
||||
text = text[:-1] + "+00:00"
|
||||
try:
|
||||
parsed = dt.datetime.fromisoformat(text)
|
||||
except ValueError:
|
||||
return None
|
||||
if parsed.tzinfo is None:
|
||||
parsed = parsed.replace(tzinfo=dt.UTC)
|
||||
return parsed.astimezone(dt.UTC)
|
||||
|
||||
|
||||
def sha256_file(path: Path) -> str:
    """Compute the SHA-256 hex digest of a file, reading 1 MiB at a time."""
    block_size = 1024 * 1024
    hasher = hashlib.sha256()
    with path.open("rb") as stream:
        block = stream.read(block_size)
        while block:
            hasher.update(block)
            block = stream.read(block_size)
    return hasher.hexdigest()
|
||||
|
||||
|
||||
def parse_scalar(value: str) -> Any:
    """Convert the text of one config value into a Python scalar.

    After trimming whitespace, rules are tried in this order: empty text
    stays ``""``; text wrapped in matching single or double quotes yields the
    inner text; ``true``/``false`` (any case) yield booleans; ``null``/``none``
    (any case) yield ``None``; then ``int``, then ``float``; anything else is
    returned as the trimmed string.
    """
    text = value.strip()
    if text == "":
        return ""

    opening = text[0]
    if opening in ("'", '"') and text.endswith(opening):
        return text[1:-1]

    keywords = {"true": True, "false": False, "null": None, "none": None}
    lowered = text.lower()
    if lowered in keywords:
        return keywords[lowered]

    for convert in (int, float):
        try:
            return convert(text)
        except ValueError:
            continue
    return text
|
||||
|
||||
|
||||
def _strip_yaml_comment(raw_line: str) -> str:
    """Return ``raw_line`` without its trailing ``#`` comment, if it has one.

    A ``#`` starts a comment only at the start of the line or directly after
    whitespace, and never inside a value that opens with a quote character.
    """
    quote: str | None = None
    seen_colon = False
    at_value_start = False
    for index, char in enumerate(raw_line):
        if quote is not None:
            # Inside a quoted value: only the matching quote ends it.
            if char == quote:
                quote = None
            continue
        if char == "#" and (index == 0 or raw_line[index - 1].isspace()):
            return raw_line[:index]
        if not seen_colon:
            if char == ":":
                seen_colon = True
                at_value_start = True
            continue
        if at_value_start and not char.isspace():
            # First character of the value decides whether it is quoted.
            at_value_start = False
            if char in ("'", '"'):
                quote = char
    return raw_line


def load_flat_yaml(path: Path) -> dict[str, Any]:
    """Parse the flat ``key: value`` YAML subset used by the example config.

    Blank lines and comments are ignored. A ``#`` inside a quoted value, or
    one not preceded by whitespace (such as a URL fragment), is kept as part
    of the value instead of truncating it. Escaped quotes inside quoted
    values are not supported.

    Args:
        path: Config file to read. A missing file yields an empty dict.

    Returns:
        Mapping of keys to values converted by ``parse_scalar``; a repeated
        key keeps its last value.

    Raises:
        ValueError: If a non-empty line has no ``:`` or has an empty key.
    """
    config: dict[str, Any] = {}
    if not path.exists():
        return config
    for line_number, raw_line in enumerate(path.read_text(encoding="utf-8").splitlines(), 1):
        line = _strip_yaml_comment(raw_line).strip()
        if not line:
            continue
        if ":" not in line:
            raise ValueError(f"Unsupported config line {line_number}: {raw_line}")
        key, value = line.split(":", 1)
        key = key.strip()
        if not key:
            raise ValueError(f"Missing config key on line {line_number}")
        config[key] = parse_scalar(value)
    return config
|
||||
|
||||
|
||||
def config_digest(path: Path | None) -> str | None:
    """Return the SHA-256 hex digest of the config file, if there is one.

    ``None`` is returned when no path was given or the file does not exist.
    """
    if path is not None and path.exists():
        return sha256_file(path)
    return None
|
||||
|
||||
|
||||
def filter_headers(headers: Any) -> dict[str, str]:
    """Keep only the response headers listed in ``SAFE_RESPONSE_HEADERS``.

    Names are compared case-insensitively; the kept entries retain the
    spelling they arrived with.
    """
    return {
        name: content
        for name, content in dict(headers).items()
        if name.lower() in SAFE_RESPONSE_HEADERS
    }
|
||||
|
||||
|
||||
def http_post_json(
    *,
    url: str,
    json_body: Any,
    timeout_seconds: float,
    max_retries: int,
    backoff_seconds: float,
) -> dict[str, Any]:
    """POST ``json_body`` to ``url`` as JSON and return a full evidence record.

    HTTP 429 and 5xx responses are retried up to ``max_retries`` times with
    exponential backoff; a numeric ``Retry-After`` header raises the wait if
    it is longer. Other HTTP errors and transport failures are not retried.
    The function does not raise for request failures; they are recorded.

    Args:
        url: Endpoint to call.
        json_body: JSON-serialisable request payload.
        timeout_seconds: Timeout passed to ``urlopen`` for each attempt.
        max_retries: Number of retries after the first attempt.
        backoff_seconds: Base delay, doubled after every retried attempt.

    Returns:
        A dict with ``request``, ``response`` (status, filtered headers,
        parsed JSON or a text preview), ``attempts`` (one entry per try),
        the summed ``duration_ms`` and an ``ok`` flag. ``ok`` is true only
        when the last attempt had no transport error, a 2xx status, and a
        body that did not fail JSON decoding.
    """
    body_bytes = json.dumps(json_body, separators=(",", ":")).encode("utf-8")
    attempts: list[dict[str, Any]] = []
    final_json: Any | None = None
    final_text_preview: str | None = None
    final_json_error: str | None = None
    final_status_code: int | None = None
    final_headers: dict[str, str] = {}
    final_error: str | None = None

    for attempt_index in range(max_retries + 1):
        started_at = iso_z()
        started_monotonic = time.monotonic()
        status_code: int | None = None
        response_headers: dict[str, str] = {}
        response_text = ""
        error: str | None = None
        try:
            request = urllib.request.Request(
                url,
                data=body_bytes,
                headers={
                    "Accept": "application/json",
                    "Content-Type": "application/json",
                    "User-Agent": "orderbooks-checkpoint-4-sample/0.1.0",
                },
                method="POST",
            )
            with urllib.request.urlopen(request, timeout=timeout_seconds) as response:
                status_code = response.status
                response_headers = filter_headers(response.headers)
                response_text = response.read().decode("utf-8", errors="replace")
        except urllib.error.HTTPError as exc:
            status_code = exc.code
            response_headers = filter_headers(exc.headers)
            response_text = exc.read().decode("utf-8", errors="replace")
            error = f"HTTPError: {exc}"
        except Exception as exc:  # noqa: BLE001 - preserve request failure evidence
            error = f"{type(exc).__name__}: {exc}"

        duration_ms = round((time.monotonic() - started_monotonic) * 1000, 3)
        parsed_json = None
        json_error = None
        if response_text:
            try:
                parsed_json = json.loads(response_text)
            except json.JSONDecodeError as exc:
                json_error = str(exc)

        attempts.append(
            {
                "attempt": attempt_index + 1,
                "started_at_utc": started_at,
                "ended_at_utc": iso_z(),
                "duration_ms": duration_ms,
                "status_code": status_code,
                "headers": response_headers,
                "error": error,
                "json_error": json_error,
            }
        )
        final_json = parsed_json
        final_json_error = json_error
        final_text_preview = response_text[:1000] if parsed_json is None else None
        final_status_code = status_code
        final_headers = response_headers
        final_error = error

        retryable = status_code == 429 or (status_code is not None and 500 <= status_code <= 599)
        if error is None and status_code is not None and 200 <= status_code < 300:
            break
        if not retryable or attempt_index >= max_retries or STOP_REQUESTED:
            break

        # Header names are case-insensitive, so do not rely on one spelling.
        retry_after = next(
            (content for name, content in response_headers.items() if name.lower() == "retry-after"),
            None,
        )
        sleep_seconds = backoff_seconds * (2**attempt_index)
        if retry_after:
            try:
                sleep_seconds = max(sleep_seconds, float(retry_after))
            except ValueError:
                # Non-numeric Retry-After (e.g. an HTTP date): keep the backoff.
                pass

        # Sleep in slices of at most one second so a stop signal received
        # during a long backoff ends the wait promptly.
        wake_at = time.monotonic() + sleep_seconds
        while not STOP_REQUESTED:
            remaining = wake_at - time.monotonic()
            if remaining <= 0:
                break
            time.sleep(min(remaining, 1.0))
        if STOP_REQUESTED:
            break

    return {
        "request": {
            "method": "POST",
            "url": url,
            "json_body": json_body,
        },
        "response": {
            "status_code": final_status_code,
            "headers": final_headers,
            "json": final_json,
            "json_error": final_json_error,
            "text_preview": final_text_preview,
        },
        "attempts": attempts,
        "duration_ms": round(sum(attempt["duration_ms"] for attempt in attempts), 3),
        # A 2xx status line followed by a failed body read is not a success.
        "ok": (
            final_error is None
            and final_status_code is not None
            and 200 <= final_status_code < 300
            and final_json_error is None
        ),
    }
|
||||
|
||||
|
||||
def load_discovery(path: Path) -> dict[str, Any]:
    """Read the discovery artifact at ``path`` and return the decoded JSON."""
    raw_text = path.read_text(encoding="utf-8")
    return json.loads(raw_text)
|
||||
|
||||
|
||||
def market_is_usable(market: dict[str, Any], now: dt.datetime, safety_seconds: int) -> tuple[bool, list[str]]:
    """Check whether a discovered market can be sampled.

    Args:
        market: One normalized market record from the discovery artifact.
        now: Reference time for the expiry check.
        safety_seconds: Minimum time that must remain before the market ends.

    Returns:
        ``(usable, reasons)``; ``reasons`` lists every failed check in a fixed
        order and is empty when the market is usable.
    """
    reasons: list[str] = []
    if market.get("active") is not True:
        reasons.append("not_active")
    if market.get("closed") is not False:
        reasons.append("closed")
    if market.get("accepting_orders") is not True:
        reasons.append("not_accepting_orders")
    if market.get("enable_order_book") is not True:
        reasons.append("order_book_not_enabled")

    end_time = parse_iso(market.get("end_time_utc"))
    if end_time is None:
        reasons.append("missing_end_time")
    elif end_time <= now + dt.timedelta(seconds=safety_seconds):
        reasons.append("too_close_to_end_or_expired")

    tokens = market.get("tokens")
    if not isinstance(tokens, list) or len(tokens) < 2:
        reasons.append("missing_two_tokens")
    else:
        # Validate the first two entries exactly as they are stored. Later
        # code reads tokens[:2] directly, so dropping non-dict entries before
        # this check would let a list such as [Up, "junk", Down] pass here and
        # fail downstream.
        leading = tokens[:2]
        if not all(isinstance(token, dict) for token in leading):
            reasons.append("bad_up_down_token_mapping")
        else:
            outcomes = [token.get("outcome") for token in leading]
            token_ids = [token.get("token_id") for token in leading]
            if outcomes != ["Up", "Down"] or not all(token_ids):
                reasons.append("bad_up_down_token_mapping")
    return not reasons, reasons
|
||||
|
||||
|
||||
def select_markets(
    discovery: dict[str, Any],
    *,
    market_limit: int,
    market_end_safety_seconds: int,
) -> tuple[list[dict[str, Any]], dict[str, int]]:
    """Pick usable markets from the discovery artifact, in artifact order.

    Scanning stops as soon as ``market_limit`` markets have been accepted, so
    entries after that point are neither selected nor counted.

    Returns:
        ``(selected, rejection_counts)`` where ``rejection_counts`` maps each
        rejection reason to its number of occurrences, sorted by reason.
    """
    reference_time = utc_now()
    chosen: list[dict[str, Any]] = []
    tally: dict[str, int] = {}

    def count(reason: str) -> None:
        tally[reason] = tally.get(reason, 0) + 1

    for entry in discovery.get("normalized_markets") or []:
        if not isinstance(entry, dict):
            count("not_object")
            continue
        accepted, why_not = market_is_usable(entry, reference_time, market_end_safety_seconds)
        if accepted:
            chosen.append(entry)
            if len(chosen) >= market_limit:
                break
        else:
            for reason in why_not:
                count(reason)

    ordered_tally = {reason: tally[reason] for reason in sorted(tally)}
    return chosen, ordered_tally
|
||||
|
||||
|
||||
def flatten_tokens(markets: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Expand markets into one record per token, keeping market context.

    Only the first two tokens of each market are used. ``token_id`` is
    converted to ``str``.
    """
    return [
        {
            "market_name": market.get("market_name"),
            "market_slug": market.get("market_slug"),
            "condition_id": market.get("condition_id"),
            "token_id": str(token.get("token_id")),
            "outcome": token.get("outcome"),
            "market_end_time_utc": market.get("end_time_utc"),
        }
        for market in markets
        for token in market.get("tokens", [])[:2]
    ]
|
||||
|
||||
|
||||
def build_snapshot_envelope(
    *,
    raw_book: dict[str, Any],
    token_meta: dict[str, Any],
    collected_at_utc: str,
    sequence: int,
    request_record: dict[str, Any],
    response_index: int,
) -> dict[str, Any]:
    """Wrap one raw order book in the envelope written to the output file.

    The envelope carries schema and collector identity, the market fields
    copied from ``token_meta``, collection bookkeeping, a summary of the HTTP
    exchange taken from ``request_record``, and the untouched ``raw_book``.
    """
    sent = request_record["request"]
    market_keys = (
        "market_name",
        "market_slug",
        "condition_id",
        "token_id",
        "outcome",
        "market_end_time_utc",
    )
    market_section = {key: token_meta.get(key) for key in market_keys}
    collection_section = {
        "collected_at_utc": collected_at_utc,
        "sequence": sequence,
        "response_index": response_index,
    }
    request_section = {
        "method": sent["method"],
        "url": sent["url"],
        "params": None,
        "json_body": sent["json_body"],
        "status_code": request_record["response"]["status_code"],
        "duration_ms": request_record["duration_ms"],
        "attempts": request_record["attempts"],
    }
    return {
        "schema_name": SCHEMA_NAME,
        "schema_version": SCHEMA_VERSION,
        "collector": {"name": COLLECTOR_NAME, "version": COLLECTOR_VERSION},
        "market": market_section,
        "collection": collection_section,
        "request": request_section,
        "raw": raw_book,
    }
|
||||
|
||||
|
||||
def summarize_output_file(path: Path, rows_written: int) -> dict[str, Any]:
    """Describe an output file for the manifest.

    ``status`` is ``"valid"`` only for an existing, non-empty file and
    ``"missing"`` otherwise. Size and digest are reported for any existing
    file; a file that does not exist gets ``bytes`` 0 and ``sha256`` ``None``.
    """
    present = path.exists()
    size = path.stat().st_size if present else 0
    digest = sha256_file(path) if present else None
    status = "valid" if present and size > 0 else "missing"
    return {
        "path": path.as_posix(),
        "status": status,
        "bytes": size,
        "rows": rows_written,
        "sha256": digest,
    }
|
||||
|
||||
|
||||
def write_manifest(path: Path, manifest: dict[str, Any]) -> None:
    """Write ``manifest`` to ``path`` as indented, key-sorted JSON.

    Missing parent directories are created and the file ends with a newline.
    """
    target_dir = path.parent
    target_dir.mkdir(parents=True, exist_ok=True)
    rendered = json.dumps(manifest, indent=2, sort_keys=True)
    path.write_text(f"{rendered}\n", encoding="utf-8")
|
||||
|
||||
|
||||
def config_value(config: dict[str, Any], args: argparse.Namespace, key: str, default: Any) -> Any:
    """Resolve one setting: CLI value first, then config file, then default.

    A CLI attribute equal to ``None`` counts as "not given".
    """
    override = getattr(args, key)
    if override is None:
        return config.get(key, default)
    return override
|
||||
|
||||
|
||||
def build_runtime_config(args: argparse.Namespace) -> dict[str, Any]:
    """Merge CLI arguments, the config file and defaults into runtime settings.

    Each setting is resolved by ``config_value`` and converted to its working
    type. The result also records the config path, its digest and the parsed
    file contents.

    Raises:
        ValueError: If ``market_limit`` < 1, ``interval_seconds`` <= 0,
            ``duration_seconds`` <= 0, ``request_timeout_seconds`` <= 0,
            ``max_retries`` < 0 or ``backoff_seconds`` < 0.
    """
    config_path = args.config
    file_config = load_flat_yaml(config_path) if config_path else {}

    def pick(key: str, default: Any) -> Any:
        return config_value(file_config, args, key, default)

    runtime = {
        "discovery_path": Path(pick("discovery_path", DEFAULT_DISCOVERY_PATH)),
        "output_dir": Path(pick("output_dir", DEFAULT_OUTPUT_DIR)),
        "manifest_path": Path(pick("manifest_path", DEFAULT_MANIFEST_PATH)),
        "market_limit": int(pick("market_limit", 2)),
        "interval_seconds": float(pick("interval_seconds", 30.0)),
        "duration_seconds": float(pick("duration_seconds", 300.0)),
        "request_timeout_seconds": float(pick("request_timeout_seconds", 15.0)),
        "max_retries": int(pick("max_retries", 2)),
        "backoff_seconds": float(pick("backoff_seconds", 2.0)),
        "market_end_safety_seconds": int(pick("market_end_safety_seconds", 420)),
        "clob_books_url": str(pick("clob_books_url", CLOB_BOOKS_URL)),
        "config_path": config_path,
        "config_sha256": config_digest(config_path),
        "config_snapshot": file_config,
    }
    if runtime["market_limit"] < 1:
        raise ValueError("market_limit must be >= 1")
    if runtime["interval_seconds"] <= 0:
        raise ValueError("interval_seconds must be > 0")
    if runtime["duration_seconds"] <= 0:
        raise ValueError("duration_seconds must be > 0")
    # Reject values that would only fail later, in the middle of a run: a
    # negative retry count means no request is ever sent, a non-positive
    # timeout cannot complete a request, and a negative backoff is not a
    # valid sleep duration.
    if runtime["request_timeout_seconds"] <= 0:
        raise ValueError("request_timeout_seconds must be > 0")
    if runtime["max_retries"] < 0:
        raise ValueError("max_retries must be >= 0")
    if runtime["backoff_seconds"] < 0:
        raise ValueError("backoff_seconds must be >= 0")
    return runtime
|
||||
|
||||
|
||||
def run_collection(runtime: dict[str, Any], command: str) -> tuple[dict[str, Any], Path]:
    """Poll the CLOB books endpoint for a bounded run and build the run manifest.

    Markets are selected from the discovery file, their tokens are polled in a
    single batched POST every ``interval_seconds`` until ``duration_seconds``
    elapses or a stop is requested, and every returned book is appended as one
    JSON line to a gzip file under
    ``<output_dir>/polymarket/orderbooks/<run_id>/``.

    Args:
        runtime: Effective settings as produced by ``build_runtime_config``.
        command: Command line recorded verbatim in the manifest.

    Returns:
        ``(manifest, output_file)`` where ``manifest`` is the JSON-serialisable
        run description and ``output_file`` is the path of the gzip JSONL file.
    """
    # NOTE(review): handle_stop is defined elsewhere; presumably it sets
    # STOP_REQUESTED / STOP_SIGNAL, which the loops below poll.
    signal.signal(signal.SIGINT, handle_stop)
    signal.signal(signal.SIGTERM, handle_stop)

    started = utc_now()
    started_at_utc = iso_z(started)
    discovery_path: Path = runtime["discovery_path"]
    discovery = load_discovery(discovery_path)
    selected_markets, rejection_counts = select_markets(
        discovery,
        market_limit=runtime["market_limit"],
        market_end_safety_seconds=runtime["market_end_safety_seconds"],
    )
    warnings: list[str] = []
    failures: list[dict[str, Any]] = []
    if not selected_markets:
        warnings.append("No usable active BTC markets found in discovery input.")

    tokens = flatten_tokens(selected_markets)
    run_id = compact_timestamp(started)
    output_dir = runtime["output_dir"] / "polymarket" / "orderbooks" / run_id
    output_dir.mkdir(parents=True, exist_ok=True)
    output_file = output_dir / f"polymarket_orderbooks_{run_id}.jsonl.gz"

    request_count = 0
    success_count = 0
    failure_count = 0
    status_code_counts: dict[str, int] = {}
    rows_written = 0
    sequence = 0
    token_row_counts = {token["token_id"]: 0 for token in tokens}

    deadline = time.monotonic() + runtime["duration_seconds"]
    token_by_id = {token["token_id"]: token for token in tokens}
    request_body = [{"token_id": token["token_id"]} for token in tokens]

    with gzip.open(output_file, "wt", encoding="utf-8") as handle:
        while tokens and not STOP_REQUESTED and time.monotonic() < deadline:
            loop_started = time.monotonic()
            collected_at_utc = iso_z()
            request_count += 1
            request_record = http_post_json(
                url=runtime["clob_books_url"],
                json_body=request_body,
                timeout_seconds=runtime["request_timeout_seconds"],
                max_retries=runtime["max_retries"],
                backoff_seconds=runtime["backoff_seconds"],
            )
            status_code = request_record["response"]["status_code"]
            status_key = str(status_code)
            status_code_counts[status_key] = status_code_counts.get(status_key, 0) + 1
            if request_record["ok"] and isinstance(request_record["response"]["json"], list):
                success_count += 1
                for response_index, raw_book in enumerate(request_record["response"]["json"]):
                    if not isinstance(raw_book, dict):
                        failure_count += 1
                        failures.append(
                            {
                                "collected_at_utc": collected_at_utc,
                                "reason": "book_response_item_not_object",
                                "response_index": response_index,
                            }
                        )
                        continue
                    asset_id = str(raw_book.get("asset_id") or "")
                    token_meta = token_by_id.get(asset_id)
                    if token_meta is None:
                        failure_count += 1
                        failures.append(
                            {
                                "collected_at_utc": collected_at_utc,
                                "reason": "unknown_asset_id_in_book_response",
                                "asset_id": asset_id,
                            }
                        )
                        continue
                    sequence += 1
                    envelope = build_snapshot_envelope(
                        raw_book=raw_book,
                        token_meta=token_meta,
                        collected_at_utc=collected_at_utc,
                        sequence=sequence,
                        request_record=request_record,
                        response_index=response_index,
                    )
                    handle.write(json.dumps(envelope, separators=(",", ":"), sort_keys=True) + "\n")
                    rows_written += 1
                    token_row_counts[asset_id] = token_row_counts.get(asset_id, 0) + 1
                # Push the batch to disk once per poll.
                handle.flush()
            else:
                failure_count += 1
                failures.append(
                    {
                        "collected_at_utc": collected_at_utc,
                        "reason": "request_failed_or_non_json_list",
                        "status_code": status_code,
                        "attempts": request_record["attempts"],
                        "json_error": request_record["response"]["json_error"],
                        "text_preview": request_record["response"]["text_preview"],
                    }
                )

            # Wait out the rest of the interval in slices of at most one second
            # so a stop request or the deadline is noticed promptly.
            remaining_interval = runtime["interval_seconds"] - (time.monotonic() - loop_started)
            while remaining_interval > 0 and not STOP_REQUESTED and time.monotonic() < deadline:
                sleep_for = min(remaining_interval, deadline - time.monotonic(), 1.0)
                if sleep_for <= 0:
                    break
                time.sleep(sleep_for)
                remaining_interval = runtime["interval_seconds"] - (time.monotonic() - loop_started)

    ended = utc_now()
    ended_at_utc = iso_z(ended)
    duration_seconds_actual = round((ended - started).total_seconds(), 3)
    if STOP_REQUESTED:
        warnings.append(f"Graceful shutdown requested by {STOP_SIGNAL}.")
    if runtime["duration_seconds"] < 300:
        warnings.append("Configured run duration was shorter than the roadmap 5-minute sample target.")
    output_summary = summarize_output_file(output_file, rows_written)

    # PASS needs at least one row for every tracked token. With nothing to
    # track the run is BLOCKED; tokens but zero requests is a FAIL.
    gate_status = "PASS" if rows_written > 0 and all(count > 0 for count in token_row_counts.values()) else "FAIL"
    if not tokens:
        gate_status = "BLOCKED"
    if request_count == 0:
        gate_status = "FAIL" if tokens else "BLOCKED"

    manifest = {
        "schema_name": "orderbook_collector_sample_manifest",
        "schema_version": 1,
        "checkpoint_id": 4,
        "checkpoint_name": "Minimal Orderbook Snapshot Collector",
        "gate_status": gate_status,
        "collector": {
            "name": COLLECTOR_NAME,
            "version": COLLECTOR_VERSION,
        },
        "started_at_utc": started_at_utc,
        "ended_at_utc": ended_at_utc,
        "run_duration_seconds": duration_seconds_actual,
        "configured_duration_seconds": runtime["duration_seconds"],
        "interval_seconds": runtime["interval_seconds"],
        "command": command,
        "config": {
            "path": runtime["config_path"].as_posix() if runtime["config_path"] else None,
            "sha256": runtime["config_sha256"],
            "snapshot": runtime["config_snapshot"],
            "effective": {
                "discovery_path": discovery_path.as_posix(),
                "output_dir": runtime["output_dir"].as_posix(),
                "manifest_path": runtime["manifest_path"].as_posix(),
                "market_limit": runtime["market_limit"],
                "interval_seconds": runtime["interval_seconds"],
                "duration_seconds": runtime["duration_seconds"],
                "request_timeout_seconds": runtime["request_timeout_seconds"],
                "max_retries": runtime["max_retries"],
                "backoff_seconds": runtime["backoff_seconds"],
                "market_end_safety_seconds": runtime["market_end_safety_seconds"],
                "clob_books_url": runtime["clob_books_url"],
            },
        },
        "discovery": {
            "path": discovery_path.as_posix(),
            "fetched_at_utc": discovery.get("fetched_at_utc"),
            "source_summary": discovery.get("summary"),
            "rejection_counts_before_selection": rejection_counts,
        },
        "markets_tracked": [
            {
                "market_name": market.get("market_name"),
                "market_slug": market.get("market_slug"),
                "condition_id": market.get("condition_id"),
                "end_time_utc": market.get("end_time_utc"),
            }
            for market in selected_markets
        ],
        "tokens_tracked": tokens,
        "request_count": request_count,
        "success_count": success_count,
        "failure_count": failure_count,
        "status_code_counts": dict(sorted(status_code_counts.items())),
        "rows_written": rows_written,
        "token_row_counts": token_row_counts,
        "output_files": [output_summary],
        "failures": failures,
        "warnings": warnings,
        "known_gaps": [
            "This is a short run-rotated sample, not a daemon.",
            "Hourly rotation is documented but not implemented in this checkpoint.",
            "No websocket capture, normalization, upload, systemd unit, dashboard, database, or trading behavior is included.",
            "A 5-minute sample proves file-writing behavior only; it does not prove 24/7 reliability.",
        ],
        "fake_progress_risk": "A small successful sample can still hide long-run gaps, stale discovery, endpoint schema drift, and missed intervals. Reliability remains gated on the future 24h soak test.",
        "next_step": "Checkpoint 5 should normalize this raw sample while preserving raw file references, or rerun a fresh short sample if the orchestrator wants more raw evidence first.",
    }
    return manifest, output_file
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
    """Parse the collector command line.

    Every override flag defaults to ``None`` so that an omitted flag can be
    told apart from an explicit value; only ``--config`` has a real default.
    """
    parser = argparse.ArgumentParser(
        description="Collect a bounded raw gzip JSONL sample of Polymarket BTC order books."
    )
    parser.add_argument("--config", type=Path, default=DEFAULT_CONFIG_PATH)

    # (flag, value type) for the optional overrides, in help-output order.
    override_flags = (
        ("--discovery-path", Path),
        ("--output-dir", Path),
        ("--manifest-path", Path),
        ("--market-limit", int),
        ("--interval-seconds", float),
        ("--duration-seconds", float),
        ("--request-timeout-seconds", float),
        ("--max-retries", int),
        ("--backoff-seconds", float),
        ("--market-end-safety-seconds", int),
        ("--clob-books-url", str),
    )
    for flag, value_type in override_flags:
        parser.add_argument(flag, type=value_type, default=None)
    return parser.parse_args()
|
||||
|
||||
|
||||
def main() -> int:
    """Run one bounded collection, persist its manifest and print a summary.

    Returns:
        ``0`` when the manifest's gate status is ``"PASS"``, otherwise ``1``.
    """
    args = parse_args()
    # Recorded in the manifest so the run can be reproduced.
    command = " ".join((Path(sys.argv[0]).as_posix(), *sys.argv[1:]))
    runtime = build_runtime_config(args)
    manifest, output_file = run_collection(runtime, command)
    write_manifest(runtime["manifest_path"], manifest)

    copied_fields = (
        "gate_status",
        "markets_tracked",
        "request_count",
        "success_count",
        "failure_count",
        "rows_written",
        "warnings",
    )
    summary = {field: manifest[field] for field in copied_fields}
    summary["manifest_path"] = runtime["manifest_path"].as_posix()
    summary["output_file"] = output_file.as_posix()
    # Only the number of tokens is printed, not the full token records.
    summary["tokens_tracked"] = len(manifest["tokens_tracked"])
    print(json.dumps(summary, indent=2, sort_keys=True))

    if manifest["gate_status"] == "PASS":
        return 0
    return 1


if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    raise SystemExit(main())
|
||||
146
scripts/deploy/bootstrap_orderbooks_k8s.sh
Executable file
146
scripts/deploy/bootstrap_orderbooks_k8s.sh
Executable file
|
|
@ -0,0 +1,146 @@
|
|||
#!/usr/bin/env bash
# Settings for the orderbooks Kubernetes bootstrap. Every value below except
# ROOT_DIR can be pre-set in the environment; the literal is used only when the
# variable is unset or empty.
set -euo pipefail

# Repository root: two directories above the directory holding this script.
ROOT_DIR="$(cd "$(dirname "$0")/../.." && pwd)"

# Platform repository and the files read from it.
: "${PLATFORM_REPO_DIR:=/home/philipp/dev/ae/nuri/unrip3}"
: "${PLATFORM_ENV_FILE:=$PLATFORM_REPO_DIR/scripts/hetzner/bootstrap-secrets.env}"
: "${PLATFORM_RESOLVED_ENV_FILE:=$PLATFORM_REPO_DIR/.state/hetzner/bootstrap-secrets.resolved.env}"
: "${KUBECONFIG_PATH:=$PLATFORM_REPO_DIR/.state/hetzner/kubeconfig.yaml}"
: "${CI_KUBECONFIG_PATH:=$PLATFORM_REPO_DIR/.state/hetzner/kubeconfig.incluster.yaml}"

# Application identity inside the cluster.
: "${PROJECT_NAME:=orderbooks}"
: "${PROJECT_NAMESPACE:=orderbooks}"
: "${PROJECT_DEPLOYMENTS:=orderbooks-collector}"
: "${PROJECT_REGISTRY_SECRET_NAME:=orderbooks-registry-creds}"

# Secret expected to hold the rclone configuration.
: "${RCLONE_SECRET_NAME:=orderbooks-rclone-config}"
: "${RCLONE_SECRET_KEY:=rclone.conf}"

# Forgejo repository coordinates.
: "${FORGEJO_REPO_OWNER:=philipp}"
: "${FORGEJO_REPO_NAME:=orderbooks}"
: "${FORGEJO_REPO_PRIVATE:=0}"
|
||||
|
||||
# require NAME
# Exit the script with status 1 and a message on stderr unless NAME resolves
# to a command.
require() {
  if ! command -v "$1" >/dev/null 2>&1; then
    echo "missing required command: $1" >&2
    exit 1
  fi
}
|
||||
|
||||
# load_env_defaults FILE
# Export every KEY=VALUE pair from FILE that is not already present in the
# environment. Blank lines, '#' comments and lines without '=' are ignored, an
# optional leading 'export ' is stripped, and one pair of matching surrounding
# quotes is removed from the value. A missing FILE is not an error.
#
# Keys that are not valid shell identifiers are skipped with a warning instead
# of being handed to eval, so a malformed line cannot inject shell commands.
load_env_defaults() {
  local file="$1"
  [[ -f "$file" ]] || return 0

  # Declared separately from the assignment so that a python3 failure is not
  # masked by 'local' and aborts the script under 'set -e'.
  local exports
  exports="$(
    python3 - "$file" <<'PY_LOAD_ENV'
import os
import re
import shlex
import sys

VALID_KEY = re.compile(r'[A-Za-z_][A-Za-z0-9_]*')

with open(sys.argv[1], 'r', encoding='utf-8') as handle:
    for raw in handle:
        line = raw.strip()
        if not line or line.startswith('#'):
            continue
        if line.startswith('export '):
            line = line[len('export '):]
        if '=' not in line:
            continue
        key, value = line.split('=', 1)
        key = key.strip()
        value = value.strip()
        if VALID_KEY.fullmatch(key) is None:
            print(f'ignoring line with invalid variable name in {sys.argv[1]}', file=sys.stderr)
            continue
        if len(value) >= 2 and value[0] == value[-1] and value[0] in {'\"', "'"}:
            value = value[1:-1]
        if key in os.environ:
            continue
        print(f'export {key}={shlex.quote(value)}')
PY_LOAD_ENV
  )"
  eval "$exports"
}
|
||||
|
||||
require kubectl
require python3
require base64

# Fill in anything the caller did not export from the platform env files.
load_env_defaults "$PLATFORM_ENV_FILE"
load_env_defaults "$PLATFORM_RESOLVED_ENV_FILE"

# Force orderbooks app identity after loading platform defaults. The platform
# env file may describe the platform repo itself, not this app repo.
PROJECT_NAME="${ORDERBOOKS_PROJECT_NAME:-orderbooks}"
PROJECT_NAMESPACE="${ORDERBOOKS_PROJECT_NAMESPACE:-orderbooks}"
PROJECT_DEPLOYMENTS="${ORDERBOOKS_PROJECT_DEPLOYMENTS:-orderbooks-collector}"
PROJECT_REGISTRY_SECRET_NAME="${ORDERBOOKS_PROJECT_REGISTRY_SECRET_NAME:-orderbooks-registry-creds}"
RCLONE_SECRET_NAME="${ORDERBOOKS_RCLONE_SECRET_NAME:-orderbooks-rclone-config}"
RCLONE_SECRET_KEY="${ORDERBOOKS_RCLONE_SECRET_KEY:-rclone.conf}"
FORGEJO_REPO_OWNER="${ORDERBOOKS_FORGEJO_REPO_OWNER:-philipp}"
FORGEJO_REPO_NAME="${ORDERBOOKS_FORGEJO_REPO_NAME:-orderbooks}"
FORGEJO_REPO_PRIVATE="${ORDERBOOKS_FORGEJO_REPO_PRIVATE:-0}"

# Both kubeconfigs must exist: one drives kubectl here, the other is uploaded
# as the CI secret.
: "${KUBECONFIG_PATH:?missing kubeconfig path}"
: "${CI_KUBECONFIG_PATH:?missing CI kubeconfig path}"
[[ -f "$KUBECONFIG_PATH" ]] || { echo "missing kubeconfig file" >&2; exit 1; }
[[ -f "$CI_KUBECONFIG_PATH" ]] || { echo "missing in-cluster kubeconfig file" >&2; exit 1; }
export KUBECONFIG="$KUBECONFIG_PATH"

# Derive the Forgejo URL from whichever platform variable is available.
if [[ -z "${FORGEJO_URL:-}" ]]; then
  if [[ -n "${FORGEJO_ROOT_URL:-}" ]]; then
    FORGEJO_URL="$FORGEJO_ROOT_URL"
  elif [[ -n "${FORGEJO_DOMAIN:-}" ]]; then
    FORGEJO_URL="https://${FORGEJO_DOMAIN}"
  else
    echo "missing Forgejo URL" >&2
    exit 1
  fi
fi

# A token is sufficient on its own; otherwise a password is required.
: "${FORGEJO_ADMIN_USERNAME:?missing Forgejo admin username}"
if [[ -z "${FORGEJO_TOKEN:-}" ]]; then
  : "${FORGEJO_ADMIN_PASSWORD:?missing Forgejo password or token}"
fi

if [[ -z "${REGISTRY_HOST:-}" ]]; then
  if [[ -n "${REGISTRY_DOMAIN:-}" ]]; then
    REGISTRY_HOST="$REGISTRY_DOMAIN"
  else
    echo "missing registry host" >&2
    exit 1
  fi
fi
: "${REGISTRY_USERNAME:?missing registry username}"
: "${REGISTRY_PASSWORD:?missing registry password}"

echo "ensuring namespace ${PROJECT_NAMESPACE}"
kubectl create namespace "$PROJECT_NAMESPACE" --dry-run=client -o yaml | kubectl apply -f -

echo "upserting registry secret ${PROJECT_REGISTRY_SECRET_NAME}"
kubectl -n "$PROJECT_NAMESPACE" create secret docker-registry "$PROJECT_REGISTRY_SECRET_NAME" \
  --docker-server="$REGISTRY_HOST" \
  --docker-username="$REGISTRY_USERNAME" \
  --docker-password="$REGISTRY_PASSWORD" \
  --dry-run=client -o yaml | kubectl apply -f -

echo "checking rclone secret key presence"
# A missing secret already aborts here via 'set -e'; a secret that lacks the
# expected key is treated the same way instead of only being reported.
rclone_key_status="$(
  kubectl -n "$PROJECT_NAMESPACE" get secret "$RCLONE_SECRET_NAME" \
    -o "go-template={{if index .data \"${RCLONE_SECRET_KEY}\"}}rclone_secret_key_present{{else}}rclone_secret_key_missing{{end}}"
)"
echo "$rclone_key_status"
if [[ "$rclone_key_status" != "rclone_secret_key_present" ]]; then
  echo "secret ${RCLONE_SECRET_NAME} in namespace ${PROJECT_NAMESPACE} has no key ${RCLONE_SECRET_KEY}" >&2
  exit 1
fi

echo "upserting Forgejo repo and Actions settings"
forgejo_args=()
if [[ -n "${FORGEJO_TOKEN:-}" ]]; then
  forgejo_args+=(--token "$FORGEJO_TOKEN")
else
  forgejo_args+=(--admin-username "$FORGEJO_ADMIN_USERNAME" --admin-password "$FORGEJO_ADMIN_PASSWORD")
fi
if [[ "$FORGEJO_REPO_PRIVATE" == "1" || "$FORGEJO_REPO_PRIVATE" == "true" ]]; then
  forgejo_args+=(--repo-private)
fi

python3 "$ROOT_DIR/scripts/deploy/forgejo_repo_bootstrap.py" \
  --forgejo-url "$FORGEJO_URL" \
  --repo-owner "$FORGEJO_REPO_OWNER" \
  --repo-name "$FORGEJO_REPO_NAME" \
  --ci-kubeconfig "$CI_KUBECONFIG_PATH" \
  --registry-host "$REGISTRY_HOST" \
  --project-name "$PROJECT_NAME" \
  --project-namespace "$PROJECT_NAMESPACE" \
  --project-deployments "$PROJECT_DEPLOYMENTS" \
  --project-registry-secret-name "$PROJECT_REGISTRY_SECRET_NAME" \
  "${forgejo_args[@]}"

echo "bootstrap complete for ${FORGEJO_REPO_OWNER}/${FORGEJO_REPO_NAME} in namespace ${PROJECT_NAMESPACE}"
|
||||
121
scripts/deploy/forgejo_repo_bootstrap.py
Executable file
121
scripts/deploy/forgejo_repo_bootstrap.py
Executable file
|
|
@ -0,0 +1,121 @@
|
|||
#!/usr/bin/env python3
|
||||
"""Orderbooks-specific Forgejo repo bootstrap.
|
||||
|
||||
Creates/updates the Forgejo repository plus Actions settings for the Kubernetes
|
||||
orderbooks deployment. This script deliberately does not print secret values.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import base64
|
||||
import json
|
||||
import ssl
|
||||
import urllib.error
|
||||
import urllib.parse
|
||||
import urllib.request
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
class ForgejoRequestError(RuntimeError):
    """RuntimeError for an unexpected HTTP status that keeps the status code."""

    def __init__(self, message: str, status: int):
        super().__init__(message)
        self.status = status


class ForgejoClient:
    """Small JSON client for the Forgejo REST API endpoints used by the bootstrap.

    Authenticates with an API token when one is given, otherwise with HTTP
    basic auth built from ``username`` and ``password``.
    """

    def __init__(
        self,
        base_url: str,
        username: str | None = None,
        password: str | None = None,
        token: str | None = None,
        timeout: float = 30.0,
    ):
        """Store connection settings and build the authorization header.

        Args:
            base_url: Forgejo root URL; a trailing slash is removed.
            username: Login name; may be omitted with token auth.
            password: Password for basic auth.
            token: API token; takes precedence over username/password.
            timeout: Per-request socket timeout in seconds.

        Raises:
            ValueError: If neither a token nor both username and password are given.
        """
        self.base_url = base_url.rstrip('/')
        self.username = username or ''
        self.timeout = timeout
        self.headers = {'Accept': 'application/json', 'Content-Type': 'application/json'}
        if token:
            self.headers['Authorization'] = f'token {token}'
        elif username is not None and password is not None:
            credentials = base64.b64encode(f'{username}:{password}'.encode()).decode()
            self.headers['Authorization'] = f'Basic {credentials}'
        else:
            raise ValueError('ForgejoClient requires either token auth or username/password auth')
        self.ssl_context = ssl.create_default_context()

    @staticmethod
    def _segment(value: str) -> str:
        """Percent-encode one URL path segment, including any '/'."""
        return urllib.parse.quote(value, safe='')

    def _repo_path(self, owner: str, repo: str) -> str:
        """Return the API path prefix of a repository."""
        return f'/api/v1/repos/{self._segment(owner)}/{self._segment(repo)}'

    def request(self, method: str, path: str, payload=None, expected=(200, 201, 204)):
        """Send one API request and return the decoded JSON body.

        Args:
            method: HTTP method.
            path: Path appended to the base URL.
            payload: Optional object sent as the JSON request body.
            expected: Status codes treated as success.

        Returns:
            The parsed JSON response, or ``None`` when the body is empty.

        Raises:
            ForgejoRequestError: (a ``RuntimeError``) if the status is not in
                ``expected``; the message contains at most 500 body characters.
        """
        data = json.dumps(payload).encode() if payload is not None else None
        req = urllib.request.Request(f'{self.base_url}{path}', data=data, method=method)
        for key, value in self.headers.items():
            req.add_header(key, value)
        try:
            # The timeout keeps an unreachable server from hanging the bootstrap.
            with urllib.request.urlopen(req, context=self.ssl_context, timeout=self.timeout) as response:
                body = response.read().decode() if response.length != 0 else ''
                if response.status not in expected:
                    raise ForgejoRequestError(
                        f'{method} {path} returned {response.status}: {body[:500]}', response.status
                    )
                return json.loads(body) if body else None
        except urllib.error.HTTPError as exc:
            body = exc.read().decode()
            if exc.code not in expected:
                raise ForgejoRequestError(f'{method} {path} returned {exc.code}: {body[:500]}', exc.code) from exc
            return json.loads(body) if body else None

    def _login(self) -> str:
        """Return the acting user's login, asking the server when it is unknown.

        With token auth no username may have been supplied; the login is then
        fetched once from ``/api/v1/user`` and cached on the instance.
        """
        if not self.username:
            user = self.request('GET', '/api/v1/user', expected=(200,)) or {}
            self.username = user.get('login') or ''
        return self.username

    def get_repo(self, owner: str, repo: str):
        """Return the repository description, or ``None`` if the API answers 404."""
        try:
            return self.request('GET', self._repo_path(owner, repo))
        except ForgejoRequestError as exc:
            if exc.status == 404:
                return None
            raise

    def create_repo(self, owner: str, name: str, private: bool):
        """Create an empty repository with default branch ``main``.

        The user endpoint is used when ``owner`` is the authenticated user,
        otherwise ``owner`` is treated as an organization.
        """
        payload = {'name': name, 'private': private, 'auto_init': False, 'default_branch': 'main'}
        if owner == self._login():
            return self.request('POST', '/api/v1/user/repos', payload, expected=(201,))
        return self.request('POST', f'/api/v1/orgs/{self._segment(owner)}/repos', payload, expected=(201,))

    def upsert_variable(self, owner: str, repo: str, name: str, value: str):
        """Create an Actions variable, or update it when creation is rejected with 409/422."""
        path = f'{self._repo_path(owner, repo)}/actions/variables/{self._segment(name)}'
        try:
            self.request('POST', path, {'value': value}, expected=(201, 204))
        except ForgejoRequestError as exc:
            if exc.status not in (409, 422):
                raise
            self.request('PUT', path, {'value': value}, expected=(201, 204))

    def upsert_secret(self, owner: str, repo: str, name: str, value: str):
        """Create or replace an Actions secret with a single PUT."""
        path = f'{self._repo_path(owner, repo)}/actions/secrets/{self._segment(name)}'
        self.request('PUT', path, {'data': value}, expected=(201, 204))
|
||||
|
||||
|
||||
def main() -> None:
    """Ensure the Forgejo repo exists and push its Actions secret and variables.

    Steps: look the repository up and create it when absent, upload the CI
    kubeconfig base64-encoded as the ``KUBECONFIG_B64`` secret, then upsert the
    deployment variables. Only names are printed, never secret values.
    """
    parser = argparse.ArgumentParser(description='Bootstrap Forgejo Actions settings for orderbooks')
    parser.add_argument('--forgejo-url', required=True)
    for credential_flag in ('--admin-username', '--admin-password', '--token'):
        parser.add_argument(credential_flag)
    for repo_flag in ('--repo-owner', '--repo-name'):
        parser.add_argument(repo_flag, required=True)
    parser.add_argument('--repo-private', action='store_true')
    for deployment_flag in (
        '--ci-kubeconfig',
        '--registry-host',
        '--project-name',
        '--project-namespace',
        '--project-deployments',
        '--project-registry-secret-name',
    ):
        parser.add_argument(deployment_flag, required=True)
    args = parser.parse_args()

    owner, repo_name = args.repo_owner, args.repo_name
    client = ForgejoClient(args.forgejo_url, args.admin_username, args.admin_password, args.token)

    existing = client.get_repo(owner, repo_name)
    if existing is not None:
        print(f'repo already exists: {existing["full_name"]}')
    else:
        created = client.create_repo(owner, repo_name, args.repo_private)
        print(f'created repo {created["full_name"]}')

    # The kubeconfig is stored base64-encoded in the secret.
    encoded_kubeconfig = base64.b64encode(Path(args.ci_kubeconfig).read_bytes()).decode()
    client.upsert_secret(owner, repo_name, 'KUBECONFIG_B64', encoded_kubeconfig)
    print('upserted repo action secret KUBECONFIG_B64')

    action_variables = (
        ('REGISTRY_HOST', args.registry_host),
        ('PROJECT_NAME', args.project_name),
        ('PROJECT_NAMESPACE', args.project_namespace),
        ('PROJECT_DEPLOYMENTS', args.project_deployments),
        ('PROJECT_REGISTRY_SECRET_NAME', args.project_registry_secret_name),
    )
    for variable_name, variable_value in action_variables:
        client.upsert_variable(owner, repo_name, variable_name, variable_value)
    print('upserted repo action variables')


if __name__ == '__main__':
    main()
|
||||
752
scripts/discover_polymarket_btc_markets.py
Executable file
752
scripts/discover_polymarket_btc_markets.py
Executable file
|
|
@ -0,0 +1,752 @@
|
|||
#!/usr/bin/env python3
|
||||
"""Discover active Polymarket BTC up/down markets.
|
||||
|
||||
Checkpoint 3 scope: fetch bounded public Gamma metadata, preserve raw responses,
|
||||
and write normalized market records with outcome-token mappings. This is not an
|
||||
order-book collector.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import datetime as dt
|
||||
import hashlib
|
||||
import json
|
||||
import sys
|
||||
import time
|
||||
import urllib.error
|
||||
import urllib.parse
|
||||
import urllib.request
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
|
||||
# Public Gamma endpoint and the tag id used to query it.
GAMMA_EVENTS_URL = "https://gamma-api.polymarket.com/events"
BTC_TAG_ID = 235

# Default output locations, all under one discovery directory.
_DISCOVERY_DIR = Path("data/discovery")
DEFAULT_OUTPUT_JSON = _DISCOVERY_DIR / "polymarket_btc_markets_latest.json"
DEFAULT_MANIFEST = _DISCOVERY_DIR / "polymarket_btc_markets_manifest.json"
DEFAULT_MARKDOWN = _DISCOVERY_DIR / "polymarket_btc_markets.md"

# Lower-case names of the response headers that are kept in stored records.
SAFE_RESPONSE_HEADERS = {
    # caching
    "age", "cache-control", "cf-cache-status", "expires", "last-modified",
    # payload description
    "content-encoding", "content-length", "content-type",
    # rate limiting
    "ratelimit-limit", "ratelimit-remaining", "ratelimit-reset", "retry-after",
    "x-ratelimit-limit", "x-ratelimit-remaining", "x-ratelimit-reset",
    # transport / server
    "cf-ray", "date", "server", "strict-transport-security",
}

# Human-readable description of the selection rules.
FILTER_RULES = [
    f"Use public Gamma /events with tag_id={BTC_TAG_ID}, related_tags=true, active=true, closed=false.",
    "Require event.active=true and event.closed=false.",
    "Require market.active=true and market.closed=false.",
    "Require market.enableOrderBook=true.",
    "Require market.acceptingOrders=true unless --allow-non-accepting-orders is used.",
    "Require market end time to be after the fetch time unless --allow-expired is used.",
    "Require outcomes to resolve to exactly Up and Down.",
    "Require clobTokenIds to resolve to exactly two token IDs.",
    "Require BTC/up-down evidence from seriesSlug, title/slug text, or tags.",
]
|
||||
|
||||
|
||||
def utc_now() -> dt.datetime:
|
||||
return dt.datetime.now(dt.UTC)
|
||||
|
||||
|
||||
def iso_z(value: dt.datetime | None = None) -> str:
|
||||
value = value or utc_now()
|
||||
return value.astimezone(dt.UTC).replace(microsecond=0).isoformat().replace("+00:00", "Z")
|
||||
|
||||
|
||||
def parse_iso(value: Any) -> dt.datetime | None:
|
||||
if not isinstance(value, str) or not value.strip():
|
||||
return None
|
||||
text = value.strip()
|
||||
if text.endswith("Z"):
|
||||
text = text[:-1] + "+00:00"
|
||||
try:
|
||||
parsed = dt.datetime.fromisoformat(text)
|
||||
except ValueError:
|
||||
return None
|
||||
if parsed.tzinfo is None:
|
||||
parsed = parsed.replace(tzinfo=dt.UTC)
|
||||
return parsed.astimezone(dt.UTC)
|
||||
|
||||
|
||||
def sha256_file(path: Path) -> str:
    """Return the hex SHA-256 digest of a file, read in 1 MiB chunks."""
    chunk_size = 1024 * 1024
    hasher = hashlib.sha256()
    with path.open("rb") as stream:
        while True:
            block = stream.read(chunk_size)
            if block == b"":
                break
            hasher.update(block)
    return hasher.hexdigest()
|
||||
|
||||
|
||||
def filter_headers(headers: Any) -> dict[str, str]:
    """Keep only the headers whose lower-cased name is in SAFE_RESPONSE_HEADERS.

    ``headers`` may be anything ``dict()`` accepts; the original spelling of
    each retained header name is preserved.
    """
    return {
        name: content
        for name, content in dict(headers).items()
        if name.lower() in SAFE_RESPONSE_HEADERS
    }
|
||||
|
||||
|
||||
def normalize_params(params: dict[str, Any]) -> dict[str, Any]:
    """Return a copy of ``params`` with booleans spelled ``"true"`` / ``"false"``.

    Non-boolean values are passed through unchanged and key order is kept.
    """
    return {
        key: ("true" if value else "false") if isinstance(value, bool) else value
        for key, value in params.items()
    }
|
||||
|
||||
|
||||
def build_url(url: str, params: dict[str, Any]) -> str:
    """Append ``params`` to ``url`` as a query string.

    Parameters are normalised first and sequence values are expanded into
    repeated keys (``doseq=True``).
    """
    encoded = urllib.parse.urlencode(normalize_params(params), doseq=True)
    return url + "?" + encoded
|
||||
|
||||
|
||||
def fetch_json_page(
    *,
    name: str,
    url: str,
    params: dict[str, Any],
    timeout_seconds: float,
) -> dict[str, Any]:
    """GET one JSON page and return a record of the whole exchange.

    Failures are captured in the record instead of being raised: HTTP errors
    keep their status, filtered headers and body, and any other exception is
    stored as ``"<ExceptionType>: <message>"``.

    Args:
        name: Label stored in the record.
        url: Endpoint without a query string.
        params: Query parameters.
        timeout_seconds: Timeout passed to ``urlopen``.

    Returns:
        A dict with timing, ``request`` and ``response`` sections, an ``ok``
        flag (no error and a 2xx status) and the ``error`` text if any.
    """
    clock_start = time.monotonic()
    started_at_utc = iso_z()
    full_url = build_url(url, params)
    request = urllib.request.Request(
        full_url,
        headers={
            "Accept": "application/json",
            "User-Agent": "orderbooks-checkpoint-3-discovery/1.0",
        },
        method="GET",
    )

    status_code: int | None = None
    response_headers: dict[str, str] = {}
    response_text = ""
    error: str | None = None
    try:
        with urllib.request.urlopen(request, timeout=timeout_seconds) as response:
            status_code = response.status
            response_headers = filter_headers(response.headers)
            response_text = response.read().decode("utf-8", errors="replace")
    except urllib.error.HTTPError as http_error:
        status_code = http_error.code
        response_headers = filter_headers(http_error.headers)
        response_text = http_error.read().decode("utf-8", errors="replace")
        error = f"HTTPError: {http_error}"
    except Exception as failure:  # noqa: BLE001 - keep the failure as evidence
        error = f"{type(failure).__name__}: {failure}"

    # Decode the body when there is one; remember why decoding failed.
    response_json: Any | None = None
    json_error: str | None = None
    if response_text:
        try:
            response_json = json.loads(response_text)
        except json.JSONDecodeError as decode_error:
            json_error = str(decode_error)

    ended_at_utc = iso_z()
    duration_ms = round((time.monotonic() - clock_start) * 1000, 3)
    request_info = {
        "method": "GET",
        "url": url,
        "full_url": full_url,
        "params": normalize_params(params),
    }
    response_info = {
        "status_code": status_code,
        "headers": response_headers,
        "json": response_json,
        "json_error": json_error,
        # A raw preview is kept only when no JSON value was obtained.
        "text_preview": None if response_json is not None else response_text[:1000],
    }
    succeeded = error is None and status_code is not None and 200 <= status_code < 300
    return {
        "name": name,
        "started_at_utc": started_at_utc,
        "ended_at_utc": ended_at_utc,
        "duration_ms": duration_ms,
        "request": request_info,
        "response": response_info,
        "ok": succeeded,
        "error": error,
    }
|
||||
|
||||
|
||||
def coerce_json_array(value: Any) -> list[Any]:
    """Return ``value`` as a list, decoding it from JSON text when necessary.

    A list is returned as-is. A string is decoded and returned only if it
    holds a JSON array. Every other input yields an empty list.
    """
    if isinstance(value, list):
        return value
    if not isinstance(value, str):
        return []
    try:
        decoded = json.loads(value)
    except json.JSONDecodeError:
        return []
    if isinstance(decoded, list):
        return decoded
    return []
|
||||
|
||||
|
||||
def lower_text(value: Any) -> str:
    """Return ``value`` as lower-case text; any falsy value becomes ``""``."""
    if not value:
        return ""
    return str(value).lower()
|
||||
|
||||
|
||||
def event_tag_text(event: dict[str, Any]) -> str:
    """Join the slug and label of every tag of ``event`` into lower-case text.

    Tags that are not dicts are skipped; a missing or empty slug/label
    contributes an empty fragment.
    """
    tag_dicts = [tag for tag in (event.get("tags") or []) if isinstance(tag, dict)]
    fragments = [
        str(tag.get(field) or "")
        for tag in tag_dicts
        for field in ("slug", "label")
    ]
    return " ".join(fragments).lower()
|
||||
|
||||
|
||||
def has_btc_up_down_evidence(event: dict[str, Any], market: dict[str, Any]) -> bool:
    """Tell whether an event/market pair looks like a BTC up/down market.

    Any one of three signals is sufficient:
      * the event's ``seriesSlug`` starts with ``btc-up-or-down``;
      * the combined event and market text mentions bitcoin/btc together with
        both "up" and "down" (plain substring checks);
      * the event tags mention bitcoin/btc together with ``up-or-down``.
    """
    if lower_text(event.get("seriesSlug")).startswith("btc-up-or-down"):
        return True

    event_fields = [lower_text(event.get(field)) for field in ("title", "slug", "ticker", "description")]
    market_fields = [lower_text(market.get(field)) for field in ("question", "slug", "description")]
    haystack = " ".join(event_fields + market_fields)
    names_bitcoin = "bitcoin" in haystack or "btc" in haystack
    if names_bitcoin and "up" in haystack and "down" in haystack:
        return True

    tag_text = event_tag_text(event)
    tags_name_bitcoin = "bitcoin" in tag_text or "btc" in tag_text
    return tags_name_bitcoin and "up-or-down" in tag_text
|
||||
|
||||
|
||||
def is_up_down_outcomes(outcomes: list[str]) -> bool:
    """Return True when ``outcomes`` is exactly one "up" and one "down", ignoring case."""
    if len(outcomes) != 2:
        return False
    return sorted(label.lower() for label in outcomes) == ["down", "up"]
|
||||
|
||||
|
||||
def normalize_market(
    *,
    event: dict[str, Any],
    market: dict[str, Any],
    page_index: int,
    event_index: int,
    market_index: int,
    fetched_at_utc: str,
    output_json_path: Path,
) -> dict[str, Any]:
    """Flatten one Gamma event/market pair into a normalized discovery record.

    Args:
        event: Raw event object from a Gamma events page.
        market: Raw market object nested under ``event["markets"]``.
        page_index: Index of the page the event came from.
        event_index: Index of the event within that page's JSON list.
        market_index: Index of the market within the event's ``markets`` list.
        fetched_at_utc: Timestamp string stored verbatim on the record.
        output_json_path: Path of the JSON artifact; recorded in ``raw_ref`` so
            the record can be traced back to the preserved raw response.

    Returns:
        A dict with identifiers, outcome/token pairs, start/end times, the
        market and event status flags, the endpoint description and a
        ``raw_ref`` locator.
    """
    outcomes = [str(item) for item in coerce_json_array(market.get("outcomes"))]
    token_ids = [str(item) for item in coerce_json_array(market.get("clobTokenIds"))]
    # zip() stops at the shorter list, so mismatched lengths never raise.
    tokens = [
        {
            "outcome": outcome,
            "token_id": token_id,
            "outcome_index": index,
        }
        for index, (outcome, token_id) in enumerate(zip(outcomes, token_ids))
    ]
    start_time = (
        market.get("startDate")
        or market.get("startDateIso")
        or event.get("startDate")
        or event.get("creationDate")
    )
    end_time = market.get("endDate") or market.get("endDateIso") or event.get("endDate")
    # Parse each timestamp once; when parsing yields nothing usable the raw
    # value is kept so no information is dropped from the record.
    parsed_start = parse_iso(start_time)
    parsed_end = parse_iso(end_time)
    event_slug = event.get("slug")
    market_slug = market.get("slug") or event_slug
    return {
        "market_name": "polymarket",
        "market_slug": market_slug,
        "event_slug": event_slug,
        "title": event.get("title") or market.get("question"),
        "question": market.get("question") or event.get("title"),
        "condition_id": market.get("conditionId"),
        "tokens": tokens,
        "outcomes": outcomes,
        "start_time_utc": iso_z(parsed_start) if parsed_start else start_time,
        "end_time_utc": iso_z(parsed_end) if parsed_end else end_time,
        "active": market.get("active"),
        "closed": market.get("closed"),
        "event_active": event.get("active"),
        "event_closed": event.get("closed"),
        "accepting_orders": market.get("acceptingOrders"),
        "enable_order_book": market.get("enableOrderBook"),
        "endpoint_source": {
            "name": "gamma_events_bitcoin_tag",
            "method": "GET",
            "url": GAMMA_EVENTS_URL,
            "params_basis": {
                "tag_id": BTC_TAG_ID,
                "related_tags": "true",
                "active": "true",
                "closed": "false",
                "order": "endDate",
                "ascending": "true",
            },
        },
        "fetched_at_utc": fetched_at_utc,
        "raw_ref": {
            "artifact_path": output_json_path.as_posix(),
            "section": "raw.gamma_events_pages",
            "page_index": page_index,
            "event_index": event_index,
            "market_index": market_index,
            "json_path": f"raw.gamma_events_pages[{page_index}].response.json[{event_index}].markets[{market_index}]",
        },
    }
|
||||
|
||||
|
||||
def rejection_reasons(
    *,
    event: dict[str, Any],
    market: dict[str, Any],
    fetched_at: dt.datetime,
    require_accepting_orders: bool,
    require_future_end: bool,
) -> list[str]:
    """List every filter the event/market pair fails, in a fixed order.

    Args:
        event: Raw event object.
        market: Raw market object belonging to ``event``.
        fetched_at: Reference time; the end time must be strictly later than
            this when ``require_future_end`` is set.
        require_accepting_orders: Reject markets whose ``acceptingOrders`` is
            not exactly ``True``.
        require_future_end: Reject markets whose end time is missing,
            unparseable, or not after ``fetched_at``.

    Returns:
        Reason codes for each failed check; an empty list means the market
        passed every filter.
    """
    outcomes = [str(item) for item in coerce_json_array(market.get("outcomes"))]
    token_ids = [str(item) for item in coerce_json_array(market.get("clobTokenIds"))]
    end_time = parse_iso(market.get("endDate") or event.get("endDate"))

    # (reason code, failed?) pairs; flags must be exactly True/False to pass.
    verdicts = [
        ("event_not_active", event.get("active") is not True),
        ("event_closed", event.get("closed") is not False),
        ("market_not_active", market.get("active") is not True),
        ("market_closed", market.get("closed") is not False),
        ("order_book_not_enabled", market.get("enableOrderBook") is not True),
        ("not_accepting_orders", require_accepting_orders and market.get("acceptingOrders") is not True),
        ("not_future_end", require_future_end and (end_time is None or end_time <= fetched_at)),
        ("not_up_down_outcomes", not is_up_down_outcomes(outcomes)),
        ("missing_two_clob_token_ids", len(token_ids) != 2),
        ("missing_btc_up_down_evidence", not has_btc_up_down_evidence(event, market)),
    ]
    return [reason for reason, failed in verdicts if failed]
|
||||
|
||||
|
||||
def discover(args: argparse.Namespace) -> dict[str, Any]:
    """Page through the Gamma events endpoint and collect BTC up/down markets.

    Reads ``max_pages``, ``limit``, ``timeout``, ``min_markets``,
    ``allow_expired``, ``allow_non_accepting_orders`` and ``output_json`` from
    ``args``. Every fetched page envelope is preserved under
    ``raw.gamma_events_pages``; each market either becomes a normalized record
    or increments one or more counters in ``summary.rejected_counts``.

    Paging stops on the first failed request, the first non-list payload, a
    short page (fewer than ``limit`` events) or after ``max_pages`` pages.

    Returns:
        The full discovery artifact. ``summary.status`` is ``"PASS"`` when at
        least ``min_markets`` markets were accepted, otherwise ``"FAIL"``.
    """
    started_at_utc = iso_z()
    fetched_at = utc_now()
    fetched_at_utc = iso_z(fetched_at)
    raw_pages: list[dict[str, Any]] = []
    normalized: list[dict[str, Any]] = []
    rejected_counts: dict[str, int] = {}
    warnings: list[str] = []
    seen_conditions: set[str] = set()

    def reject(reason: str) -> None:
        rejected_counts[reason] = rejected_counts.get(reason, 0) + 1

    for page_index in range(args.max_pages):
        params = {
            "tag_id": BTC_TAG_ID,
            "related_tags": True,
            "active": True,
            "closed": False,
            "limit": args.limit,
            "offset": page_index * args.limit,
            "order": "endDate",
            "ascending": True,
        }
        page = fetch_json_page(
            name=f"gamma_events_bitcoin_tag_page_{page_index}",
            url=GAMMA_EVENTS_URL,
            params=params,
            timeout_seconds=args.timeout,
        )
        # Keep the envelope even for failed pages so the failure is auditable.
        raw_pages.append(page)
        payload = page["response"]["json"]
        if not page["ok"]:
            warnings.append(
                f"Page {page_index} request failed with status {page['response']['status_code']}: {page['error']}"
            )
            break
        if not isinstance(payload, list):
            warnings.append(f"Page {page_index} response was not a JSON list.")
            break

        for event_index, event in enumerate(payload):
            if not isinstance(event, dict):
                reject("event_not_object")
                continue
            markets = event.get("markets") or []
            if not isinstance(markets, list) or not markets:
                reject("missing_markets")
                continue
            for market_index, market in enumerate(markets):
                if not isinstance(market, dict):
                    reject("market_not_object")
                    continue
                reasons = rejection_reasons(
                    event=event,
                    market=market,
                    fetched_at=fetched_at,
                    require_accepting_orders=not args.allow_non_accepting_orders,
                    require_future_end=not args.allow_expired,
                )
                if reasons:
                    for reason in reasons:
                        reject(reason)
                    continue
                condition_id = str(market.get("conditionId") or "")
                if not condition_id:
                    # Without a condition ID the market cannot be keyed or
                    # de-duplicated, and the PASS message promises one.
                    reject("missing_condition_id")
                    continue
                if condition_id in seen_conditions:
                    reject("duplicate_condition_id")
                    continue
                seen_conditions.add(condition_id)
                normalized.append(
                    normalize_market(
                        event=event,
                        market=market,
                        page_index=page_index,
                        event_index=event_index,
                        market_index=market_index,
                        fetched_at_utc=fetched_at_utc,
                        output_json_path=args.output_json,
                    )
                )

        # A short page means the listing is exhausted.
        if len(payload) < args.limit:
            break

    normalized.sort(key=lambda item: (item.get("end_time_utc") or "", item.get("market_slug") or ""))
    if raw_pages:
        last_payload = raw_pages[-1]["response"].get("json")
        # A full final page at the page cap means more results may exist.
        if isinstance(last_payload, list) and len(last_payload) == args.limit and len(raw_pages) >= args.max_pages:
            warnings.append(
                "Discovery stopped at max_pages before exhausting Gamma pagination; output is bounded to the fetched pages."
            )
    if len(normalized) < args.min_markets:
        warnings.append(
            f"Only {len(normalized)} markets passed filters; min_markets={args.min_markets}."
        )

    status = "PASS" if len(normalized) >= args.min_markets else "FAIL"
    status_reason = (
        f"Discovered {len(normalized)} active BTC up/down markets with condition IDs and two token IDs."
        if status == "PASS"
        else "Did not discover enough active BTC up/down markets with condition IDs and two token IDs."
    )
    return {
        "schema_name": "polymarket_btc_market_discovery",
        "schema_version": 1,
        "artifact_status": "valid" if status == "PASS" else "partial",
        "checkpoint_id": 3,
        "checkpoint_name": "Minimal BTC Market Discovery",
        "started_at_utc": started_at_utc,
        "ended_at_utc": iso_z(),
        "fetched_at_utc": fetched_at_utc,
        "scope": "Bounded public Gamma metadata discovery only; no order-book collector.",
        "endpoint_basis": {
            "source_checkpoint": "Checkpoint 2",
            "source_report": "reports/checkpoints/checkpoint_002_polymarket_public_sources.md",
            "endpoint": GAMMA_EVENTS_URL,
            "method": "GET",
            "base_params": {
                "tag_id": BTC_TAG_ID,
                "related_tags": True,
                "active": True,
                "closed": False,
                "limit": args.limit,
                "order": "endDate",
                "ascending": True,
            },
        },
        "filter_rules": FILTER_RULES,
        "normalized_markets": normalized,
        "raw": {
            "gamma_events_pages": raw_pages,
        },
        "summary": {
            "status": status,
            "status_reason": status_reason,
            "raw_pages_fetched": len(raw_pages),
            "raw_events_fetched": sum(
                len(page["response"].get("json") or [])
                for page in raw_pages
                if isinstance(page["response"].get("json"), list)
            ),
            "normalized_market_count": len(normalized),
            "rejected_counts": dict(sorted(rejected_counts.items())),
            "warnings": warnings,
        },
        "fake_progress_risk": "Discovery can appear successful while silently missing markets if filters rely on stale text assumptions or bounded pagination. Raw pages and rejection counts are preserved so missed-market risk can be audited.",
        "next_step": "Checkpoint 4 should use this discovery output as input for a short, raw-first order-book snapshot sample; do not claim reliability until the later 24h soak test.",
    }
|
||||
|
||||
|
||||
def markdown_table_row(values: list[Any]) -> str:
    """Render ``values`` as one pipe-delimited Markdown table row.

    Each value is converted with ``str()``. Line breaks (``\\r\\n``, ``\\r``,
    ``\\n``) are flattened to a single space and literal ``|`` characters are
    backslash-escaped so a cell value cannot split the row into extra columns.
    """

    def cell(value: Any) -> str:
        text = str(value).replace("\r\n", " ").replace("\r", " ").replace("\n", " ")
        return text.replace("|", "\\|")

    return "| " + " | ".join(cell(value) for value in values) + " |"
|
||||
|
||||
|
||||
def write_markdown(discovery: dict[str, Any], path: Path) -> None:
    """Write the human-readable Markdown report for a discovery artifact.

    The report is assembled section by section (gate, scope, endpoint, summary
    table, markets table, warnings, rejection counts, raw-preservation note,
    risk and next step) and written to ``path`` as UTF-8; the parent directory
    is created when missing.
    """
    summary = discovery["summary"]
    rows = discovery["normalized_markets"]

    intro = [
        "# Polymarket BTC Markets Discovery",
        "",
        f"Artifact status: `{discovery['artifact_status']}`",
        "",
        "## Gate",
        "",
        f"Status: `{summary['status']}`",
        "",
        summary["status_reason"],
        "",
        "## Scope",
        "",
        "Bounded public Gamma metadata discovery only. No order-book collection, no trading, no private endpoints, no secrets.",
        "",
        "## Endpoint",
        "",
        f"- `GET {GAMMA_EVENTS_URL}`",
        # NOTE(review): tag_id is spelled out literally here; confirm it matches BTC_TAG_ID.
        "- Params: `tag_id=235`, `related_tags=true`, `active=true`, `closed=false`, `order=endDate`, `ascending=true`, bounded by `limit` and `max_pages`.",
        "",
    ]

    summary_metrics = ("raw_pages_fetched", "raw_events_fetched", "normalized_market_count")
    summary_section = [
        "## Summary",
        "",
        markdown_table_row(["Metric", "Value"]),
        markdown_table_row(["---", "---"]),
        markdown_table_row(["fetched_at_utc", discovery["fetched_at_utc"]]),
        *[markdown_table_row([metric, summary[metric]]) for metric in summary_metrics],
        "",
    ]

    market_columns = [
        "market_slug",
        "end_time_utc",
        "condition_id",
        "outcomes",
        "token_ids",
        "accepting_orders",
    ]
    markets_section = [
        "## Markets",
        "",
        markdown_table_row(market_columns),
        markdown_table_row(["---"] * len(market_columns)),
    ]
    for row in rows:
        token_ids = [token["token_id"] for token in row["tokens"]]
        cells = [
            row.get("market_slug"),
            row.get("end_time_utc"),
            row.get("condition_id"),
            json.dumps(row.get("outcomes")),
            json.dumps(token_ids),
            row.get("accepting_orders"),
        ]
        markets_section.append(markdown_table_row(cells))

    warnings_section = ["", "## Warnings", ""]
    if summary["warnings"]:
        warnings_section.extend(f"- {warning}" for warning in summary["warnings"])
    else:
        warnings_section.append("- None.")

    closing = [
        "",
        "## Rejection Counts",
        "",
        "```json",
        json.dumps(summary["rejected_counts"], indent=2, sort_keys=True),
        "```",
        "",
        "## Raw Preservation",
        "",
        "The latest JSON artifact stores raw Gamma response envelopes under `raw.gamma_events_pages`. Each normalized record has a `raw_ref` pointing back to the source event market.",
        "",
        "## Strongest Fake-Progress Risk",
        "",
        discovery["fake_progress_risk"],
        "",
        "## Next Smallest Step",
        "",
        discovery["next_step"],
        "",
    ]

    lines = intro + summary_section + markets_section + warnings_section + closing
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text("\n".join(lines), encoding="utf-8")
|
||||
|
||||
|
||||
def write_manifest(
    *,
    discovery: dict[str, Any],
    manifest_path: Path,
    output_json: Path,
    markdown_path: Path,
    command: str,
) -> None:
    """Write the run manifest that summarizes a discovery run and its outputs.

    Args:
        discovery: Artifact returned by ``discover``.
        manifest_path: Destination of the manifest JSON (parent is created).
        output_json: Discovery JSON artifact; hashed when it exists.
        markdown_path: Markdown report; hashed when it exists.
        command: Command line recorded verbatim in the manifest.
    """
    summary = discovery["summary"]
    status = summary["status"]

    def describe(artifact: Path, kind: str) -> dict[str, Any]:
        # "valid" means present and non-empty; the hash is recorded whenever
        # the file exists, even if it is empty.
        present = artifact.exists()
        return {
            "path": artifact.as_posix(),
            "kind": kind,
            "status": "valid" if present and artifact.stat().st_size else "missing",
            "sha256": sha256_file(artifact) if present else None,
        }

    output_files = [
        describe(output_json, "latest_discovery_json"),
        describe(markdown_path, "discovery_markdown"),
    ]
    # The script itself is listed only when found relative to the working directory.
    script_path = Path("scripts/discover_polymarket_btc_markets.py")
    if script_path.exists():
        output_files.append(
            {
                "path": script_path.as_posix(),
                "kind": "discovery_script",
                "status": "valid",
                "sha256": sha256_file(script_path),
            }
        )

    # Histogram of HTTP status codes (stringified, so a missing code is "None").
    status_codes: dict[str, int] = {}
    for page in discovery["raw"]["gamma_events_pages"]:
        code = str(page["response"].get("status_code"))
        status_codes[code] = status_codes.get(code, 0) + 1

    market_ids = []
    for row in discovery["normalized_markets"]:
        market_ids.append(
            {
                "market_slug": row.get("market_slug"),
                "condition_id": row.get("condition_id"),
                "token_ids": [token.get("token_id") for token in row.get("tokens", [])],
            }
        )

    required_record_fields = [
        "market_name",
        "market_slug",
        "question",
        "condition_id",
        "tokens",
        "outcomes",
        "start_time_utc",
        "end_time_utc",
        "active",
        "closed",
        "accepting_orders",
        "enable_order_book",
        "endpoint_source",
        "fetched_at_utc",
        "raw_ref",
    ]

    manifest = {
        "schema_name": "polymarket_btc_markets_manifest",
        "schema_version": 1,
        "checkpoint_id": 3,
        "checkpoint_name": "Minimal BTC Market Discovery",
        "status": status,
        "started_at_utc": discovery["started_at_utc"],
        "ended_at_utc": discovery["ended_at_utc"],
        "scope": discovery["scope"],
        "command": command,
        "endpoint": discovery["endpoint_basis"],
        "request_counts": {
            "gamma_events_pages": summary["raw_pages_fetched"],
            "status_code_counts": dict(sorted(status_codes.items())),
        },
        "row_counts": {
            "raw_events_fetched": summary["raw_events_fetched"],
            "normalized_markets": summary["normalized_market_count"],
        },
        "market_ids": market_ids,
        "output_files": output_files,
        "warnings": summary["warnings"],
        "validation": {
            "summary": summary["status_reason"],
            "required_record_fields": required_record_fields,
        },
        "fake_progress_risk": discovery["fake_progress_risk"],
        "next_step": discovery["next_step"],
    }
    manifest_path.parent.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(manifest, indent=2, sort_keys=True) + "\n"
    manifest_path.write_text(serialized, encoding="utf-8")
|
||||
|
||||
|
||||
def write_outputs(args: argparse.Namespace, discovery: dict[str, Any]) -> None:
    """Persist the discovery JSON, the Markdown report and the manifest.

    The JSON artifact is written first because the manifest hashes the files
    that already exist on disk. The command line stored in the manifest is
    shell-quoted so arguments containing spaces or metacharacters can be
    copied back into a shell unchanged.
    """
    import shlex  # local import: only needed to record the invocation

    args.output_json.parent.mkdir(parents=True, exist_ok=True)
    args.output_json.write_text(
        json.dumps(discovery, indent=2, sort_keys=True) + "\n",
        encoding="utf-8",
    )
    write_markdown(discovery, args.markdown)
    command = shlex.join([Path(sys.argv[0]).as_posix(), *sys.argv[1:]])
    write_manifest(
        discovery=discovery,
        manifest_path=args.manifest,
        output_json=args.output_json,
        markdown_path=args.markdown,
        command=command,
    )
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the discovery script.

    Options are declared from small tables; declaration order is kept so the
    generated ``--help`` output lists them in the same sequence.
    """
    parser = argparse.ArgumentParser(
        description="Discover active BTC up/down Polymarket markets from public Gamma metadata."
    )

    path_options = (
        ("--output-json", DEFAULT_OUTPUT_JSON),
        ("--manifest", DEFAULT_MANIFEST),
        ("--markdown", DEFAULT_MARKDOWN),
    )
    for flag, default in path_options:
        parser.add_argument(flag, type=Path, default=default)

    numeric_options = (
        ("--limit", int, 100),
        ("--max-pages", int, 3),
        ("--timeout", float, 15.0),
        ("--min-markets", int, 1),
    )
    for flag, converter, default in numeric_options:
        parser.add_argument(flag, type=converter, default=default)

    for flag in ("--allow-expired", "--allow-non-accepting-orders"):
        parser.add_argument(flag, action="store_true")

    return parser.parse_args()
|
||||
|
||||
|
||||
def main() -> int:
    """Run discovery, write all artifacts and print a JSON summary.

    Returns:
        ``0`` when the discovery status is ``"PASS"``, otherwise ``1``.
    """
    args = parse_args()
    discovery = discover(args)
    write_outputs(args, discovery)

    summary = discovery["summary"]
    markets = []
    for row in discovery["normalized_markets"]:
        markets.append(
            {
                "market_slug": row.get("market_slug"),
                "condition_id": row.get("condition_id"),
                "token_ids": [token.get("token_id") for token in row.get("tokens", [])],
                "end_time_utc": row.get("end_time_utc"),
            }
        )
    report = {
        "status": summary["status"],
        "status_reason": summary["status_reason"],
        "output_json": args.output_json.as_posix(),
        "manifest": args.manifest.as_posix(),
        "markdown": args.markdown.as_posix(),
        "normalized_market_count": summary["normalized_market_count"],
        "markets": markets,
        "warnings": summary["warnings"],
    }
    print(json.dumps(report, indent=2, sort_keys=True))

    if summary["status"] == "PASS":
        return 0
    return 1


if __name__ == "__main__":
    raise SystemExit(main())
|
||||
466
scripts/k8s_runtime_smoke_check.sh
Executable file
466
scripts/k8s_runtime_smoke_check.sh
Executable file
|
|
@ -0,0 +1,466 @@
|
|||
#!/usr/bin/env bash
# Unset variables are errors and pipeline failures propagate; errexit is not enabled.
set -u -o pipefail

# Cluster objects under test; each default can be overridden from the environment.
NAMESPACE="${ORDERBOOKS_K8S_NAMESPACE:-orderbooks}"
DEPLOYMENT="${ORDERBOOKS_K8S_COLLECTOR_DEPLOYMENT:-orderbooks-collector}"
CRONJOB="${ORDERBOOKS_K8S_UPLOADER_CRONJOB:-orderbooks-uploader}"

# Paths as seen from inside the collector pod.
RAW_DIR="${ORDERBOOKS_K8S_RAW_DIR:-/var/lib/orderbooks/raw_orderbooks}"
MANIFEST_DIR="${ORDERBOOKS_K8S_MANIFEST_DIR:-/var/lib/orderbooks/manifests}"

# Timing knobs, in seconds.
WAIT_SECONDS="${ORDERBOOKS_K8S_SMOKE_WAIT_SECONDS:-1200}"
UPLOAD_MIN_AGE_SECONDS="${ORDERBOOKS_UPLOAD_MIN_AGE_SECONDS:-600}"

KUBECTL_BIN="${ORDERBOOKS_KUBECTL:-kubectl}"

# UTC timestamp that makes the default evidence file name unique per run.
RUN_ID="$(date -u '+%Y%m%dT%H%M%SZ')"
EVIDENCE_PATH="${ORDERBOOKS_K8S_SMOKE_EVIDENCE_PATH:-data/manifests/k8s_runtime_smoke_${RUN_ID}.json}"
|
||||
|
||||
# Print the help text to stdout; callers redirect it to stderr on errors.
usage() {
  cat <<'USAGE_TEXT'
Usage: scripts/k8s_runtime_smoke_check.sh [options]

Run after the orderbooks Kubernetes workload is deployed. The script uses
kubectl, writes local JSON evidence, deletes one collector pod to force a
Deployment restart, verifies raw gzip JSONL files and manifests on the PVC,
then triggers the uploader CronJob and requires a verified upload manifest.

Options:
  --namespace NAME        Namespace. Default: orderbooks.
  --deployment NAME       Collector deployment. Default: orderbooks-collector.
  --cronjob NAME          Uploader CronJob. Default: orderbooks-uploader.
  --raw-dir PATH          Raw path inside collector pod. Default: /var/lib/orderbooks/raw_orderbooks.
  --manifest-dir PATH     Manifest path inside collector pod. Default: /var/lib/orderbooks/manifests.
  --wait-seconds N        Max wait for collector/upload evidence. Default: 1200.
  --upload-min-age-seconds N
                          Wait for at least one raw/manifest file to be this old before upload. Default: 600.
  --evidence-path PATH    Local JSON evidence path.
  --kubectl PATH          kubectl binary. Default: kubectl.
  --help                  Show this help.

This script does not read or print rclone config contents.
USAGE_TEXT
}
|
||||
|
||||
# Apply command-line overrides on top of the environment defaults.
# Every option except --help takes exactly one value; a trailing option with
# no value is reported explicitly instead of tripping `set -u` on "$2".
while [[ $# -gt 0 ]]; do
  case "$1" in
    --help)
      usage
      exit 0
      ;;
    --namespace | --deployment | --cronjob | --raw-dir | --manifest-dir | --wait-seconds | --upload-min-age-seconds | --evidence-path | --kubectl)
      if [[ $# -lt 2 ]]; then
        echo "Missing value for $1" >&2
        usage >&2
        exit 2
      fi
      case "$1" in
        --namespace) NAMESPACE="$2" ;;
        --deployment) DEPLOYMENT="$2" ;;
        --cronjob) CRONJOB="$2" ;;
        --raw-dir) RAW_DIR="$2" ;;
        --manifest-dir) MANIFEST_DIR="$2" ;;
        --wait-seconds) WAIT_SECONDS="$2" ;;
        --upload-min-age-seconds) UPLOAD_MIN_AGE_SECONDS="$2" ;;
        --evidence-path) EVIDENCE_PATH="$2" ;;
        --kubectl) KUBECTL_BIN="$2" ;;
      esac
      shift 2
      ;;
    *)
      echo "Unknown argument: $1" >&2
      usage >&2
      exit 2
      ;;
  esac
done

# The evidence file is written later; make sure its directory exists.
mkdir -p "$(dirname "${EVIDENCE_PATH}")"
|
||||
|
||||
PYTHONDONTWRITEBYTECODE=1 python3 - "$KUBECTL_BIN" "$NAMESPACE" "$DEPLOYMENT" "$CRONJOB" "$RAW_DIR" "$MANIFEST_DIR" "$WAIT_SECONDS" "$UPLOAD_MIN_AGE_SECONDS" "$EVIDENCE_PATH" <<'PY_SMOKE'
|
||||
import datetime as dt
|
||||
import json
|
||||
import subprocess
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
# Positional arguments are supplied, in this order, by the wrapping shell script.
kubectl = sys.argv[1]
namespace = sys.argv[2]
deployment = sys.argv[3]
cronjob = sys.argv[4]
raw_dir = sys.argv[5]
manifest_dir = sys.argv[6]
wait_seconds = int(sys.argv[7])
upload_min_age_seconds = int(sys.argv[8])
evidence_path = Path(sys.argv[9])
# dt.timezone.utc instead of dt.UTC: the alias only exists on Python >= 3.11.
started_at = dt.datetime.now(dt.timezone.utc).replace(microsecond=0).isoformat().replace('+00:00', 'Z')
# Every executed command is appended to `checks` by capture().
checks = []
failures = []
|
||||
|
||||
def iso_now():
    """Return the current UTC time as ``YYYY-MM-DDTHH:MM:SSZ`` (second precision)."""
    # dt.timezone.utc instead of dt.UTC: the alias only exists on Python >= 3.11.
    now = dt.datetime.now(dt.timezone.utc).replace(microsecond=0)
    return now.isoformat().replace('+00:00', 'Z')
|
||||
|
||||
|
||||
def capture(command, input_text=None, timeout=None):
    """Run ``command`` and log it in ``checks``.

    Returns a ``(CompletedProcess, record)`` tuple. The record keeps the exit
    code and only the last 6000 characters of stdout and stderr.
    """
    completed = subprocess.run(
        command,
        input=input_text,
        text=True,
        capture_output=True,
        timeout=timeout,
    )
    record = dict(
        command=command,
        exit_code=completed.returncode,
        stdout_tail=completed.stdout[-6000:],
        stderr_tail=completed.stderr[-6000:],
        ran_at_utc=iso_now(),
    )
    checks.append(record)
    return completed, record
|
||||
|
||||
|
||||
def run(command, input_text=None, timeout=None):
    """Run ``command`` through ``capture`` and return only its log record."""
    return capture(command, input_text=input_text, timeout=timeout)[1]
|
||||
|
||||
|
||||
def run_json(command, input_text=None, timeout=None):
    """Run ``command`` and decode its stdout as JSON.

    Raises:
        RuntimeError: when the command exits with a non-zero status.
    """
    completed, record = capture(command, input_text=input_text, timeout=timeout)
    if record['exit_code'] == 0:
        return json.loads(completed.stdout)
    raise RuntimeError(f"command failed: {' '.join(command)}")
|
||||
|
||||
|
||||
def pod_ready(pod):
    """Return True when ``pod`` is Running, not terminating, and fully ready.

    ``pod`` is a pod object as decoded from ``kubectl get pods -o json``. A pod
    being deleted keeps phase ``Running`` and ready containers until it is
    actually stopped, so a set ``metadata.deletionTimestamp`` is treated as not
    ready; otherwise a pod that is on its way out could be selected.
    """
    if (pod.get('metadata') or {}).get('deletionTimestamp'):
        return False
    status = pod.get('status', {})
    if status.get('phase') != 'Running':
        return False
    statuses = status.get('containerStatuses') or []
    # No reported containers counts as not ready.
    return bool(statuses) and all(item.get('ready') for item in statuses)
|
||||
|
||||
|
||||
def get_collector_pod():
    """Poll until a ready collector pod exists and return ``(name, pod)``.

    Pods are listed by label selector every 10 seconds for up to
    ``wait_seconds``; when several are ready, the one with the greatest
    ``creationTimestamp`` is returned.

    Raises:
        TimeoutError: when no ready pod appears before the deadline.
    """
    selector = ','.join([
        'app.kubernetes.io/name=orderbooks',
        'app.kubernetes.io/component=collector',
    ])
    list_command = [kubectl, '-n', namespace, 'get', 'pods', '-l', selector, '-o', 'json']
    give_up_at = time.time() + wait_seconds
    last = None
    while time.time() <= give_up_at:
        items = run_json(list_command).get('items', [])
        ready = [pod for pod in items if pod_ready(pod)]
        if ready:
            newest = sorted(ready, key=lambda pod: pod.get('metadata', {}).get('creationTimestamp', ''))[-1]
            return newest['metadata']['name'], newest
        last = items
        time.sleep(10)
    raise TimeoutError(f'no ready collector pod found; last pods={last}')
|
||||
|
||||
|
||||
def exec_python(pod, code, args):
    """Run ``code`` with ``python3 -`` inside ``pod`` and decode its JSON stdout.

    The program text is piped over stdin via ``kubectl exec -i``; ``args``
    become its positional arguments.

    Raises:
        RuntimeError: when the in-pod interpreter exits with a non-zero status.
    """
    command = [kubectl, '-n', namespace, 'exec', '-i', pod, '--', 'python3', '-']
    command.extend(args)
    completed, record = capture(command, input_text=code, timeout=wait_seconds + 60)
    if record['exit_code'] == 0:
        return json.loads(completed.stdout)
    raise RuntimeError(f"pod python command failed in {pod}: {record['stderr_tail']}")
|
||||
|
||||
|
||||
def wait_for_valid_collector(pod, after_mtime, label):
    """Poll the pod until a collector manifest newer than ``after_mtime`` validates.

    The in-pod validation script is retried every 15 seconds for up to
    ``wait_seconds``; any exception from a single attempt is remembered and the
    loop continues. The successful result is tagged with ``wait_label``.

    Raises:
        TimeoutError: when no attempt reports ``valid`` before the deadline.
    """
    script_args = [manifest_dir, raw_dir, str(after_mtime)]
    give_up_at = time.time() + wait_seconds
    last_error = None
    while time.time() <= give_up_at:
        try:
            outcome = exec_python(pod, collector_validation_code, script_args)
            if not outcome.get('valid'):
                last_error = outcome
            else:
                outcome['wait_label'] = label
                return outcome
        except Exception as exc:
            last_error = repr(exc)
        time.sleep(15)
    raise TimeoutError(f'no valid {label} collector manifest found before timeout: {last_error}')
|
||||
|
||||
|
||||
def wait_for_upload_eligible_files(pod):
    """Poll the pod until the eligibility script reports ``eligible``.

    The in-pod script is invoked with the raw directory, the manifest directory
    and ``upload_min_age_seconds``, every 15 seconds for up to ``wait_seconds``.

    Raises:
        TimeoutError: when nothing becomes eligible before the deadline.
    """
    script_args = [raw_dir, manifest_dir, str(upload_min_age_seconds)]
    give_up_at = time.time() + wait_seconds
    last = None
    while time.time() <= give_up_at:
        outcome = exec_python(pod, upload_eligibility_code, script_args)
        if outcome.get('eligible'):
            return outcome
        last = outcome
        time.sleep(15)
    raise TimeoutError(f'no upload-eligible raw/manifest files before timeout: {last}')
|
||||
|
||||
collector_validation_code = r'''
|
||||
import gzip
|
||||
import hashlib
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
manifest_dir = Path(sys.argv[1])
|
||||
raw_dir = Path(sys.argv[2])
|
||||
after_mtime = float(sys.argv[3])
|
||||
|
||||
def sha256(path):
|
||||
digest = hashlib.sha256()
|
||||
with path.open('rb') as handle:
|
||||
for chunk in iter(lambda: handle.read(1024 * 1024), b''):
|
||||
digest.update(chunk)
|
||||
return digest.hexdigest()
|
||||
|
||||
|
||||
def parse_raw(path):
|
||||
rows = 0
|
||||
first_keys = []
|
||||
with gzip.open(path, 'rt', encoding='utf-8') as handle:
|
||||
for line in handle:
|
||||
if not line.strip():
|
||||
continue
|
||||
obj = json.loads(line)
|
||||
if rows == 0:
|
||||
first_keys = sorted(obj.keys())
|
||||
rows += 1
|
||||
return rows, first_keys
|
||||
|
||||
|
||||
def validate(path):
|
||||
manifest = json.loads(path.read_text(encoding='utf-8'))
|
||||
output_files = []
|
||||
for item in manifest.get('output_files', []):
|
||||
raw_path = Path(item['path'])
|
||||
rows, first_keys = parse_raw(raw_path)
|
||||
actual_sha = sha256(raw_path)
|
||||
output_files.append({
|
||||
'path': str(raw_path),
|
||||
'bytes': raw_path.stat().st_size,
|
||||
'mtime': raw_path.stat().st_mtime,
|
||||
'manifest_rows': item.get('rows'),
|
||||
'rows_parsed': rows,
|
||||
'row_count_matches_manifest': rows == item.get('rows'),
|
||||
'manifest_sha256': item.get('sha256'),
|
||||
'actual_sha256': actual_sha,
|
||||
'sha256_matches_manifest': actual_sha == item.get('sha256'),
|
||||
'under_raw_dir': raw_path.resolve().is_relative_to(raw_dir.resolve()),
|
||||
'first_row_keys': first_keys,
|
||||
})
|
||||
valid = (
|
||||
manifest.get('gate_status') == 'PASS'
|
||||
and manifest.get('rows_written', 0) > 0
|
||||
and manifest.get('failure_count') == 0
|
||||
and not manifest.get('failures')
|
||||
and bool(output_files)
|
||||
and all(item['rows_parsed'] > 0 and item['row_count_matches_manifest'] and item['sha256_matches_manifest'] and item['under_raw_dir'] for item in output_files)
|
||||
)
|
||||
return {
|
||||
'path': str(path),
|
||||
'mtime': path.stat().st_mtime,
|
||||
'manifest_summary': {
|
||||
'gate_status': manifest.get('gate_status'),
|
||||
'rows_written': manifest.get('rows_written'),
|
||||
'failure_count': manifest.get('failure_count'),
|
||||
'failures_present': bool(manifest.get('failures')),
|
||||
'output_file_count': len(manifest.get('output_files', [])),
|
||||
'started_at_utc': manifest.get('started_at_utc'),
|
||||
'ended_at_utc': manifest.get('ended_at_utc'),
|
||||
},
|
||||
'output_files': output_files,
|
||||
'valid': valid,
|
||||
}
|
||||
|
||||
candidates = sorted(manifest_dir.glob('polymarket_orderbook_collector_*.json'), key=lambda p: p.stat().st_mtime)
|
||||
candidates = [path for path in candidates if path.stat().st_mtime > after_mtime]
|
||||
latest = None
|
||||
for path in reversed(candidates):
|
||||
try:
|
||||
result = validate(path)
|
||||
except Exception as exc:
|
||||
latest = {'path': str(path), 'valid': False, 'error': repr(exc)}
|
||||
continue
|
||||
latest = result
|
||||
if result['valid']:
|
||||
print(json.dumps(result, sort_keys=True))
|
||||
sys.exit(0)
|
||||
print(json.dumps(latest or {'valid': False, 'error': 'no collector manifest candidates'}, sort_keys=True))
|
||||
sys.exit(2)
|
||||
'''
|
||||
|
||||
raw_check_code = r'''
|
||||
import gzip
|
||||
import hashlib
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
path = Path(sys.argv[1])
|
||||
expected_sha = sys.argv[2]
|
||||
expected_rows = int(sys.argv[3])
|
||||
|
||||
def sha256(path):
|
||||
digest = hashlib.sha256()
|
||||
with path.open('rb') as handle:
|
||||
for chunk in iter(lambda: handle.read(1024 * 1024), b''):
|
||||
digest.update(chunk)
|
||||
return digest.hexdigest()
|
||||
|
||||
rows = 0
|
||||
with gzip.open(path, 'rt', encoding='utf-8') as handle:
|
||||
for line in handle:
|
||||
if line.strip():
|
||||
json.loads(line)
|
||||
rows += 1
|
||||
actual_sha = sha256(path)
|
||||
print(json.dumps({
|
||||
'path': str(path),
|
||||
'expected_sha256': expected_sha,
|
||||
'actual_sha256': actual_sha,
|
||||
'sha256_matches': actual_sha == expected_sha,
|
||||
'expected_rows': expected_rows,
|
||||
'actual_rows': rows,
|
||||
'row_count_matches': rows == expected_rows,
|
||||
}, sort_keys=True))
|
||||
'''
|
||||
|
||||
upload_validation_code = r'''
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
manifest_dir = Path(sys.argv[1])
|
||||
after_mtime = float(sys.argv[2])
|
||||
candidates = sorted(manifest_dir.glob('upload_archive_*.json'), key=lambda p: p.stat().st_mtime)
|
||||
candidates = [path for path in candidates if path.stat().st_mtime >= after_mtime]
|
||||
if not candidates:
|
||||
print(json.dumps({'valid': False, 'error': 'no upload manifest candidates'}, sort_keys=True))
|
||||
sys.exit(2)
|
||||
path = candidates[-1]
|
||||
manifest = json.loads(path.read_text(encoding='utf-8'))
|
||||
verified_count = manifest.get('counts', {}).get('verified', len(manifest.get('verified_files', [])))
|
||||
valid = (
|
||||
manifest.get('operation_status') == 'UPLOAD_VERIFIED'
|
||||
and manifest.get('gate_status') == 'PASS'
|
||||
and manifest.get('rclone', {}).get('copy_exit_code') == 0
|
||||
and manifest.get('rclone', {}).get('check_exit_code') == 0
|
||||
and verified_count > 0
|
||||
)
|
||||
verified_files = manifest.get('verified_files', [])
|
||||
print(json.dumps({
|
||||
'path': str(path),
|
||||
'mtime': path.stat().st_mtime,
|
||||
'manifest_summary': {
|
||||
'operation_status': manifest.get('operation_status'),
|
||||
'gate_status': manifest.get('gate_status'),
|
||||
'counts': manifest.get('counts', {}),
|
||||
'planned_file_count': len(manifest.get('planned_files', [])),
|
||||
'attempted_file_count': len(manifest.get('attempted_files', [])),
|
||||
'uploaded_file_count': len(manifest.get('uploaded_files', [])),
|
||||
'verified_file_count': verified_count,
|
||||
'rclone_copy_exit_code': manifest.get('rclone', {}).get('copy_exit_code'),
|
||||
'rclone_check_exit_code': manifest.get('rclone', {}).get('check_exit_code'),
|
||||
'started_at_utc': manifest.get('started_at_utc'),
|
||||
'ended_at_utc': manifest.get('ended_at_utc'),
|
||||
},
|
||||
'verified_count': verified_count,
|
||||
'verified_file_samples': [
|
||||
{
|
||||
'relative_path': item.get('relative_path'),
|
||||
'bytes': item.get('bytes'),
|
||||
'sha256': item.get('sha256'),
|
||||
'kind': item.get('kind'),
|
||||
}
|
||||
for item in verified_files[:5]
|
||||
],
|
||||
'valid': valid,
|
||||
}, sort_keys=True))
|
||||
if not valid:
|
||||
sys.exit(2)
|
||||
'''
|
||||
|
||||
upload_eligibility_code = r'''
|
||||
import json
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
raw_dir = Path(sys.argv[1])
|
||||
manifest_dir = Path(sys.argv[2])
|
||||
min_age_seconds = int(sys.argv[3])
|
||||
now = time.time()
|
||||
|
||||
def eligible_files(root, pattern):
|
||||
if not root.exists():
|
||||
return []
|
||||
items = []
|
||||
for path in sorted(root.rglob(pattern)):
|
||||
if not path.is_file():
|
||||
continue
|
||||
age = max(0, int(now - path.stat().st_mtime))
|
||||
if age >= min_age_seconds:
|
||||
items.append({'path': str(path), 'bytes': path.stat().st_size, 'age_seconds': age})
|
||||
return items
|
||||
|
||||
raw_files = eligible_files(raw_dir, '*.jsonl.gz')
|
||||
manifest_files = eligible_files(manifest_dir, 'polymarket_orderbook_collector_*.json')
|
||||
print(json.dumps({
|
||||
'eligible': bool(raw_files) and bool(manifest_files),
|
||||
'min_age_seconds': min_age_seconds,
|
||||
'raw_eligible_count': len(raw_files),
|
||||
'manifest_eligible_count': len(manifest_files),
|
||||
'raw_sample': raw_files[:3],
|
||||
'manifest_sample': manifest_files[:3],
|
||||
}, sort_keys=True))
|
||||
'''
|
||||
|
||||
summary = {
|
||||
'schema_name': 'k8s_runtime_smoke_result',
|
||||
'schema_version': 1,
|
||||
'started_at_utc': started_at,
|
||||
'ended_at_utc': None,
|
||||
'gate_status': 'ERROR',
|
||||
'production_ready': False,
|
||||
'namespace': namespace,
|
||||
'deployment': deployment,
|
||||
'cronjob': cronjob,
|
||||
'raw_dir': raw_dir,
|
||||
'manifest_dir': manifest_dir,
|
||||
'upload_min_age_seconds': upload_min_age_seconds,
|
||||
'checks': checks,
|
||||
'failures': failures,
|
||||
}
|
||||
|
||||
try:
|
||||
rollout = run([kubectl, '-n', namespace, 'rollout', 'status', f'deployment/{deployment}', f'--timeout={wait_seconds}s'])
|
||||
if rollout['exit_code'] != 0:
|
||||
raise RuntimeError('collector deployment rollout is not healthy')
|
||||
pod_name, pod_obj = get_collector_pod()
|
||||
before = wait_for_valid_collector(pod_name, 0, 'initial')
|
||||
before_mtime = before['mtime']
|
||||
old_file = before['output_files'][0]
|
||||
|
||||
delete_pod = run([kubectl, '-n', namespace, 'delete', 'pod', pod_name, '--wait=false'])
|
||||
if delete_pod['exit_code'] != 0:
|
||||
raise RuntimeError('failed to delete collector pod for restart test')
|
||||
rollout_after = run([kubectl, '-n', namespace, 'rollout', 'status', f'deployment/{deployment}', f'--timeout={wait_seconds}s'])
|
||||
if rollout_after['exit_code'] != 0:
|
||||
raise RuntimeError('collector deployment did not recover after pod delete')
|
||||
new_pod, new_pod_obj = get_collector_pod()
|
||||
old_check = exec_python(new_pod, raw_check_code, [old_file['path'], old_file['actual_sha256'], str(old_file['rows_parsed'])])
|
||||
if not old_check.get('sha256_matches') or not old_check.get('row_count_matches'):
|
||||
raise RuntimeError('old raw file changed or stopped parsing after pod restart')
|
||||
|
||||
after = wait_for_valid_collector(new_pod, before_mtime, 'post_restart')
|
||||
upload_eligibility = wait_for_upload_eligible_files(new_pod)
|
||||
|
||||
upload_start_mtime = time.time() - 2
|
||||
job_name = 'orderbooks-uploader-smoke-' + dt.datetime.now(dt.UTC).strftime('%Y%m%dt%H%M%Sz').lower()
|
||||
run([kubectl, '-n', namespace, 'delete', 'job', job_name, '--ignore-not-found=true'])
|
||||
create_job = run([kubectl, '-n', namespace, 'create', 'job', job_name, f'--from=cronjob/{cronjob}'])
|
||||
if create_job['exit_code'] != 0:
|
||||
raise RuntimeError('failed to create uploader smoke job from CronJob')
|
||||
wait_upload = run([kubectl, '-n', namespace, 'wait', '--for=condition=Complete', f'--timeout={wait_seconds}s', f'job/{job_name}'])
|
||||
logs = run([kubectl, '-n', namespace, 'logs', f'job/{job_name}'])
|
||||
if wait_upload['exit_code'] != 0:
|
||||
raise RuntimeError('uploader smoke job did not complete')
|
||||
upload = exec_python(new_pod, upload_validation_code, [manifest_dir, str(upload_start_mtime)])
|
||||
if not upload.get('valid'):
|
||||
raise RuntimeError('upload manifest did not verify at least one file')
|
||||
|
||||
summary.update({
|
||||
'initial_collector_pod': pod_name,
|
||||
'post_restart_collector_pod': new_pod,
|
||||
'before_restart_collector': before,
|
||||
'old_raw_file_after_restart': old_check,
|
||||
'after_restart_collector': after,
|
||||
'upload_eligibility': upload_eligibility,
|
||||
'uploader_job': job_name,
|
||||
'upload_result': upload,
|
||||
'uploader_log_check_exit_code': logs['exit_code'],
|
||||
})
|
||||
summary['gate_status'] = 'PASS'
|
||||
except Exception as exc:
|
||||
failures.append(str(exc))
|
||||
summary['exception'] = repr(exc)
|
||||
summary['gate_status'] = 'FAIL'
|
||||
finally:
|
||||
summary['ended_at_utc'] = iso_now()
|
||||
evidence_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
evidence_path.write_text(json.dumps(summary, indent=2, sort_keys=True) + '\n', encoding='utf-8')
|
||||
|
||||
print(f'K8S_SMOKE_EVIDENCE={evidence_path}')
|
||||
print(f'K8S_SMOKE_GATE={summary["gate_status"]}')
|
||||
if summary['gate_status'] != 'PASS':
|
||||
sys.exit(1)
|
||||
PY_SMOKE
|
||||
496
scripts/normalize_polymarket_orderbooks.py
Normal file
496
scripts/normalize_polymarket_orderbooks.py
Normal file
|
|
@ -0,0 +1,496 @@
|
|||
#!/usr/bin/env python3
|
||||
"""Normalize raw Polymarket order-book snapshots from the sample collector.
|
||||
|
||||
Checkpoint 5 scope: derive a bounded normalized gzip JSONL sample from the raw
|
||||
Checkpoint 4 sample. Raw files remain the source of truth; every normalized row
|
||||
keeps the raw file path and gzip JSONL line number.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import datetime as dt
|
||||
import gzip
|
||||
import hashlib
|
||||
import json
|
||||
import sys
|
||||
from decimal import Decimal, InvalidOperation, getcontext
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
|
||||
# Identity of this tool; main() records both values under "normalizer" in the manifest.
NORMALIZER_NAME = "polymarket_orderbook_normalizer"
NORMALIZER_VERSION = "0.1.0"
# Schema tag stamped onto every normalized row by normalize_raw_row().
SCHEMA_NAME = "normalized_orderbook_snapshot"
SCHEMA_VERSION = 1

# Default command-line locations used by parse_args().
DEFAULT_INPUT_MANIFEST = Path("data/manifests/orderbook_collector_sample_manifest.json")
DEFAULT_OUTPUT_DIR = Path("data/normalized_sample")
DEFAULT_MANIFEST_PATH = Path("data/manifests/orderbook_normalization_sample_manifest.json")

# Price offsets from the best bid / best ask used for the
# "*_depth_within_<label>" columns of a normalized row.
CENT_OFFSETS = {
    "1c": Decimal("0.01"),
    "2c": Decimal("0.02"),
    "5c": Decimal("0.05"),
}

# Sensitive terms that scan_for_secret_terms() searches for, case-insensitively.
# Each term is assembled from adjacent string fragments so the complete term
# never appears verbatim in this file, which main() passes to the scanner.
# Keep the fragments split when editing.
SECRET_PATTERNS = (
    "set-" "coo" "kie",
    "__cf" "_bm",
    "cf" "_bm",
    "author" "ization",
    "private" "_key",
    "api" "_secret",
    "poly" "_signature",
    "poly" "_passphrase",
    "poly" "_address",
    "bear" "er",
    "coo" "kie",
    "wallet" " material",
)

# Decimal arithmetic in this process uses 50 significant digits.
getcontext().prec = 50
|
||||
|
||||
|
||||
def utc_now() -> dt.datetime:
|
||||
return dt.datetime.now(dt.UTC)
|
||||
|
||||
|
||||
def iso_z(value: dt.datetime | None = None) -> str:
|
||||
value = value or utc_now()
|
||||
return value.astimezone(dt.UTC).replace(microsecond=0).isoformat().replace("+00:00", "Z")
|
||||
|
||||
|
||||
def compact_timestamp(value: dt.datetime | None = None) -> str:
|
||||
value = value or utc_now()
|
||||
return value.astimezone(dt.UTC).strftime("%Y%m%dT%H%M%SZ")
|
||||
|
||||
|
||||
def sha256_file(path: Path) -> str:
    """Return the hexadecimal SHA-256 digest of a file's bytes.

    The file is read in 1 MiB blocks so large inputs are never held in memory
    all at once.
    """
    block_size = 1024 * 1024
    hasher = hashlib.sha256()
    with path.open("rb") as stream:
        while True:
            block = stream.read(block_size)
            if not block:
                break
            hasher.update(block)
    return hasher.hexdigest()
|
||||
|
||||
|
||||
def decimal_from_raw(value: Any, field_name: str) -> Decimal:
    """Parse a raw string field into a finite Decimal.

    Args:
        value: The raw value; only ``str`` instances are accepted.
        field_name: Label used in error messages to identify the field.

    Returns:
        The parsed, finite Decimal.

    Raises:
        ValueError: If the value is not a string, cannot be parsed as a
            decimal, or parses to NaN or an infinity.
    """
    if isinstance(value, str):
        try:
            number = Decimal(value)
        except InvalidOperation as exc:
            raise ValueError(f"{field_name} is not a decimal: {value!r}") from exc
        if number.is_finite():
            return number
        raise ValueError(f"{field_name} is not finite: {value!r}")
    raise ValueError(f"{field_name} is not a string: {value!r}")
|
||||
|
||||
|
||||
def decimal_to_json(value: Decimal | None) -> str | None:
|
||||
if value is None:
|
||||
return None
|
||||
if value == 0:
|
||||
return "0"
|
||||
return format(value.normalize(), "f")
|
||||
|
||||
|
||||
def load_json(path: Path) -> dict[str, Any]:
    """Read a UTF-8 JSON file whose top-level value must be an object.

    Raises:
        ValueError: If the document parses but its top level is not an object.
    """
    payload = json.loads(path.read_text(encoding="utf-8"))
    if isinstance(payload, dict):
        return payload
    raise ValueError(f"{path} did not contain a JSON object")
|
||||
|
||||
|
||||
def resolve_repo_path(path_text: str) -> Path:
    """Turn manifest path text into a Path anchored at the working directory.

    Absolute paths are returned unchanged; relative ones are joined onto the
    current working directory.
    """
    candidate = Path(path_text)
    return candidate if candidate.is_absolute() else Path.cwd() / candidate
|
||||
|
||||
|
||||
def normalize_side(levels: Any, side_name: str) -> list[tuple[Decimal, Decimal]]:
    """Convert one side of a raw book into ``(price, size)`` Decimal pairs.

    Args:
        levels: The raw list of level objects, each carrying string "price"
            and "size" fields.
        side_name: "bids" or "asks"; used only to label error messages.

    Returns:
        The levels in their original order as ``(price, size)`` tuples.

    Raises:
        ValueError: If ``levels`` is not a list, a level is not an object, a
            price or size fails ``decimal_from_raw``, or a size is negative.
    """
    if not isinstance(levels, list):
        raise ValueError(f"raw.{side_name} is not a list")
    parsed: list[tuple[Decimal, Decimal]] = []
    for position, entry in enumerate(levels):
        label = f"raw.{side_name}[{position}]"
        if not isinstance(entry, dict):
            raise ValueError(f"{label} is not an object")
        price = decimal_from_raw(entry.get("price"), f"{label}.price")
        size = decimal_from_raw(entry.get("size"), f"{label}.size")
        if size < 0:
            raise ValueError(f"{label}.size is negative")
        parsed.append((price, size))
    return parsed
|
||||
|
||||
|
||||
def sum_sizes(levels: list[tuple[Decimal, Decimal]]) -> Decimal:
    """Add up the size component of every ``(price, size)`` level."""
    total = Decimal("0")
    for _price, size in levels:
        total = total + size
    return total
|
||||
|
||||
|
||||
def normalize_raw_row(raw_row: dict[str, Any], raw_file: str, raw_line_number: int) -> dict[str, Any]:
    """Derive one normalized snapshot row from one raw collector row.

    Args:
        raw_row: Parsed raw JSONL row; must carry "raw", "market" and
            "collection" objects.
        raw_file: Path text of the raw file the row came from (kept verbatim).
        raw_line_number: 1-based line number of the row within that file.

    Returns:
        A dict with the schema tag, market identity fields, best bid/ask,
        spread, midpoint, total depth per side, depth within each
        ``CENT_OFFSETS`` band per side, and the raw file/line reference.
        Decimal values are encoded with ``decimal_to_json``; price-derived
        values are ``None`` when a side of the book is empty.

    Raises:
        ValueError: If a required section is not an object or a book side
            fails ``normalize_side``.
    """
    raw_book = raw_row.get("raw")
    market = raw_row.get("market")
    collection = raw_row.get("collection")
    for label, section in (("raw", raw_book), ("market", market), ("collection", collection)):
        if not isinstance(section, dict):
            raise ValueError(f"{label} is not an object")

    bids = normalize_side(raw_book.get("bids"), "bids")
    asks = normalize_side(raw_book.get("asks"), "asks")

    bid_prices = [price for price, _ in bids]
    ask_prices = [price for price, _ in asks]
    best_bid = max(bid_prices) if bid_prices else None
    best_ask = min(ask_prices) if ask_prices else None

    # Spread and midpoint only exist when both sides have at least one level.
    if best_bid is None or best_ask is None:
        spread = None
        midpoint = None
    else:
        spread = best_ask - best_bid
        midpoint = (best_bid + best_ask) / Decimal("2")

    row: dict[str, Any] = {
        "schema_name": SCHEMA_NAME,
        "schema_version": SCHEMA_VERSION,
        "market_name": market.get("market_name"),
        "market_slug": market.get("market_slug"),
        "condition_id": market.get("condition_id"),
        "token_id": market.get("token_id"),
        "outcome": market.get("outcome"),
        "collected_at_utc": collection.get("collected_at_utc"),
        "best_bid": decimal_to_json(best_bid),
        "best_ask": decimal_to_json(best_ask),
        "spread": decimal_to_json(spread),
        "midpoint": decimal_to_json(midpoint),
        "bid_depth_total": decimal_to_json(sum_sizes(bids)),
        "ask_depth_total": decimal_to_json(sum_sizes(asks)),
        "raw_file": raw_file,
        "raw_line_number": raw_line_number,
    }

    for label, offset in CENT_OFFSETS.items():
        if best_bid is None:
            bid_depth = Decimal("0")
        else:
            floor = best_bid - offset
            bid_depth = sum_sizes([level for level in bids if level[0] >= floor])
        if best_ask is None:
            ask_depth = Decimal("0")
        else:
            ceiling = best_ask + offset
            ask_depth = sum_sizes([level for level in asks if level[0] <= ceiling])
        row[f"bid_depth_within_{label}"] = decimal_to_json(bid_depth)
        row[f"ask_depth_within_{label}"] = decimal_to_json(ask_depth)

    return row
|
||||
|
||||
|
||||
def summarize_output(path: Path, rows: int) -> dict[str, Any]:
    """Describe the normalized output file for the manifest.

    Args:
        path: Location of the written output file.
        rows: Number of normalized rows written to it.

    Returns:
        A dict with the display path, row count, size in bytes, SHA-256
        digest and a fixed "valid" status.

    The display path is relative to the working directory when the file lives
    under it. An absolute path elsewhere (for example an absolute
    ``--output-dir``) is reported unchanged instead of raising ``ValueError``
    from ``Path.relative_to``.
    """
    display_path = path
    if path.is_absolute():
        try:
            display_path = path.relative_to(Path.cwd())
        except ValueError:
            # Output lives outside the working directory: keep it absolute.
            display_path = path
    return {
        "path": str(display_path),
        "rows": rows,
        "bytes": path.stat().st_size,
        "sha256": sha256_file(path),
        "status": "valid",
    }
|
||||
|
||||
|
||||
def build_input_file_summary(manifest: dict[str, Any]) -> list[dict[str, Any]]:
    """Check the raw files listed in the collector manifest and summarize them.

    Args:
        manifest: Parsed collector manifest; its "output_files" list names the
            raw files with their recorded row counts and SHA-256 digests.

    Returns:
        One summary per listed file holding the manifest path text, expected
        row count, current size, current digest, recorded digest, whether the
        two digests agree, and a "valid"/"invalid" status derived from that.

    Raises:
        ValueError: If "output_files" is missing or empty, an entry is not an
            object, or an entry has no usable "path".
        FileNotFoundError: If a listed file does not exist.
    """
    entries = manifest.get("output_files")
    if not (isinstance(entries, list) and entries):
        raise ValueError("input manifest has no output_files")

    results: list[dict[str, Any]] = []
    for entry in entries:
        if not isinstance(entry, dict):
            raise ValueError("input manifest output_files entry is not an object")
        declared_path = entry.get("path")
        if not (isinstance(declared_path, str) and declared_path):
            raise ValueError("input manifest output_files entry lacks path")
        resolved = resolve_repo_path(declared_path)
        if not resolved.exists():
            raise FileNotFoundError(resolved)
        current_digest = sha256_file(resolved)
        recorded_digest = entry.get("sha256")
        digests_agree = recorded_digest == current_digest
        results.append(
            {
                "path": declared_path,
                "rows_expected": entry.get("rows"),
                "bytes": resolved.stat().st_size,
                "sha256": current_digest,
                "input_manifest_sha256": recorded_digest,
                "checksum_match": digests_agree,
                "status": "valid" if digests_agree else "invalid",
            }
        )
    return results
|
||||
|
||||
|
||||
def read_and_normalize(
    input_files: list[dict[str, Any]],
    output_path: Path,
) -> tuple[int, int, list[dict[str, Any]], dict[str, Any]]:
    """Normalize every raw row of every input file into one gzip JSONL output.

    Args:
        input_files: File summaries whose "path" names a raw gzip JSONL file.
        output_path: Destination gzip JSONL file; parent directories are
            created as needed.

    Returns:
        ``(raw_rows_read, normalized_rows_written, errors, sanity)``.
        ``errors`` holds one entry per failed row (with its raw file and line
        number) and one entry per file that is missing or whose gzip stream
        could not be read. ``sanity`` holds the boolean checks collected while
        processing, the sorted outcomes seen, and ``row_count_match``.

    A raw file whose gzip stream is corrupt, truncated or not valid UTF-8 no
    longer aborts the run. The failure is recorded in ``errors``,
    ``sanity["gzip_jsonl_parseable"]`` is set to False, and processing
    continues with the next file. Rows read before the failure are kept.
    """
    import zlib  # Local import: only needed to recognise corrupt deflate data.

    raw_rows_read = 0
    normalized_rows_written = 0
    errors: list[dict[str, Any]] = []
    sanity = {
        "raw_file_refs_present": True,
        "raw_files_exist": True,
        "spread_non_negative": True,
        "midpoint_between_bid_ask": True,
        "depth_totals_non_negative": True,
        "outcomes_seen": [],
        "gzip_jsonl_parseable": True,
        "row_count_match": None,
    }
    outcomes_seen: set[str] = set()
    # Every depth column of a normalized row; hoisted out of the row loop.
    depth_fields = (
        "bid_depth_total",
        "ask_depth_total",
        "bid_depth_within_1c",
        "ask_depth_within_1c",
        "bid_depth_within_2c",
        "ask_depth_within_2c",
        "bid_depth_within_5c",
        "ask_depth_within_5c",
    )

    output_path.parent.mkdir(parents=True, exist_ok=True)
    with gzip.open(output_path, "wt", encoding="utf-8", compresslevel=9) as output:
        for file_entry in input_files:
            raw_file = file_entry["path"]
            raw_path = resolve_repo_path(raw_file)
            if not raw_path.exists():
                sanity["raw_files_exist"] = False
                errors.append({"raw_file": raw_file, "error": "raw file missing"})
                continue

            raw_line_number = 0
            try:
                with gzip.open(raw_path, "rt", encoding="utf-8") as raw_handle:
                    for raw_line_number, line in enumerate(raw_handle, 1):
                        raw_rows_read += 1
                        try:
                            raw_row = json.loads(line)
                            normalized = normalize_raw_row(raw_row, raw_file, raw_line_number)
                            output.write(json.dumps(normalized, sort_keys=True, separators=(",", ":")) + "\n")
                            normalized_rows_written += 1

                            if not normalized.get("raw_file") or not normalized.get("raw_line_number"):
                                sanity["raw_file_refs_present"] = False
                            if not resolve_repo_path(str(normalized["raw_file"])).exists():
                                sanity["raw_files_exist"] = False
                            outcome = normalized.get("outcome")
                            if isinstance(outcome, str):
                                outcomes_seen.add(outcome)

                            best_bid = Decimal(normalized["best_bid"]) if normalized["best_bid"] is not None else None
                            best_ask = Decimal(normalized["best_ask"]) if normalized["best_ask"] is not None else None
                            spread = Decimal(normalized["spread"]) if normalized["spread"] is not None else None
                            midpoint = Decimal(normalized["midpoint"]) if normalized["midpoint"] is not None else None
                            if best_bid is not None and best_ask is not None:
                                if spread is None or spread < 0:
                                    sanity["spread_non_negative"] = False
                                if midpoint is None or midpoint < best_bid or midpoint > best_ask:
                                    sanity["midpoint_between_bid_ask"] = False
                            for field in depth_fields:
                                if Decimal(normalized[field]) < 0:
                                    sanity["depth_totals_non_negative"] = False
                        except Exception as exc:  # noqa: BLE001 - preserve row-level failure evidence.
                            errors.append(
                                {
                                    "raw_file": raw_file,
                                    "raw_line_number": raw_line_number,
                                    "error": str(exc),
                                }
                            )
            except (OSError, EOFError, UnicodeDecodeError, zlib.error) as exc:
                # Reading the stream itself failed (bad gzip header, truncated
                # or corrupt data, undecodable bytes). Record it as evidence
                # rather than crashing before the manifest can be written.
                sanity["gzip_jsonl_parseable"] = False
                errors.append(
                    {
                        "raw_file": raw_file,
                        "error": f"raw gzip stream unreadable after line {raw_line_number}: {exc}",
                    }
                )

    sanity["outcomes_seen"] = sorted(outcomes_seen)
    sanity["has_up_and_down"] = {"Up", "Down"}.issubset(outcomes_seen)
    # File-level entries in ``errors`` (missing or unreadable file) also count
    # against this check, so it is False whenever an input could not be read.
    sanity["row_count_match"] = raw_rows_read == normalized_rows_written + len(errors)
    return raw_rows_read, normalized_rows_written, errors, sanity
|
||||
|
||||
|
||||
def validate_output_gzip_jsonl(path: Path) -> tuple[bool, int, list[str]]:
    """Re-read a gzip JSONL file and confirm every line parses as JSON.

    Returns:
        ``(ok, rows_parsed, errors)`` where ``rows_parsed`` counts the lines
        parsed before any failure and ``errors`` holds the text of the first
        exception encountered (reading stops there).
    """
    problems: list[str] = []
    rows_parsed = 0
    try:
        with gzip.open(path, "rt", encoding="utf-8") as stream:
            for text in stream:
                json.loads(text)
                rows_parsed += 1
    except Exception as exc:  # noqa: BLE001 - validation result belongs in manifest.
        problems.append(str(exc))
    return len(problems) == 0, rows_parsed, problems
|
||||
|
||||
|
||||
def scan_for_secret_terms(paths: list[Path]) -> dict[str, Any]:
    """Search files for the sensitive terms listed in ``SECRET_PATTERNS``.

    Args:
        paths: Files to scan. Missing files are skipped; files ending in
            ".gz" are read through gzip, all others as plain text. Bytes that
            are not valid UTF-8 are replaced rather than raising.

    Returns:
        A dict with "passed" (True when nothing matched), "checked_term_count"
        and "matches". Each match records the file, the 1-based line number
        and the 1-based index of the first term found on that line; the term
        text itself is never written out.

    Matches in absolute paths outside the working directory are reported with
    the absolute path instead of raising ``ValueError`` from
    ``Path.relative_to``.
    """
    matches: list[dict[str, Any]] = []
    lowered_patterns = tuple(pattern.lower() for pattern in SECRET_PATTERNS)
    working_dir = Path.cwd()
    for path in paths:
        if not path.exists():
            continue
        display_path = path
        if path.is_absolute():
            try:
                display_path = path.relative_to(working_dir)
            except ValueError:
                # File lives outside the working directory: keep it absolute.
                display_path = path
        opener = gzip.open if path.suffix == ".gz" else open
        with opener(path, "rt", encoding="utf-8", errors="replace") as handle:  # type: ignore[arg-type]
            for line_number, line in enumerate(handle, 1):
                lower = line.lower()
                for pattern_index, pattern in enumerate(lowered_patterns, 1):
                    if pattern in lower:
                        matches.append(
                            {
                                "path": str(display_path),
                                "line_number": line_number,
                                "term_index": pattern_index,
                            }
                        )
                        # Report at most one match per line.
                        break
    return {
        "passed": not matches,
        "checked_term_count": len(SECRET_PATTERNS),
        "matches": matches,
    }
|
||||
|
||||
|
||||
def parse_args(argv: list[str]) -> argparse.Namespace:
    """Parse the command-line options of the normalizer.

    Args:
        argv: Argument list without the program name.

    Returns:
        Namespace with ``input_manifest``, ``output_dir`` and
        ``manifest_path``, each a ``Path`` that falls back to its module-level
        default.
    """
    parser = argparse.ArgumentParser(
        description="Normalize Checkpoint 4 raw Polymarket order-book snapshots.",
    )
    # (flag, default, help text before the "Default:" suffix)
    path_options = (
        ("--input-manifest", DEFAULT_INPUT_MANIFEST, "Raw collector manifest path"),
        ("--output-dir", DEFAULT_OUTPUT_DIR, "Normalized sample base directory"),
        ("--manifest-path", DEFAULT_MANIFEST_PATH, "Normalization manifest path"),
    )
    for flag, default, summary in path_options:
        parser.add_argument(
            flag,
            type=Path,
            default=default,
            help=f"{summary}. Default: {default}",
        )
    return parser.parse_args(argv)
|
||||
|
||||
|
||||
def main(argv: list[str]) -> int:
    """Run one normalization pass and write its manifest.

    Loads the collector manifest, normalizes every listed raw file into a new
    timestamped gzip JSONL output, re-validates that output, scans this script
    and the output for sensitive terms, then writes the normalization manifest
    and prints a short JSON summary.

    Args:
        argv: Command-line arguments without the program name.

    Returns:
        0 when the gate status is "PASS", otherwise 1.
    """
    args = parse_args(argv)
    started = utc_now()
    input_manifest = load_json(args.input_manifest)
    input_files = build_input_file_summary(input_manifest)

    # The start time doubles as the run id that names the output directory and file.
    run_id = compact_timestamp(started)
    output_path = (
        args.output_dir
        / "polymarket"
        / "orderbooks"
        / run_id
        / f"polymarket_orderbooks_normalized_{run_id}.jsonl.gz"
    )

    raw_rows_read, normalized_rows_written, row_errors, sanity = read_and_normalize(input_files, output_path)
    gzip_ok, gzip_rows, gzip_errors = validate_output_gzip_jsonl(output_path)
    output_summary = summarize_output(output_path, normalized_rows_written)

    # Add the checks that need the finished output file to the per-row checks.
    sanity.update(
        {
            "output_row_count_equals_raw_input_row_count": normalized_rows_written == raw_rows_read
            if not row_errors
            else False,
            "gzip_jsonl_decompresses_and_parses": gzip_ok,
            "gzip_jsonl_rows_parsed": gzip_rows,
            "gzip_jsonl_errors": gzip_errors,
            # Hashes the output a second time and compares with the summary digest.
            "manifest_checksum_matches_output": output_summary["sha256"] == sha256_file(output_path),
            "all_input_file_checksums_match": all(file_entry["checksum_match"] for file_entry in input_files),
        }
    )

    # Both this script and the freshly written output are scanned.
    secret_scan = scan_for_secret_terms([Path(__file__), output_path])
    sanity["checkpoint5_secret_scan_passed"] = secret_scan["passed"]

    # Every entry must be truthy, and at least one row must be written, for PASS.
    gate_checks = [
        normalized_rows_written == raw_rows_read,
        not row_errors,
        sanity["raw_file_refs_present"],
        sanity["raw_files_exist"],
        sanity["spread_non_negative"],
        sanity["midpoint_between_bid_ask"],
        sanity["depth_totals_non_negative"],
        sanity["has_up_and_down"],
        gzip_ok,
        sanity["manifest_checksum_matches_output"],
        secret_scan["passed"],
        all(file_entry["checksum_match"] for file_entry in input_files),
    ]
    gate_status = "PASS" if all(gate_checks) and normalized_rows_written > 0 else "FAIL"
    ended = utc_now()

    manifest = {
        "schema_name": "orderbook_normalization_sample_manifest",
        "schema_version": 1,
        "checkpoint_id": 5,
        "checkpoint_name": "Normalized Snapshot Extract",
        "normalizer": {
            "name": NORMALIZER_NAME,
            "version": NORMALIZER_VERSION,
        },
        "started_at_utc": iso_z(started),
        "ended_at_utc": iso_z(ended),
        "run_duration_seconds": round((ended - started).total_seconds(), 3),
        "command": "scripts/normalize_polymarket_orderbooks.py",
        "input_manifest": {
            "path": str(args.input_manifest),
            "sha256": sha256_file(args.input_manifest),
            "collector_manifest_schema_name": input_manifest.get("schema_name"),
            "collector_gate_status": input_manifest.get("gate_status"),
        },
        "input_files": input_files,
        "output_files": [output_summary],
        "raw_rows_read": raw_rows_read,
        "normalized_rows_written": normalized_rows_written,
        "skipped_rows": len(row_errors),
        "error_rows": row_errors,
        "numeric_encoding": "Exact decimal values are emitted as JSON strings; missing price-derived values are null.",
        "sanity_checks": sanity,
        "secret_scan": secret_scan,
        "warnings": [],
        "known_gaps": [
            "This is a derived sample extract only; raw gzip JSONL remains the source of truth.",
            "No upload, daemon runtime, systemd unit, dashboard, database, strategy, backtest, or trading behavior is included.",
            "The sample proves normalization logic on one bounded raw run, not long-run schema stability.",
        ],
        "fake_progress_risk": "A clean normalized sample can hide raw collection gaps and endpoint schema drift; every row is therefore traceable to raw_file and raw_line_number, and reliability remains gated on later soak testing.",
        "next_step": "Checkpoint 6 should package the raw collector for a VPS runtime, or the orchestrator can request review of this normalized sample first.",
        "gate_status": gate_status,
    }

    # The manifest is written whether the gate passed or failed.
    args.manifest_path.parent.mkdir(parents=True, exist_ok=True)
    args.manifest_path.write_text(json.dumps(manifest, indent=2, sort_keys=True) + "\n", encoding="utf-8")

    # Short machine-readable summary on stdout.
    print(
        json.dumps(
            {
                "gate_status": gate_status,
                "manifest_path": str(args.manifest_path),
                "output_path": str(output_path),
                "raw_rows_read": raw_rows_read,
                "normalized_rows_written": normalized_rows_written,
                "skipped_rows": len(row_errors),
                "sha256": output_summary["sha256"],
            },
            indent=2,
            sort_keys=True,
        )
    )
    return 0 if gate_status == "PASS" else 1
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
raise SystemExit(main(sys.argv[1:]))
|
||||
1369
scripts/probe_polymarket_public_sources.py
Executable file
1369
scripts/probe_polymarket_public_sources.py
Executable file
File diff suppressed because it is too large
Load diff
362
scripts/run_polymarket_24h_soak.sh
Executable file
362
scripts/run_polymarket_24h_soak.sh
Executable file
|
|
@ -0,0 +1,362 @@
|
|||
#!/usr/bin/env bash
# Controller setup for the Polymarket soak test: configuration, derived
# paths, runtime state, working directories and start/end timestamps.
#
# -u: expanding an unset variable is an error; pipefail: a pipeline fails if
# any stage fails. -e is not enabled, so a failing command does not abort the
# script on its own.
set -uo pipefail

# Tool locations and upload destination, each overridable from the environment.
APP_DIR="${ORDERBOOKS_APP_DIR:-$(pwd)}"
PYTHON_BIN="${ORDERBOOKS_PYTHON:-python3}"
RCLONE_BIN="${ORDERBOOKS_RCLONE_BIN:-/usr/bin/rclone}"
RCLONE_DEST_BASE="${ORDERBOOKS_RCLONE_DEST:-gdrive:orderbooks/polymarket/soak-test}"

# Soak parameters with their defaults (86400 seconds = 24 hours).
SOAK_DATE="${ORDERBOOKS_SOAK_DATE:-$(date -u +%F)}"
SOAK_ID="${ORDERBOOKS_SOAK_ID:-soak_test_${SOAK_DATE}}"
SOAK_SECONDS="${ORDERBOOKS_SOAK_SECONDS:-86400}"
CYCLE_SECONDS="${ORDERBOOKS_SOAK_CYCLE_SECONDS:-300}"
INTERVAL_SECONDS="${ORDERBOOKS_SOAK_INTERVAL_SECONDS:-30}"
MARKET_LIMIT="${ORDERBOOKS_SOAK_MARKET_LIMIT:-2}"
MARKET_END_SAFETY_SECONDS="${ORDERBOOKS_SOAK_MARKET_END_SAFETY_SECONDS:-420}"
REQUEST_TIMEOUT_SECONDS="${ORDERBOOKS_SOAK_REQUEST_TIMEOUT_SECONDS:-15}"
MAX_RETRIES="${ORDERBOOKS_SOAK_MAX_RETRIES:-2}"
BACKOFF_SECONDS="${ORDERBOOKS_SOAK_BACKOFF_SECONDS:-2}"
DISCOVERY_LIMIT="${ORDERBOOKS_SOAK_DISCOVERY_LIMIT:-100}"
DISCOVERY_MAX_PAGES="${ORDERBOOKS_SOAK_DISCOVERY_MAX_PAGES:-3}"
DISCOVERY_TIMEOUT="${ORDERBOOKS_SOAK_DISCOVERY_TIMEOUT:-15}"

# Local data and manifest locations; the defaults are relative to APP_DIR
# because the script changes into it below.
LOCAL_ROOT="${ORDERBOOKS_SOAK_LOCAL_ROOT:-data/soak_test/${SOAK_DATE}}"
MANIFEST_ROOT="${ORDERBOOKS_SOAK_MANIFEST_ROOT:-data/manifests/${SOAK_ID}}"
START_MANIFEST="${ORDERBOOKS_SOAK_START_MANIFEST:-data/manifests/${SOAK_ID}_start.json}"
FINAL_MANIFEST="${ORDERBOOKS_SOAK_FINAL_MANIFEST:-data/manifests/${SOAK_ID}_final.json}"

# Paths derived from the roots above.
DISCOVERY_DIR="${LOCAL_ROOT}/discovery"
LIVE_DIR="${LOCAL_ROOT}/live_sample"
LOG_DIR="${LOCAL_ROOT}/logs"
PID_FILE="${LOCAL_ROOT}/soak.pid"
CYCLES_JSONL="${MANIFEST_ROOT}/cycles.jsonl"
LOG_FILE="${LOG_DIR}/soak.log"
# Strip one trailing slash from the base before appending the date.
REMOTE_DEST="${RCLONE_DEST_BASE%/}/${SOAK_DATE}"

# Mutable controller state read by handle_signal and the manifest writers.
STOP_REQUESTED=0
STOP_SIGNAL=""
CURRENT_CHILD_PID=""
CURRENT_PHASE="initializing"
CURRENT_CYCLE_ID=""
START_WRITTEN=0
FINAL_WRITTEN=0

# Exit with status 2 if the application directory cannot be entered.
cd "${APP_DIR}" || exit 2
mkdir -p "${DISCOVERY_DIR}" "${LIVE_DIR}" "${LOG_DIR}" "${MANIFEST_ROOT}" "$(dirname "${START_MANIFEST}")" "$(dirname "${FINAL_MANIFEST}")"

STARTED_AT="$(date -u +%Y-%m-%dT%H:%M:%SZ)"
START_EPOCH="$(date -u +%s)"
END_EPOCH="$((START_EPOCH + SOAK_SECONDS))"
# Format the planned end time with `date -d @epoch`; if that invocation fails,
# fall back to an inline Python snippet that prints the same format.
EXPECTED_COMPLETION_AT="$(date -u -d "@${END_EPOCH}" +%Y-%m-%dT%H:%M:%SZ 2>/dev/null || python3 - <<PY
import datetime as dt
print(dt.datetime.fromtimestamp(${END_EPOCH}, dt.UTC).replace(microsecond=0).isoformat().replace("+00:00", "Z"))
PY
)"
|
||||
|
||||
safe_log() {
  # Best-effort logger: append "<UTC timestamp> <message>" to LOG_FILE only.
  # Write failures are ignored and the function always returns 0; it never
  # writes to stdout.
  local stamp
  stamp="$(date -u +%Y-%m-%dT%H:%M:%SZ)"
  printf '%s %s\n' "${stamp}" "$*" >> "${LOG_FILE}" 2>/dev/null || true
}
|
||||
|
||||
log() {
  # Emit "<UTC timestamp> <message>" on stdout and append the same line to
  # LOG_FILE. The return status is that of the printf | tee pipeline.
  local stamp
  stamp="$(date -u +%Y-%m-%dT%H:%M:%SZ)"
  printf '%s %s\n' "${stamp}" "$*" | tee -a "${LOG_FILE}"
}
|
||||
|
||||
handle_signal() {
  # Record a stop request and forward the signal to the running child.
  #
  # $1 - signal name: SIGINT, SIGTERM or SIGHUP. Any other name is recorded
  #      but not forwarded.
  #
  # Sets STOP_REQUESTED and STOP_SIGNAL, logs through safe_log, and sends the
  # matching signal to CURRENT_CHILD_PID when that process is still alive.
  # Always returns 0.
  local signal_name="$1"
  local kill_flag=""

  STOP_REQUESTED=1
  STOP_SIGNAL="${signal_name}"
  safe_log "SIGNAL received=${signal_name} phase=${CURRENT_PHASE} cycle_id=${CURRENT_CYCLE_ID:-none}"

  # Nothing to forward without a live child process.
  [[ -n "${CURRENT_CHILD_PID}" ]] || return 0
  kill -0 "${CURRENT_CHILD_PID}" 2>/dev/null || return 0

  case "${signal_name}" in
    SIGINT) kill_flag="-INT" ;;
    SIGTERM) kill_flag="-TERM" ;;
    SIGHUP) kill_flag="-HUP" ;;
    *) return 0 ;;
  esac
  kill "${kill_flag}" "${CURRENT_CHILD_PID}" 2>/dev/null || true
}
|
||||
|
||||
write_start_manifest() {
  # Write the soak start marker (START_MANIFEST) atomically.
  #
  # The JSON is written to "<START_MANIFEST>.tmp" and then moved into place
  # with os.replace. Shell values reach Python through environment variables
  # and argv, and the heredoc delimiter is quoted, so a quote or backslash in
  # a path can no longer break or alter the Python source.
  #
  # Sets START_WRITTEN=1 only when the manifest was actually written.
  # Returns 0 on success; logs through safe_log and returns 1 on failure.
  local tmp_path="${START_MANIFEST}.tmp"
  if SOAK_MF_STARTED_AT="${STARTED_AT}" \
    SOAK_MF_EXPECTED_COMPLETION_AT="${EXPECTED_COMPLETION_AT}" \
    SOAK_MF_SOAK_SECONDS="${SOAK_SECONDS}" \
    SOAK_MF_CYCLE_SECONDS="${CYCLE_SECONDS}" \
    SOAK_MF_PID="$$" \
    SOAK_MF_PID_FILE="${PID_FILE}" \
    SOAK_MF_LOG_FILE="${LOG_FILE}" \
    SOAK_MF_LOCAL_ROOT="${LOCAL_ROOT}" \
    SOAK_MF_MANIFEST_ROOT="${MANIFEST_ROOT}" \
    SOAK_MF_REMOTE_DEST="${REMOTE_DEST}" \
    SOAK_MF_LIVE_DIR="${LIVE_DIR}" \
    SOAK_MF_DISCOVERY_DIR="${DISCOVERY_DIR}" \
    SOAK_MF_CYCLES_JSONL="${CYCLES_JSONL}" \
    python3 - "${tmp_path}" "${START_MANIFEST}" <<'PY'
import json
import os
import pathlib
import sys

env = os.environ
tmp_path = pathlib.Path(sys.argv[1])
final_path = pathlib.Path(sys.argv[2])
manifest = {
    "schema_name": "soak_test_start_manifest",
    "schema_version": 1,
    "checkpoint_id": 8,
    "checkpoint_name": "24h Soak Test Plan",
    "status": "STARTED",
    "started_at_utc": env["SOAK_MF_STARTED_AT"],
    "expected_completion_at_utc": env["SOAK_MF_EXPECTED_COMPLETION_AT"],
    "soak_seconds": int(env["SOAK_MF_SOAK_SECONDS"]),
    "cycle_seconds": int(env["SOAK_MF_CYCLE_SECONDS"]),
    "pid": int(env["SOAK_MF_PID"]),
    "pid_file": env["SOAK_MF_PID_FILE"],
    "log_file": env["SOAK_MF_LOG_FILE"],
    "local_root": env["SOAK_MF_LOCAL_ROOT"],
    "manifest_root": env["SOAK_MF_MANIFEST_ROOT"],
    "remote_dest": env["SOAK_MF_REMOTE_DEST"],
    "raw_output_dir": env["SOAK_MF_LIVE_DIR"],
    "discovery_dir": env["SOAK_MF_DISCOVERY_DIR"],
    "cycles_jsonl": env["SOAK_MF_CYCLES_JSONL"],
    "gate_status": "IN_PROGRESS",
    "production_ready": False,
    "notes": [
        "This is a real 24h soak start marker, not a completion report.",
        "Checkpoint 8 cannot pass until 24 real hours elapse and final metrics are validated.",
    ],
}
tmp_path.parent.mkdir(parents=True, exist_ok=True)
tmp_path.write_text(json.dumps(manifest, indent=2, sort_keys=True) + "\n", encoding="utf-8")
os.replace(tmp_path, final_path)
PY
  then
    START_WRITTEN=1
  else
    safe_log "ERROR failed to write start manifest path=${START_MANIFEST}"
    return 1
  fi
}
|
||||
|
||||
write_cycle_record() {
  # Append one cycle record as a single line to CYCLES_JSONL.
  #
  # $1 - the record text; written verbatim followed by a newline.
  local -r cycle_line="${1}"
  printf '%s\n' "${cycle_line}" >>"${CYCLES_JSONL}"
}
|
||||
|
||||
# Write the final soak manifest atomically (temp file, then rename).
#
# Arguments:
#   $1  value stored under "status"
#   $2  value stored under "gate_status"
#   $3  value stored under "exit_reason"
#
# Reads the controller globals STARTED_AT, EXPECTED_COMPLETION_AT,
# SOAK_SECONDS, CYCLE_SECONDS, CYCLES_JSONL, PID_FILE, LOG_FILE, LOCAL_ROOT,
# MANIFEST_ROOT, REMOTE_DEST, STOP_REQUESTED, STOP_SIGNAL, CURRENT_PHASE,
# CURRENT_CYCLE_ID and FINAL_MANIFEST.
#
# Sets FINAL_WRITTEN=1 only after python3 succeeded; otherwise returns
# python3's exit status and leaves FINAL_WRITTEN untouched.
write_final_manifest() {
  local final_status="$1"
  local gate_status="$2"
  local exit_reason="$3"
  local ended_at
  local tmp_path
  ended_at="$(date -u +%Y-%m-%dT%H:%M:%SZ)"
  tmp_path="${FINAL_MANIFEST}.tmp"
  # Values are handed to Python through the environment and the heredoc is
  # quoted, so the shell never splices text into Python source. Quotes or
  # backslashes in paths/reasons therefore cannot break or alter the program.
  SOAK_MF_STATUS="${final_status}" \
  SOAK_MF_GATE_STATUS="${gate_status}" \
  SOAK_MF_EXIT_REASON="${exit_reason}" \
  SOAK_MF_STARTED_AT="${STARTED_AT}" \
  SOAK_MF_ENDED_AT="${ended_at}" \
  SOAK_MF_EXPECTED_COMPLETION_AT="${EXPECTED_COMPLETION_AT}" \
  SOAK_MF_SOAK_SECONDS="${SOAK_SECONDS}" \
  SOAK_MF_CYCLE_SECONDS="${CYCLE_SECONDS}" \
  SOAK_MF_CYCLES_JSONL="${CYCLES_JSONL}" \
  SOAK_MF_PID="$$" \
  SOAK_MF_PID_FILE="${PID_FILE}" \
  SOAK_MF_LOG_FILE="${LOG_FILE}" \
  SOAK_MF_LOCAL_ROOT="${LOCAL_ROOT}" \
  SOAK_MF_MANIFEST_ROOT="${MANIFEST_ROOT}" \
  SOAK_MF_REMOTE_DEST="${REMOTE_DEST}" \
  SOAK_MF_STOP_REQUESTED="${STOP_REQUESTED}" \
  SOAK_MF_STOP_SIGNAL="${STOP_SIGNAL}" \
  SOAK_MF_CURRENT_PHASE="${CURRENT_PHASE}" \
  SOAK_MF_CURRENT_CYCLE_ID="${CURRENT_CYCLE_ID}" \
  python3 - "$tmp_path" "$FINAL_MANIFEST" <<'PY' || return $?
import json
import os
import pathlib
import sys

env = os.environ
tmp_path = pathlib.Path(sys.argv[1])
final_path = pathlib.Path(sys.argv[2])
cycles_path = pathlib.Path(env["SOAK_MF_CYCLES_JSONL"])

# One JSON object per non-blank line; a missing file means no cycles ran.
cycles = []
if cycles_path.exists():
    cycles = [
        json.loads(line)
        for line in cycles_path.read_text(encoding="utf-8").splitlines()
        if line.strip()
    ]


def count_status(wanted):
    return sum(1 for cycle in cycles if cycle.get("status") == wanted)


manifest = {
    "schema_name": "soak_test_final_manifest",
    "schema_version": 1,
    "checkpoint_id": 8,
    "checkpoint_name": "24h Soak Test Plan",
    "status": env["SOAK_MF_STATUS"],
    "gate_status": env["SOAK_MF_GATE_STATUS"],
    "exit_reason": env["SOAK_MF_EXIT_REASON"],
    "started_at_utc": env["SOAK_MF_STARTED_AT"],
    "ended_at_utc": env["SOAK_MF_ENDED_AT"],
    "expected_completion_at_utc": env["SOAK_MF_EXPECTED_COMPLETION_AT"],
    "soak_seconds": int(env["SOAK_MF_SOAK_SECONDS"]),
    "cycle_seconds": int(env["SOAK_MF_CYCLE_SECONDS"]),
    "cycles": cycles,
    "cycle_count": len(cycles),
    "ok_cycle_count": count_status("OK"),
    "error_cycle_count": count_status("ERROR"),
    "interrupted_cycle_count": count_status("INTERRUPTED"),
    "pid": int(env["SOAK_MF_PID"]),
    "pid_file": env["SOAK_MF_PID_FILE"],
    "log_file": env["SOAK_MF_LOG_FILE"],
    "local_root": env["SOAK_MF_LOCAL_ROOT"],
    "manifest_root": env["SOAK_MF_MANIFEST_ROOT"],
    "remote_dest": env["SOAK_MF_REMOTE_DEST"],
    "stop_requested": bool(int(env["SOAK_MF_STOP_REQUESTED"])),
    "stop_signal": env["SOAK_MF_STOP_SIGNAL"],
    "current_phase_at_exit": env["SOAK_MF_CURRENT_PHASE"],
    "current_cycle_id_at_exit": env["SOAK_MF_CURRENT_CYCLE_ID"],
    "production_ready": False,
    "notes": [
        "This marker is written by the soak controller on completion, interruption, or error.",
        "Checkpoint 8 cannot be PASS until 24 real hours elapse and final metrics are validated.",
    ],
}
tmp_path.parent.mkdir(parents=True, exist_ok=True)
tmp_path.write_text(json.dumps(manifest, indent=2, sort_keys=True) + "\n", encoding="utf-8")
# Rename last so readers never observe a half-written manifest.
os.replace(tmp_path, final_path)
PY
  FINAL_WRITTEN=1
}
|
||||
|
||||
# EXIT trap handler.
# If a start manifest was written but no final manifest yet, record why the
# controller is leaving (stop signal, non-zero exit code, or a silent exit).
# Then remove the PID file when it still holds this process id, and exit
# with the status the script was already exiting with.
cleanup_on_exit() {
  # Must be the first statement so $? is still the script's exit status.
  local exit_status=$?
  if [[ "${START_WRITTEN}" -eq 1 && "${FINAL_WRITTEN}" -eq 0 ]]; then
    local final_state="ERROR"
    local why="exited_without_final_marker"
    if [[ "${STOP_REQUESTED}" -eq 1 ]]; then
      final_state="INTERRUPTED"
      why="${STOP_SIGNAL:-signal}"
    elif [[ "${exit_status}" -ne 0 ]]; then
      why="exit_code_${exit_status}"
    fi
    write_final_manifest "${final_state}" "${final_state}" "${why}"
  fi
  if [[ -f "${PID_FILE}" ]]; then
    # Only delete a PID file that belongs to this process.
    if [[ "$(cat "${PID_FILE}" 2>/dev/null)" == "$$" ]]; then
      rm -f "${PID_FILE}"
    fi
  fi
  exit "${exit_status}"
}
|
||||
|
||||
# Run "$@" in the background with stdout and stderr appended to LOG_FILE,
# publish its PID in CURRENT_CHILD_PID while it runs, and return its exit
# status. CURRENT_CHILD_PID is cleared again before returning.
run_logged() {
  "$@" >>"${LOG_FILE}" 2>&1 &
  CURRENT_CHILD_PID="$!"
  wait "${CURRENT_CHILD_PID}"
  local child_status=$?
  # If a stop was requested and the child is still alive, the first wait
  # returned early; wait once more to collect the child's own exit status.
  if [[ "${STOP_REQUESTED}" -eq 1 ]]; then
    if kill -0 "${CURRENT_CHILD_PID}" 2>/dev/null; then
      wait "${CURRENT_CHILD_PID}"
      child_status=$?
    fi
  fi
  CURRENT_CHILD_PID=""
  return "${child_status}"
}
|
||||
|
||||
# Soak controller main flow: install signal/exit traps, publish the PID and
# start manifest, then run discovery -> collector -> upload cycles until the
# soak window (END_EPOCH) is over or a stop is requested, and finally write
# the end-of-run manifest.
# handle_signal and log are defined outside this section of the script.
trap 'handle_signal SIGINT' INT
trap 'handle_signal SIGTERM' TERM
trap 'handle_signal SIGHUP' HUP
trap cleanup_on_exit EXIT

echo "$$" > "${PID_FILE}"
write_start_manifest
# Refuse to continue when the start marker is missing or empty.
test -s "${START_MANIFEST}" || exit 3

log "START soak_id=${SOAK_ID} pid=$$ expected_completion=${EXPECTED_COMPLETION_AT}"

cycle_index=0
error_seen=0
while true; do
  now_epoch="$(date -u +%s)"
  remaining="$((END_EPOCH - now_epoch))"
  if [[ "${remaining}" -le 0 ]]; then
    break
  fi
  if [[ "${STOP_REQUESTED}" -eq 1 ]]; then
    break
  fi
  # Do not start a cycle that would have less than 30 seconds to run.
  if [[ "${remaining}" -lt 30 ]]; then
    log "SKIP final tiny remaining window seconds=${remaining}"
    break
  fi

  cycle_index="$((cycle_index + 1))"
  cycle_id="$(date -u +%Y%m%dT%H%M%SZ)"
  CURRENT_CYCLE_ID="${cycle_id}"
  # The last cycle is shortened so it does not overrun the soak window.
  run_seconds="${CYCLE_SECONDS}"
  if [[ "${remaining}" -lt "${run_seconds}" ]]; then
    run_seconds="${remaining}"
  fi

  discovery_json="${DISCOVERY_DIR}/polymarket_btc_markets_${cycle_id}.json"
  discovery_manifest="${DISCOVERY_DIR}/polymarket_btc_markets_manifest_${cycle_id}.json"
  discovery_markdown="${DISCOVERY_DIR}/polymarket_btc_markets_${cycle_id}.md"
  collector_manifest="${MANIFEST_ROOT}/collector_${cycle_id}.json"
  upload_manifest="${MANIFEST_ROOT}/upload_${cycle_id}.json"
  cycle_started_at="$(date -u +%Y-%m-%dT%H:%M:%SZ)"

  log "CYCLE ${cycle_index} start id=${cycle_id} run_seconds=${run_seconds}"

  # Phase 1: market discovery.
  discovery_exit=0
  CURRENT_PHASE="discovery"
  run_logged "${PYTHON_BIN}" scripts/discover_polymarket_btc_markets.py \
    --output-json "${discovery_json}" \
    --manifest "${discovery_manifest}" \
    --markdown "${discovery_markdown}" \
    --limit "${DISCOVERY_LIMIT}" \
    --max-pages "${DISCOVERY_MAX_PAGES}" \
    --timeout "${DISCOVERY_TIMEOUT}" || discovery_exit=$?

  # Phase 2: collection. Exit code 98 marks "skipped because a stop was
  # requested", 99 marks "skipped because the previous phase failed".
  collector_exit=0
  if [[ "${STOP_REQUESTED}" -eq 1 ]]; then
    collector_exit=98
  elif [[ "${discovery_exit}" -eq 0 ]]; then
    CURRENT_PHASE="collector"
    run_logged "${PYTHON_BIN}" scripts/collect_polymarket_orderbooks.py \
      --config config/polymarket_collector.vps.example.yaml \
      --discovery-path "${discovery_json}" \
      --output-dir "${LIVE_DIR}" \
      --manifest-path "${collector_manifest}" \
      --market-limit "${MARKET_LIMIT}" \
      --interval-seconds "${INTERVAL_SECONDS}" \
      --duration-seconds "${run_seconds}" \
      --request-timeout-seconds "${REQUEST_TIMEOUT_SECONDS}" \
      --max-retries "${MAX_RETRIES}" \
      --backoff-seconds "${BACKOFF_SECONDS}" \
      --market-end-safety-seconds "${MARKET_END_SAFETY_SECONDS}" || collector_exit=$?
  else
    collector_exit=99
  fi

  # Phase 3: upload, with the same 98/99 skip markers.
  upload_exit=0
  if [[ "${STOP_REQUESTED}" -eq 1 ]]; then
    upload_exit=98
  elif [[ "${collector_exit}" -eq 0 ]]; then
    CURRENT_PHASE="upload"
    run_logged scripts/upload_archive_rclone.sh \
      --execute \
      --data-dir "${LOCAL_ROOT}" \
      --raw-dir "${LIVE_DIR}" \
      --source-manifest-dir "${MANIFEST_ROOT}" \
      --manifest-dir "${MANIFEST_ROOT}" \
      --manifest-path "${upload_manifest}" \
      --dest "${REMOTE_DEST}" \
      --min-age-seconds 0 \
      --rclone-bin "${RCLONE_BIN}" || upload_exit=$?
  else
    upload_exit=99
  fi

  cycle_ended_at="$(date -u +%Y-%m-%dT%H:%M:%SZ)"
  if [[ "${STOP_REQUESTED}" -eq 1 ]]; then
    cycle_status="INTERRUPTED"
  elif [[ "${discovery_exit}" -eq 0 && "${collector_exit}" -eq 0 && "${upload_exit}" -eq 0 ]]; then
    cycle_status="OK"
  else
    cycle_status="ERROR"
    error_seen=1
  fi

  # Serialize the cycle record. Values are passed as arguments to a quoted
  # heredoc so that paths or signal names containing quotes/backslashes can
  # never be interpreted as Python source.
  record="$(
    python3 - \
      "${cycle_index}" "${cycle_id}" "${cycle_started_at}" "${cycle_ended_at}" \
      "${run_seconds}" "${discovery_manifest}" "${collector_manifest}" \
      "${upload_manifest}" "${discovery_exit}" "${collector_exit}" \
      "${upload_exit}" "${cycle_status}" "${STOP_SIGNAL}" <<'PY'
import json
import sys

(
    cycle_index,
    cycle_id,
    started_at,
    ended_at,
    run_seconds,
    discovery_manifest,
    collector_manifest,
    upload_manifest,
    discovery_exit,
    collector_exit,
    upload_exit,
    status,
    stop_signal,
) = sys.argv[1:14]

print(json.dumps({
    "cycle_index": int(cycle_index),
    "cycle_id": cycle_id,
    "started_at_utc": started_at,
    "ended_at_utc": ended_at,
    "run_seconds": int(run_seconds),
    "discovery_manifest": discovery_manifest,
    "collector_manifest": collector_manifest,
    "upload_manifest": upload_manifest,
    "discovery_exit": int(discovery_exit),
    "collector_exit": int(collector_exit),
    "upload_exit": int(upload_exit),
    "status": status,
    "stop_signal": stop_signal,
}, sort_keys=True))
PY
  )"
  write_cycle_record "${record}"
  log "CYCLE ${cycle_index} end id=${cycle_id} status=${cycle_status} discovery_exit=${discovery_exit} collector_exit=${collector_exit} upload_exit=${upload_exit}"

  CURRENT_PHASE="sleep"
  CURRENT_CYCLE_ID=""
  if [[ "${STOP_REQUESTED}" -eq 1 ]]; then
    break
  fi
  # Pause between cycles as a background child so its PID is published in
  # CURRENT_CHILD_PID like the phase commands above.
  sleep 5 &
  CURRENT_CHILD_PID="$!"
  wait "${CURRENT_CHILD_PID}" || true
  CURRENT_CHILD_PID=""
done

CURRENT_PHASE="finalizing"
CURRENT_CYCLE_ID=""
if [[ "${STOP_REQUESTED}" -eq 1 ]]; then
  write_final_manifest "INTERRUPTED" "INTERRUPTED" "${STOP_SIGNAL:-signal}"
elif [[ "${error_seen}" -eq 1 ]]; then
  write_final_manifest "ERROR" "ERROR" "cycle_error"
else
  # A clean run still needs human review; it is never marked PASS here.
  write_final_manifest "COMPLETED_NEEDS_REVIEW" "NEEDS_REVIEW" "elapsed"
fi

log "END soak_id=${SOAK_ID} final_manifest=${FINAL_MANIFEST} status_written=1"
|
||||
39
scripts/run_polymarket_collector_cycle.sh
Executable file
39
scripts/run_polymarket_collector_cycle.sh
Executable file
|
|
@ -0,0 +1,39 @@
|
|||
#!/usr/bin/env bash
# Run one collector cycle: discover markets, then replace this shell with the
# order-book collector. Every path and limit can be overridden through the
# ORDERBOOKS_* environment variables named below.
set -euo pipefail

APP_DIR="${ORDERBOOKS_APP_DIR:-/opt/orderbooks}"
PYTHON_BIN="${ORDERBOOKS_PYTHON:-${APP_DIR}/.venv/bin/python}"
DATA_DIR="${ORDERBOOKS_DATA_DIR:-/var/lib/orderbooks}"
COLLECTOR_CONFIG="${ORDERBOOKS_COLLECTOR_CONFIG:-/etc/orderbooks/polymarket_collector.vps.yaml}"

# Working directories default to sub-directories of DATA_DIR.
DISCOVERY_DIR="${ORDERBOOKS_DISCOVERY_DIR:-${DATA_DIR}/discovery}"
OUTPUT_DIR="${ORDERBOOKS_OUTPUT_DIR:-${DATA_DIR}/raw_orderbooks}"
MANIFEST_DIR="${ORDERBOOKS_MANIFEST_DIR:-${DATA_DIR}/manifests}"

# Discovery outputs and limits.
DISCOVERY_JSON="${ORDERBOOKS_DISCOVERY_JSON:-${DISCOVERY_DIR}/polymarket_btc_markets_latest.json}"
DISCOVERY_MANIFEST="${ORDERBOOKS_DISCOVERY_MANIFEST:-${DISCOVERY_DIR}/polymarket_btc_markets_manifest.json}"
DISCOVERY_MARKDOWN="${ORDERBOOKS_DISCOVERY_MARKDOWN:-${DISCOVERY_DIR}/polymarket_btc_markets.md}"
DISCOVERY_LIMIT="${ORDERBOOKS_DISCOVERY_LIMIT:-100}"
DISCOVERY_MAX_PAGES="${ORDERBOOKS_DISCOVERY_MAX_PAGES:-3}"
DISCOVERY_TIMEOUT="${ORDERBOOKS_DISCOVERY_TIMEOUT:-15}"

# The collector manifest name carries a per-cycle UTC timestamp.
cycle_id="$(date -u +%Y%m%dT%H%M%SZ)"
COLLECTOR_MANIFEST="${ORDERBOOKS_COLLECTOR_MANIFEST:-${MANIFEST_DIR}/polymarket_orderbook_collector_${cycle_id}.json}"

discovery_cmd=(
  "${PYTHON_BIN}" scripts/discover_polymarket_btc_markets.py
  --output-json "${DISCOVERY_JSON}"
  --manifest "${DISCOVERY_MANIFEST}"
  --markdown "${DISCOVERY_MARKDOWN}"
  --limit "${DISCOVERY_LIMIT}"
  --max-pages "${DISCOVERY_MAX_PAGES}"
  --timeout "${DISCOVERY_TIMEOUT}"
)
collector_cmd=(
  "${PYTHON_BIN}" scripts/collect_polymarket_orderbooks.py
  --config "${COLLECTOR_CONFIG}"
  --discovery-path "${DISCOVERY_JSON}"
  --output-dir "${OUTPUT_DIR}"
  --manifest-path "${COLLECTOR_MANIFEST}"
)

mkdir -p "${DISCOVERY_DIR}" "${OUTPUT_DIR}" "${MANIFEST_DIR}"
# The script paths above are relative to the application checkout.
cd "${APP_DIR}"

# With errexit on, a failed discovery aborts the cycle before collection.
"${discovery_cmd[@]}"

# exec makes the collector take over this process (and its PID).
exec "${collector_cmd[@]}"
|
||||
90
scripts/run_polymarket_collector_loop.sh
Executable file
90
scripts/run_polymarket_collector_loop.sh
Executable file
|
|
@ -0,0 +1,90 @@
|
|||
#!/usr/bin/env bash
# Supervisor loop that runs collector cycles back to back until a stop signal
# is received. errexit (-e) is not enabled in this script.
set -uo pipefail

# Application checkout the loop changes into before running cycles.
APP_DIR="${ORDERBOOKS_APP_DIR:-/app}"
# Directory for loop event files; defaults to <data dir>/manifests.
MANIFEST_DIR="${ORDERBOOKS_MANIFEST_DIR:-${ORDERBOOKS_DATA_DIR:-/var/lib/orderbooks}/manifests}"
# Pause between cycles, in seconds.
LOOP_SLEEP_SECONDS="${ORDERBOOKS_LOOP_SLEEP_SECONDS:-15}"
# Set to 1 by request_stop when INT/TERM arrives.
STOP_REQUESTED=0
# PID of the currently running cycle; empty when no cycle is running.
CHILD_PID=""
|
||||
|
||||
# Print the current UTC time in compact form, e.g. 20240131T235959Z.
utc_compact() {
  local compact_format='+%Y%m%dT%H%M%SZ'
  date -u "${compact_format}"
}
|
||||
|
||||
# Print the current UTC time in ISO-8601 form, e.g. 2024-01-31T23:59:59Z.
utc_iso() {
  local iso_format='+%Y-%m-%dT%H:%M:%SZ'
  date -u "${iso_format}"
}
|
||||
|
||||
# Write a collector_loop_event JSON file into MANIFEST_DIR.
#
# Arguments:
#   $1  status string stored under "status"
#   $2  exit code (must parse as an integer) stored under "exit_code"
#   $3  free-text message stored under "message"
#
# The file is named collector_loop_<UTC compact timestamp>.json. It is
# written to a temporary sibling and renamed into place so that another
# process scanning MANIFEST_DIR never sees a partially written event.
write_loop_event() {
  local status="$1"
  local exit_code="$2"
  local message="$3"
  local path="${MANIFEST_DIR%/}/collector_loop_$(utc_compact).json"
  mkdir -p "${MANIFEST_DIR}"
  PYTHONDONTWRITEBYTECODE=1 python3 - "$path" "$status" "$exit_code" "$message" <<'PY_LOOP_EVENT'
import datetime as dt
import json
import os
import sys
from pathlib import Path

path = Path(sys.argv[1])
status = sys.argv[2]
exit_code = int(sys.argv[3])
message = sys.argv[4]
# timezone.utc works on every Python 3; the dt.UTC alias needs 3.11+.
now = dt.datetime.now(dt.timezone.utc).replace(microsecond=0).isoformat().replace('+00:00', 'Z')
tmp_path = path.with_name(path.name + '.tmp')
tmp_path.write_text(json.dumps({
    'schema_name': 'collector_loop_event',
    'schema_version': 1,
    'written_at_utc': now,
    'status': status,
    'exit_code': exit_code,
    'message': message,
}, indent=2, sort_keys=True) + '\n', encoding='utf-8')
os.replace(tmp_path, path)
PY_LOOP_EVENT
}
|
||||
|
||||
# Signal handler: flag that the loop should stop and, when a cycle child is
# recorded in CHILD_PID and still alive, forward SIGTERM to it.
request_stop() {
  STOP_REQUESTED=1
  # Nothing to forward to when no cycle is running.
  [[ -n "${CHILD_PID}" ]] || return 0
  if kill -0 "${CHILD_PID}" >/dev/null 2>&1; then
    # Best effort: the child may exit between the probe and the signal.
    kill -TERM "${CHILD_PID}" >/dev/null 2>&1 || true
  fi
}
|
||||
|
||||
# Main supervisor loop: run one collector cycle at a time as a background
# child, record failures and interruptions as loop events, and sleep between
# cycles until a stop is requested.
trap request_stop INT TERM

mkdir -p "${MANIFEST_DIR}"
cd "${APP_DIR}" || exit 1

echo "collector loop started at $(utc_iso)"

while [[ "${STOP_REQUESTED}" -eq 0 ]]; do
  cycle_started="$(utc_iso)"
  echo "collector cycle starting at ${cycle_started}"

  # Run the cycle in the background so the INT/TERM trap can fire while we
  # are blocked in wait.
  /bin/bash scripts/run_polymarket_collector_cycle.sh &
  CHILD_PID="$!"
  wait "${CHILD_PID}"
  cycle_exit="$?"
  # A trapped signal makes wait return immediately (status > 128) even
  # though the child is still shutting down. Keep waiting until the child
  # has really exited so it is not abandoned mid-write and so the recorded
  # exit code is the child's own.
  while [[ "${STOP_REQUESTED}" -ne 0 ]] && kill -0 "${CHILD_PID}" >/dev/null 2>&1; do
    wait "${CHILD_PID}"
    cycle_exit="$?"
  done
  CHILD_PID=""

  if [[ "${STOP_REQUESTED}" -ne 0 ]]; then
    write_loop_event "INTERRUPTED" "${cycle_exit}" "collector loop received stop request during or after cycle"
    break
  fi

  if [[ "${cycle_exit}" -ne 0 ]]; then
    write_loop_event "CYCLE_FAILED" "${cycle_exit}" "collector cycle exited nonzero; loop will continue after sleep"
    echo "collector cycle failed with exit ${cycle_exit}; continuing after ${LOOP_SLEEP_SECONDS}s" >&2
  else
    echo "collector cycle completed at $(utc_iso)"
  fi

  # Sleep in one-second steps so a stop request is noticed promptly.
  for ((i = 0; i < LOOP_SLEEP_SECONDS; i++)); do
    if [[ "${STOP_REQUESTED}" -ne 0 ]]; then
      break
    fi
    sleep 1
  done
done

echo "collector loop stopped at $(utc_iso)"
|
||||
462
scripts/upload_archive_rclone.sh
Executable file
462
scripts/upload_archive_rclone.sh
Executable file
|
|
@ -0,0 +1,462 @@
|
|||
#!/usr/bin/env bash
# rclone-based uploader for collector archives and manifests.
# errexit (-e) is not enabled; exit codes of rclone are captured explicitly
# further down in the script.
set -uo pipefail

# Identity recorded in the upload manifest.
SCRIPT_NAME="orderbooks_rclone_uploader"
SCRIPT_VERSION="0.1.0"

# Defaults; each can be overridden by the environment variable shown and,
# later, by the matching command-line option.
MODE="dry-run"
CLEANUP_AFTER_VERIFY=0
DATA_DIR="${ORDERBOOKS_UPLOAD_DATA_DIR:-${ORDERBOOKS_DATA_DIR:-/var/lib/orderbooks}}"
# Empty values below are resolved relative to DATA_DIR after argument parsing.
RAW_DIR="${ORDERBOOKS_UPLOAD_RAW_DIR:-}"
SOURCE_MANIFEST_DIR="${ORDERBOOKS_UPLOAD_SOURCE_MANIFEST_DIR:-}"
MANIFEST_DIR="${ORDERBOOKS_UPLOAD_MANIFEST_DIR:-}"
MANIFEST_PATH="${ORDERBOOKS_UPLOAD_MANIFEST_PATH:-}"
DEST="${ORDERBOOKS_RCLONE_DEST:-}"
RCLONE_BIN="${ORDERBOOKS_RCLONE_BIN:-rclone}"
MIN_AGE_SECONDS="${ORDERBOOKS_UPLOAD_MIN_AGE_SECONDS:-600}"
RETENTION_DAYS="${ORDERBOOKS_UPLOAD_RETENTION_DAYS:-7}"
# Passed to rclone copy as --transfers / --checkers.
TRANSFERS="${ORDERBOOKS_RCLONE_TRANSFERS:-4}"
CHECKERS="${ORDERBOOKS_RCLONE_CHECKERS:-8}"
|
||||
|
||||
# Print the command-line help text to stdout.
usage() {
  cat <<'USAGE_TEXT'
Usage: scripts/upload_archive_rclone.sh [options]

Uploads closed raw collector archive files and manifests with rclone.
Default mode is dry-run. Real upload requires --execute and a destination.

Options:
--dry-run Plan and run rclone copy with --dry-run (default).
--execute Run real rclone copy and rclone check.
--cleanup-after-verify Delete uploaded local files older than retention only after verification.
--data-dir DIR Base data directory. Default: /var/lib/orderbooks.
--raw-dir DIR Raw collector output directory. Default: DATA_DIR/raw_orderbooks.
--source-manifest-dir DIR Source collector manifest directory. Default: DATA_DIR/manifests.
--manifest-dir DIR Upload manifest output directory. Default: DATA_DIR/manifests.
--manifest-path PATH Exact upload manifest path.
--dest REMOTE:PATH rclone destination. Or set ORDERBOOKS_RCLONE_DEST.
--min-age-seconds N Skip files modified within N seconds. Default: 600.
--retention-days N Keep at least N days locally. Default: 7.
--rclone-bin PATH rclone binary path. Default: rclone.
--help Show this help.
USAGE_TEXT
}
|
||||
|
||||
# Parse command-line options, overriding the environment-derived defaults.
# Unknown options and value options given without a value print the usage
# text to stderr and exit with status 2.
while [[ $# -gt 0 ]]; do
  case "$1" in
    --dry-run)
      MODE="dry-run"
      shift
      ;;
    --execute)
      MODE="execute"
      shift
      ;;
    --cleanup-after-verify)
      CLEANUP_AFTER_VERIFY=1
      shift
      ;;
    --data-dir|--raw-dir|--source-manifest-dir|--manifest-dir|--manifest-path|--dest|--min-age-seconds|--retention-days|--rclone-bin)
      # Every option in this group consumes the following argument. Without
      # this guard a trailing option would abort with an "unbound variable"
      # error under set -u instead of a usage message.
      if [[ $# -lt 2 ]]; then
        echo "Missing value for argument: $1" >&2
        usage >&2
        exit 2
      fi
      case "$1" in
        --data-dir) DATA_DIR="$2" ;;
        --raw-dir) RAW_DIR="$2" ;;
        --source-manifest-dir) SOURCE_MANIFEST_DIR="$2" ;;
        --manifest-dir) MANIFEST_DIR="$2" ;;
        --manifest-path) MANIFEST_PATH="$2" ;;
        --dest) DEST="$2" ;;
        --min-age-seconds) MIN_AGE_SECONDS="$2" ;;
        --retention-days) RETENTION_DAYS="$2" ;;
        --rclone-bin) RCLONE_BIN="$2" ;;
      esac
      shift 2
      ;;
    --help)
      usage
      exit 0
      ;;
    *)
      echo "Unknown argument: $1" >&2
      usage >&2
      exit 2
      ;;
  esac
done
|
||||
|
||||
# Resolve directories that were left empty relative to DATA_DIR.
if [[ -z "${RAW_DIR}" ]]; then
  RAW_DIR="${DATA_DIR%/}/raw_orderbooks"
fi
if [[ -z "${SOURCE_MANIFEST_DIR}" ]]; then
  SOURCE_MANIFEST_DIR="${DATA_DIR%/}/manifests"
fi
if [[ -z "${MANIFEST_DIR}" ]]; then
  MANIFEST_DIR="${DATA_DIR%/}/manifests"
fi

STARTED_AT="$(date -u +%Y-%m-%dT%H:%M:%SZ)"
RUN_ID="$(date -u +%Y%m%dT%H%M%SZ)"
if [[ -z "${MANIFEST_PATH}" ]]; then
  MANIFEST_PATH="${MANIFEST_DIR%/}/upload_archive_${RUN_ID}.json"
fi

# Private scratch directory, removed on exit. A failed mktemp must stop the
# run: with an empty value the paths below would become /plan.json, /stage
# and so on, directly under the filesystem root.
if ! TMPDIR="$(mktemp -d)" || [[ -z "${TMPDIR}" ]]; then
  echo "Could not create a temporary working directory." >&2
  exit 1
fi
trap 'rm -rf "${TMPDIR}"' EXIT

PLAN_PATH="${TMPDIR}/plan.json"
RCLONE_COPY_LOG="${TMPDIR}/rclone_copy.log"
RCLONE_CHECK_LOG="${TMPDIR}/rclone_check.log"
CLEANUP_PATH="${TMPDIR}/cleanup.json"
STAGING_DIR="${TMPDIR}/stage"

if ! mkdir -p "$(dirname "${MANIFEST_PATH}")" "${STAGING_DIR}"; then
  echo "Could not create manifest or staging directory." >&2
  exit 1
fi
|
||||
|
||||
# Build the upload plan: walk the raw and manifest source directories, skip
# the current upload manifest and files younger than MIN_AGE_SECONDS, copy
# every selected file into the staging directory and write the plan (selected
# files, skipped files, warnings) as JSON to PLAN_PATH.
python3 - "$DATA_DIR" "$RAW_DIR" "$SOURCE_MANIFEST_DIR" "$MANIFEST_PATH" "$MIN_AGE_SECONDS" "$STAGING_DIR" "$PLAN_PATH" <<'PY'
import datetime as dt
import hashlib
import json
import shutil
import sys
from pathlib import Path

data_dir = Path(sys.argv[1])
raw_dir = Path(sys.argv[2])
source_manifest_dir = Path(sys.argv[3])
manifest_path = Path(sys.argv[4]).resolve()
min_age_seconds = int(sys.argv[5])
staging_dir = Path(sys.argv[6])
plan_path = Path(sys.argv[7])
# timezone.utc works on every Python 3; the dt.UTC alias needs 3.11+.
UTC = dt.timezone.utc
now = dt.datetime.now(UTC)
data_root = data_dir.resolve()


def iso_z_from_ts(ts: float) -> str:
    """Format a POSIX timestamp as second-precision UTC with a Z suffix."""
    return dt.datetime.fromtimestamp(ts, UTC).replace(microsecond=0).isoformat().replace("+00:00", "Z")


def sha256_file(path: Path) -> str:
    """Return the hex SHA-256 of a file, read in 1 MiB chunks."""
    digest = hashlib.sha256()
    with path.open("rb") as handle:
        for chunk in iter(lambda: handle.read(1024 * 1024), b""):
            digest.update(chunk)
    return digest.hexdigest()


def rel_for(path: Path) -> str:
    """Path relative to the data dir, or just the file name when outside it."""
    resolved = path.resolve()
    try:
        return resolved.relative_to(data_root).as_posix()
    except ValueError:
        return resolved.name


def iter_files(root: Path):
    """Yield every regular file below root in sorted order."""
    if not root.exists():
        return
    for path in sorted(root.rglob("*")):
        if path.is_file():
            yield path


selected = []
skipped = []
warnings = []
seen = set()
# Relative paths already staged. Files outside the data dir collapse to
# their bare name, so two different files can map to one staged path.
staged_rel_paths = set()

for root, kind in [(raw_dir, "raw"), (source_manifest_dir, "manifest")]:
    if not root.exists():
        warnings.append(f"{kind} source directory does not exist: {root}")
        continue
    for path in iter_files(root):
        resolved = path.resolve()
        if resolved in seen:
            continue
        seen.add(resolved)
        rel = rel_for(path)
        stat = path.stat()
        age_seconds = max(0, int(now.timestamp() - stat.st_mtime))
        base = {
            "local_path": str(path),
            "relative_path": rel,
            "kind": kind,
            "bytes": stat.st_size,
            "mtime_utc": iso_z_from_ts(stat.st_mtime),
            "age_seconds": age_seconds,
        }
        if resolved == manifest_path:
            skipped.append({**base, "reason": "current_upload_manifest"})
            continue
        if age_seconds < min_age_seconds:
            skipped.append({**base, "reason": "modified_within_min_age_seconds"})
            continue
        if rel in staged_rel_paths:
            # Staging this file would overwrite an earlier one, yet both
            # would be reported as uploaded (and could later be cleaned up
            # locally). Skip it and surface the conflict instead.
            warnings.append(f"relative path collision; file not staged: {path} -> {rel}")
            skipped.append({**base, "reason": "relative_path_collision"})
            continue
        staged_rel_paths.add(rel)
        checksum = sha256_file(path)
        staged_path = staging_dir / rel
        staged_path.parent.mkdir(parents=True, exist_ok=True)
        shutil.copy2(path, staged_path)
        selected.append({**base, "sha256": checksum, "staged_path": str(staged_path)})

plan = {
    "selected_files": selected,
    "skipped_files": skipped,
    "warnings": warnings,
}
plan_path.write_text(json.dumps(plan, indent=2, sort_keys=True) + "\n", encoding="utf-8")
PY
|
||||
|
||||
# Probe for rclone, then copy the staging directory to the destination and,
# in execute mode, verify it with rclone check. The outcome is recorded in
# OPERATION_STATUS / GATE_STATUS and the *_ATTEMPTED / *_EXIT_CODE variables.

# Result defaults: nothing attempted, real-upload gate still blocked.
COPY_EXIT_CODE=""
CHECK_EXIT_CODE=""
COPY_ATTEMPTED=0
CHECK_ATTEMPTED=0
OPERATION_STATUS="PLANNED"
GATE_STATUS="BLOCKED_REAL_UPLOAD"

RCLONE_AVAILABLE=0
RCLONE_VERSION=""
if command -v "${RCLONE_BIN}" >/dev/null 2>&1; then
  RCLONE_AVAILABLE=1
  # Only the first line of "rclone version" is kept.
  RCLONE_VERSION="$("${RCLONE_BIN}" version 2>/dev/null | head -n 1 || true)"
fi

DEST_CONFIGURED=0
if [[ -n "${DEST}" ]]; then
  DEST_CONFIGURED=1
fi

if [[ "${DEST_CONFIGURED}" -eq 0 ]]; then
  OPERATION_STATUS="BLOCKED_DEST_MISSING"
elif [[ "${RCLONE_AVAILABLE}" -eq 0 ]]; then
  OPERATION_STATUS="BLOCKED_RCLONE_UNAVAILABLE"
else
  COPY_ATTEMPTED=1
  rclone_copy_cmd=(
    copy "${STAGING_DIR}/" "${DEST%/}/"
    --checksum
    --transfers "${TRANSFERS}"
    --checkers "${CHECKERS}"
  )
  if [[ "${MODE}" == "dry-run" ]]; then
    rclone_copy_cmd+=(--dry-run)
  fi
  "${RCLONE_BIN}" "${rclone_copy_cmd[@]}" >"${RCLONE_COPY_LOG}" 2>&1
  COPY_EXIT_CODE=$?

  if [[ "${COPY_EXIT_CODE}" -ne 0 ]]; then
    OPERATION_STATUS="COPY_FAILED"
    GATE_STATUS="FAIL"
  elif [[ "${MODE}" == "dry-run" ]]; then
    # A successful dry run leaves the gate at BLOCKED_REAL_UPLOAD.
    OPERATION_STATUS="DRY_RUN_PASS"
  else
    CHECK_ATTEMPTED=1
    "${RCLONE_BIN}" check "${STAGING_DIR}/" "${DEST%/}/" --one-way --checksum >"${RCLONE_CHECK_LOG}" 2>&1
    CHECK_EXIT_CODE=$?
    if [[ "${CHECK_EXIT_CODE}" -eq 0 ]]; then
      OPERATION_STATUS="UPLOAD_VERIFIED"
      GATE_STATUS="PASS"
    else
      OPERATION_STATUS="VERIFY_FAILED"
      GATE_STATUS="FAIL"
    fi
  fi
fi
|
||||
|
||||
# Optional local cleanup. Only in execute mode, with --cleanup-after-verify
# and a verified upload, delete selected local files whose mtime is older
# than the retention window. Writes the retained/deleted lists as JSON to
# CLEANUP_PATH. The seventh argument (gate status) is accepted but not used.
python3 - "$PLAN_PATH" "$CLEANUP_PATH" "$MODE" "$CLEANUP_AFTER_VERIFY" "$RETENTION_DAYS" "$OPERATION_STATUS" "$GATE_STATUS" <<'PY'
import datetime as dt
import json
import sys
from pathlib import Path

plan_path = Path(sys.argv[1])
cleanup_path = Path(sys.argv[2])
mode = sys.argv[3]
cleanup_after_verify = sys.argv[4] == "1"
retention_days = int(sys.argv[5])
operation_status = sys.argv[6]
plan = json.loads(plan_path.read_text(encoding="utf-8"))
# timezone.utc works on every Python 3; the dt.UTC alias needs 3.11+.
UTC = dt.timezone.utc
now = dt.datetime.now(UTC)
cutoff = now - dt.timedelta(days=retention_days)
retained = []
deleted = []

if mode == "execute" and cleanup_after_verify and operation_status == "UPLOAD_VERIFIED":
    deleted_at = now.replace(microsecond=0).isoformat().replace("+00:00", "Z")
    for item in plan["selected_files"]:
        path = Path(item["local_path"])
        try:
            mtime = dt.datetime.fromtimestamp(path.stat().st_mtime, UTC)
        except FileNotFoundError:
            # The file vanished after staging. Report that explicitly
            # instead of labelling it as inside the retention window.
            retained.append({**item, "reason": "missing_before_cleanup"})
            continue
        if mtime >= cutoff:
            retained.append({**item, "reason": "within_retention_window"})
            continue
        try:
            path.unlink()
        except FileNotFoundError:
            retained.append({**item, "reason": "missing_before_cleanup"})
            continue
        deleted.append({**item, "deleted_at_utc": deleted_at})
else:
    reason = "cleanup_not_requested"
    if mode != "execute":
        reason = "dry_run"
    elif operation_status != "UPLOAD_VERIFIED":
        reason = "not_verified"
    for item in plan["selected_files"]:
        retained.append({**item, "reason": reason})

cleanup_path.write_text(
    json.dumps({"retained_local_files": retained, "deleted_local_files": deleted}, indent=2, sort_keys=True) + "\n",
    encoding="utf-8",
)
PY
|
||||
|
||||
# Record the end time, export the run state for the embedded Python program,
# then write the upload manifest and print a short JSON summary to stdout.
ENDED_AT="$(date -u +%Y-%m-%dT%H:%M:%SZ)"

export \
  SCRIPT_NAME SCRIPT_VERSION STARTED_AT ENDED_AT \
  MODE OPERATION_STATUS GATE_STATUS \
  RCLONE_BIN RCLONE_AVAILABLE RCLONE_VERSION DEST \
  COPY_ATTEMPTED CHECK_ATTEMPTED COPY_EXIT_CODE CHECK_EXIT_CODE \
  DATA_DIR RAW_DIR SOURCE_MANIFEST_DIR MIN_AGE_SECONDS RETENTION_DAYS CLEANUP_AFTER_VERIFY

python3 - "$PLAN_PATH" "$CLEANUP_PATH" "$MANIFEST_PATH" <<'PY'
import json
import os
import sys
from pathlib import Path

env = os.environ


def env_flag(name):
    """True when the exported shell variable holds the string "1"."""
    return env[name] == "1"


def optional_int(text):
    """Turn a possibly empty exit-code string into an int, or None."""
    return int(text) if text else None


def without_staging(entries):
    """Copy each entry, dropping the temporary staged_path field."""
    return [
        {key: value for key, value in entry.items() if key != "staged_path"}
        for entry in entries
    ]


plan = json.loads(Path(sys.argv[1]).read_text())
cleanup = json.loads(Path(sys.argv[2]).read_text())
manifest_path = Path(sys.argv[3])

mode = env["MODE"]
operation_status = env["OPERATION_STATUS"]
gate_status = env["GATE_STATUS"]
copy_attempted = env_flag("COPY_ATTEMPTED")
check_attempted = env_flag("CHECK_ATTEMPTED")
dest = env["DEST"]

selected = without_staging(plan["selected_files"])
skipped = without_staging(plan["skipped_files"])
retained_local = without_staging(cleanup["retained_local_files"])
deleted_local = without_staging(cleanup["deleted_local_files"])

# Which files count as attempted / uploaded / verified / dry-run depends on
# the mode and on how far the rclone steps got.
executed = mode == "execute"
attempted_files = selected if copy_attempted else []
uploaded_files = selected if executed and operation_status in {"UPLOAD_VERIFIED", "VERIFY_FAILED"} else []
verified_files = selected if executed and operation_status == "UPLOAD_VERIFIED" else []
dry_run_files = selected if mode == "dry-run" and operation_status == "DRY_RUN_PASS" else []

warnings = plan["warnings"]
if operation_status == "BLOCKED_RCLONE_UNAVAILABLE":
    warnings.append("rclone binary was not available; copy and verification were not attempted.")
if operation_status == "BLOCKED_DEST_MISSING":
    warnings.append("No rclone destination was configured; set --dest or ORDERBOOKS_RCLONE_DEST.")
if mode == "dry-run":
    warnings.append("Dry-run mode does not perform a real upload; checkpoint real-upload gate remains blocked.")

manifest = {
    "schema_name": "upload_archive_manifest",
    "schema_version": 1,
    "checkpoint_id": 7,
    "checkpoint_name": "Google Drive Offload",
    "uploader": {
        "name": env["SCRIPT_NAME"],
        "version": env["SCRIPT_VERSION"],
    },
    "started_at_utc": env["STARTED_AT"],
    "ended_at_utc": env["ENDED_AT"],
    "command_mode": mode,
    "operation_status": operation_status,
    "gate_status": gate_status,
    "rclone": {
        "binary": env["RCLONE_BIN"],
        "available": env_flag("RCLONE_AVAILABLE"),
        "version": env["RCLONE_VERSION"],
        "destination_configured": bool(dest),
        "destination": dest or None,
        "copy_attempted": copy_attempted,
        "copy_exit_code": optional_int(env["COPY_EXIT_CODE"]),
        "check_attempted": check_attempted,
        "check_exit_code": optional_int(env["CHECK_EXIT_CODE"]),
    },
    "config": {
        "data_dir": env["DATA_DIR"],
        "raw_dir": env["RAW_DIR"],
        "source_manifest_dir": env["SOURCE_MANIFEST_DIR"],
        "manifest_path": str(manifest_path),
        "min_age_seconds": int(env["MIN_AGE_SECONDS"]),
        "retention_days": int(env["RETENTION_DAYS"]),
        "cleanup_after_verify": env_flag("CLEANUP_AFTER_VERIFY"),
    },
    "planned_files": selected,
    "attempted_files": attempted_files,
    "dry_run_files": dry_run_files,
    "uploaded_files": uploaded_files,
    "verified_files": verified_files,
    "skipped_open_or_recent_files": [
        entry for entry in skipped if entry.get("reason") == "modified_within_min_age_seconds"
    ],
    "skipped_files": skipped,
    "retained_local_files": retained_local,
    "deleted_local_files": deleted_local,
    "counts": {
        "planned": len(selected),
        "attempted": len(attempted_files),
        "dry_run": len(dry_run_files),
        "uploaded": len(uploaded_files),
        "verified": len(verified_files),
        "skipped": len(skipped),
        "retained_local": len(retained_local),
        "deleted_local": len(deleted_local),
    },
    "warnings": warnings,
    "known_gaps": [
        "A dry-run does not prove remote write access.",
        "Real upload requires a configured rclone remote outside the repository.",
        "Local files are retained unless --cleanup-after-verify is used after successful verification.",
    ],
}

manifest_path.parent.mkdir(parents=True, exist_ok=True)
manifest_path.write_text(json.dumps(manifest, indent=2, sort_keys=True) + "\n", encoding="utf-8")

summary = {
    "gate_status": gate_status,
    "operation_status": operation_status,
    "manifest_path": str(manifest_path),
    "planned_files": len(selected),
    "attempted_files": len(attempted_files),
    "uploaded_files": len(uploaded_files),
    "verified_files": len(verified_files),
    "skipped_files": len(plan["skipped_files"]),
}
print(json.dumps(summary, indent=2, sort_keys=True))
PY
|
||||
|
||||
# Translate the final operation status into the script's exit code:
# 0 for a verified upload or passing dry run, 2 for a missing destination,
# 3 for a missing rclone binary, 1 for every other status.
if [[ "${OPERATION_STATUS}" == "UPLOAD_VERIFIED" || "${OPERATION_STATUS}" == "DRY_RUN_PASS" ]]; then
  exit 0
elif [[ "${OPERATION_STATUS}" == "BLOCKED_DEST_MISSING" ]]; then
  echo "No rclone destination configured. Set --dest or ORDERBOOKS_RCLONE_DEST." >&2
  exit 2
elif [[ "${OPERATION_STATUS}" == "BLOCKED_RCLONE_UNAVAILABLE" ]]; then
  echo "rclone is not available. Install rclone before running dry-run or execute mode." >&2
  exit 3
else
  echo "Upload operation failed with status: ${OPERATION_STATUS}" >&2
  exit 1
fi
|
||||
285
scripts/vps_preflight_check.sh
Executable file
285
scripts/vps_preflight_check.sh
Executable file
|
|
@ -0,0 +1,285 @@
|
|||
#!/usr/bin/env bash
|
||||
# Unset variables are errors and a pipeline fails if any stage fails.
# errexit (-e) is not enabled, so the script keeps going after a failed check.
set -u -o pipefail

# Defaults; each may be overridden by the matching command-line option.
APP_DIR="$(pwd)"
PYTHON_BIN="${ORDERBOOKS_PYTHON:-python3}"
RCLONE_BIN="${ORDERBOOKS_RCLONE_BIN:-rclone}"
RCLONE_REMOTE="${ORDERBOOKS_RCLONE_DEST:-}"

# Optional target directories; an empty value skips the corresponding check.
DATA_DIR=""
MANIFEST_DIR=""
LOG_DIR=""

MIN_FREE_GIB="${ORDERBOOKS_PREFLIGHT_MIN_FREE_GIB:-5}"
REMOTE_TIMEOUT_SECONDS="${ORDERBOOKS_PREFLIGHT_REMOTE_TIMEOUT_SECONDS:-30}"

# Counters incremented by log_fail / log_warn and reported in the SUMMARY line.
FAILURES=0
WARNINGS=0
|
||||
|
||||
# Print the command-line help text to stdout.
usage() {
  cat <<'USAGE_TEXT'
Usage: scripts/vps_preflight_check.sh [options]

Read-only VPS cutover preflight for the Polymarket order-book collector.

Default behavior checks the repository, local tooling, unit syntax, disk space,
and rclone availability. It does not print rclone config and does not require
secrets.

Options:
--app-dir DIR Repository checkout path. Default: current directory.
--python-bin PATH Python interpreter. Default: ORDERBOOKS_PYTHON or python3.
--rclone-bin PATH rclone binary. Default: ORDERBOOKS_RCLONE_BIN or rclone.
--rclone-remote REMOTE Optional remote/path to check read-only, e.g. gdrive:orderbooks/polymarket.
--data-dir DIR Optional target data directory to create/check writable.
--manifest-dir DIR Optional target manifest directory to create/check writable.
--log-dir DIR Optional target log directory to create/check writable.
--min-free-gib N Minimum free GiB for checked filesystems. Default: 5.
--remote-timeout-seconds N Timeout for rclone remote read check. Default: 30.
--help Show this help.

Directory options intentionally create missing directories before checking
writability. Omit them for a repo-only read-only check.
USAGE_TEXT
}
|
||||
|
||||
# Emit a PASS line for a successful check.
log_pass() {
  printf 'PASS %s\n' "$*"
}

# Emit an informational line; does not affect the counters.
log_info() {
  printf 'INFO %s\n' "$*"
}

# Count a warning and emit a WARN line.
log_warn() {
  WARNINGS=$(( WARNINGS + 1 ))
  printf 'WARN %s\n' "$*"
}

# Count a failure and emit a FAIL line.
log_fail() {
  FAILURES=$(( FAILURES + 1 ))
  printf 'FAIL %s\n' "$*"
}

# Run a command with stdout and stderr discarded, keeping its exit status.
run_quiet() {
  "$@" >/dev/null 2>&1
}
|
||||
|
||||
# Parse command-line options. Every option except --help takes exactly one
# value. A trailing option without a value is reported explicitly; otherwise
# "$2" would abort the script with an "unbound variable" error under `set -u`.
while [[ $# -gt 0 ]]; do
  case "$1" in
    --help)
      usage
      exit 0
      ;;
    --app-dir|--python-bin|--rclone-bin|--rclone-remote|--data-dir|--manifest-dir|--log-dir|--min-free-gib|--remote-timeout-seconds)
      if [[ $# -lt 2 ]]; then
        log_fail "missing value for argument: $1"
        usage >&2
        exit 2
      fi
      case "$1" in
        --app-dir) APP_DIR="$2" ;;
        --python-bin) PYTHON_BIN="$2" ;;
        --rclone-bin) RCLONE_BIN="$2" ;;
        --rclone-remote) RCLONE_REMOTE="$2" ;;
        --data-dir) DATA_DIR="$2" ;;
        --manifest-dir) MANIFEST_DIR="$2" ;;
        --log-dir) LOG_DIR="$2" ;;
        --min-free-gib) MIN_FREE_GIB="$2" ;;
        --remote-timeout-seconds) REMOTE_TIMEOUT_SECONDS="$2" ;;
      esac
      shift 2
      ;;
    *)
      log_fail "unknown argument: $1"
      usage >&2
      exit 2
      ;;
  esac
done
|
||||
|
||||
# Normalize APP_DIR by dropping one trailing slash. The filesystem root is left
# alone: stripping "/" would yield an empty path and a bogus "does not exist".
if [[ "${APP_DIR}" != "/" ]]; then
  APP_DIR="${APP_DIR%/}"
fi

# The remaining checks use paths relative to the checkout, so a missing or
# inaccessible app directory ends the run immediately with a summary line.
if [[ ! -d "${APP_DIR}" ]]; then
  log_fail "app directory does not exist: ${APP_DIR}"
  printf 'SUMMARY failures=%s warnings=%s\n' "${FAILURES}" "${WARNINGS}"
  exit 1
fi

cd "${APP_DIR}" || {
  log_fail "could not cd to app directory: ${APP_DIR}"
  printf 'SUMMARY failures=%s warnings=%s\n' "${FAILURES}" "${WARNINGS}"
  exit 1
}
|
||||
|
||||
# Report whether the configured Python interpreter can be found, including the
# version string it prints.
check_python() {
  # Keep the version string function-local so it does not leak into globals.
  local version
  if command -v "${PYTHON_BIN}" >/dev/null 2>&1; then
    # `|| true`: a failing --version call still leaves the interpreter "available".
    version="$("${PYTHON_BIN}" --version 2>&1 || true)"
    log_pass "python available: ${PYTHON_BIN} (${version})"
  else
    log_fail "python not found: ${PYTHON_BIN}"
  fi
}
|
||||
|
||||
# Verify that every file the deployment relies on exists in the checkout.
# Logs one PASS/FAIL line per file; returns 1 if any file is missing, else 0.
check_required_files() {
  local -a expected=(
    "scripts/discover_polymarket_btc_markets.py"
    "scripts/collect_polymarket_orderbooks.py"
    "scripts/normalize_polymarket_orderbooks.py"
    "scripts/run_polymarket_collector_cycle.sh"
    "scripts/upload_archive_rclone.sh"
    "scripts/vps_runtime_smoke_check.sh"
    "config/polymarket_collector.vps.example.yaml"
    "docs/VPS_DEPLOYMENT.md"
    "docs/GOOGLE_DRIVE_OFFLOAD.md"
    "systemd/polymarket-orderbook-collector.service"
    "systemd/polymarket-orderbook-uploader.service"
    "systemd/polymarket-orderbook-uploader.timer"
  )
  local any_missing=0 entry
  for entry in "${expected[@]}"; do
    if [[ ! -f "${entry}" ]]; then
      any_missing=1
      log_fail "required file missing: ${entry}"
      continue
    fi
    log_pass "required file exists: ${entry}"
  done
  return "${any_missing}"
}
|
||||
|
||||
# Syntax-check the three Python entry points by compiling their source in
# memory with the builtin compile(), with all interpreter output discarded.
check_python_compile() {
  command -v "${PYTHON_BIN}" >/dev/null 2>&1 || {
    log_fail "cannot compile Python scripts because Python is missing"
    return
  }
  if ! run_quiet "${PYTHON_BIN}" - <<'PY'
from pathlib import Path

paths = [
    Path("scripts/discover_polymarket_btc_markets.py"),
    Path("scripts/collect_polymarket_orderbooks.py"),
    Path("scripts/normalize_polymarket_orderbooks.py"),
]
for path in paths:
    source = path.read_text(encoding="utf-8")
    compile(source, str(path), "exec")
PY
  then
    log_fail "Python no-bytecode compile check failed"
    return
  fi
  log_pass "collector/discovery/normalization Python scripts compile without bytecode writes"
}
|
||||
|
||||
# Run `bash -n` over every scripts/*.sh file and log one line per script.
# Returns 1 if any script fails to parse, else 0.
check_shell_syntax() {
  local any_broken=0 candidate
  for candidate in scripts/*.sh; do
    # An unmatched glob stays literal; skip anything that is not a real file.
    if [[ ! -f "${candidate}" ]]; then
      continue
    fi
    if ! bash -n "${candidate}" >/dev/null 2>&1; then
      any_broken=1
      log_fail "bash syntax failed: ${candidate}"
      continue
    fi
    log_pass "bash syntax ok: ${candidate}"
  done
  return "${any_broken}"
}
|
||||
|
||||
# Validate the shipped unit files with `systemd-analyze verify` when that tool
# is installed; otherwise record a warning and skip the check.
check_systemd_units() {
  local -a unit_files=(
    "systemd/polymarket-orderbook-collector.service"
    "systemd/polymarket-orderbook-uploader.service"
    "systemd/polymarket-orderbook-uploader.timer"
  )
  if ! command -v systemd-analyze >/dev/null 2>&1; then
    log_warn "systemd-analyze unavailable; skipped unit parse check"
    return
  fi
  if ! systemd-analyze verify "${unit_files[@]}" >/dev/null 2>&1; then
    log_fail "systemd-analyze verify failed for one or more units"
    return
  fi
  log_pass "systemd units parse with systemd-analyze"
}
|
||||
|
||||
# Print the remote-name part of an rclone destination ("name:" including the
# colon), i.e. everything before the first ':'. Prints an empty line when the
# destination contains no ':' at all.
remote_name_from_dest() {
  local destination="$1"
  if [[ "${destination}" != *:* ]]; then
    printf '\n'
    return
  fi
  printf '%s:\n' "${destination%%:*}"
}
|
||||
|
||||
# Run a command under `timeout` limited to REMOTE_TIMEOUT_SECONDS when the
# timeout utility exists; otherwise run it directly with no time limit.
# The command's exit status is passed through.
run_with_timeout() {
  if ! command -v timeout >/dev/null 2>&1; then
    "$@"
    return
  fi
  timeout "${REMOTE_TIMEOUT_SECONDS}" "$@"
}
|
||||
|
||||
# Check that rclone is installed and, when a remote destination is configured,
# that the remote is known to rclone and can be listed read-only. The rclone
# configuration itself is never printed.
check_rclone() {
  # All scratch variables are function-local so nothing leaks into globals.
  local version remote_name configured_remotes

  if [[ -x "${RCLONE_BIN}" ]] || command -v "${RCLONE_BIN}" >/dev/null 2>&1; then
    version="$("${RCLONE_BIN}" version 2>/dev/null | head -n 1 || true)"
    log_pass "rclone available: ${RCLONE_BIN} (${version})"
  else
    log_fail "rclone not found: ${RCLONE_BIN}"
    return
  fi

  if [[ -z "${RCLONE_REMOTE}" ]]; then
    log_info "no rclone remote provided; skipped remote access check"
    return
  fi

  remote_name="$(remote_name_from_dest "${RCLONE_REMOTE}")"
  if [[ -z "${remote_name}" ]]; then
    log_fail "rclone remote must include a remote name ending in ':': ${RCLONE_REMOTE}"
    return
  fi

  # Capture the remote list before matching. Piping straight into `grep -q`
  # lets grep exit on the first match; under `set -o pipefail` a resulting
  # SIGPIPE in rclone would turn a successful match into a false failure.
  if configured_remotes="$("${RCLONE_BIN}" listremotes 2>/dev/null)" \
    && grep -Fxq -- "${remote_name}" <<<"${configured_remotes}"; then
    log_pass "rclone remote is configured: ${remote_name}"
  else
    log_fail "rclone remote is not configured or not visible to this user: ${remote_name}"
    return
  fi

  if run_with_timeout "${RCLONE_BIN}" lsf --max-depth 1 "${RCLONE_REMOTE}" >/dev/null 2>&1; then
    log_pass "rclone remote read check succeeded without printing config: ${RCLONE_REMOTE}"
  else
    log_fail "rclone remote read check failed or timed out: ${RCLONE_REMOTE}"
  fi
}
|
||||
|
||||
# Create (if needed) and check writability of an optional target directory.
#   $1 - label used in log messages
#   $2 - directory path; an empty path skips the check
check_target_dir() {
  local label="$1" path="$2"
  if [[ -z "${path}" ]]; then
    log_info "no ${label} directory provided; skipped create/write check"
    return
  fi
  if ! mkdir -p "${path}" >/dev/null 2>&1 || [[ ! -d "${path}" || ! -w "${path}" ]]; then
    log_fail "${label} directory cannot be created or is not writable: ${path}"
    return
  fi
  log_pass "${label} directory exists and is writable: ${path}"
}
|
||||
|
||||
# Compare the free space of the filesystem holding a path with MIN_FREE_GIB.
#   $1 - path whose filesystem is inspected with `df -Pk`
#   $2 - label used in log messages
check_disk_free() {
  local target="$1" label="$2" available_kib min_kib
  if [[ ! -e "${target}" ]]; then
    log_warn "disk target does not exist, skipping ${label}: ${target}"
    return
  fi
  # Validate the threshold before doing arithmetic on it: a non-integer value
  # would break the arithmetic expansion below and the check could disappear
  # without recording a failure.
  if [[ ! "${MIN_FREE_GIB}" =~ ^[0-9]+$ ]]; then
    log_fail "minimum free GiB must be a non-negative integer: ${MIN_FREE_GIB}"
    return
  fi
  available_kib="$(df -Pk "${target}" | awk 'NR==2 {print $4}')"
  # 10# forces base 10 so a value such as "08" is not parsed as invalid octal.
  min_kib=$((10#${MIN_FREE_GIB} * 1024 * 1024))
  # Unparseable or empty df output falls through to the failure branch.
  if [[ "${available_kib}" =~ ^[0-9]+$ && "${available_kib}" -ge "${min_kib}" ]]; then
    log_pass "disk free ok for ${label}: available_kib=${available_kib} min_gib=${MIN_FREE_GIB}"
  else
    log_fail "disk free below threshold for ${label}: available_kib=${available_kib:-unknown} min_gib=${MIN_FREE_GIB}"
  fi
}
|
||||
|
||||
# Scan runtime config, units, and scripts for credential-like keywords.
# A match is a failure, and so is a scan that could not read every file: a
# grep error must not be reported as "no secrets required".
check_secret_requirements() {
  local scan_status
  local files=(
    "config/polymarket_collector.vps.example.yaml"
    "systemd/polymarket-orderbook-collector.service"
    "systemd/polymarket-orderbook-uploader.service"
    "systemd/polymarket-orderbook-uploader.timer"
    "scripts/run_polymarket_collector_cycle.sh"
    "scripts/upload_archive_rclone.sh"
  )
  # With -q, grep exits 0 as soon as any line matches, even if another file
  # could not be read; 1 means no match; anything else is an error.
  grep -E -i -q -- '(api[_-]?key|private[_-]?key|mnemonic|wallet|password|client[_-]?secret|access[_-]?token|refresh[_-]?token)' "${files[@]}" 2>/dev/null
  scan_status=$?
  case "${scan_status}" in
    0)
      log_fail "secret-like credential requirement found in runtime config, units, or scripts"
      ;;
    1)
      log_pass "no API keys, private keys, mnemonics, wallets, or passwords are required by runtime files"
      ;;
    *)
      log_fail "secret requirement scan could not read one or more runtime files"
      ;;
  esac
  log_info "rclone credentials, if used, must remain machine-local outside the repository"
}
|
||||
|
||||
# Run every check in order; each one logs its own PASS/WARN/FAIL lines and
# updates the global counters.
for preflight_step in \
  check_python \
  check_required_files \
  check_python_compile \
  check_shell_syntax \
  check_systemd_units \
  check_rclone; do
  "${preflight_step}"
done

check_target_dir "data" "${DATA_DIR}"
check_target_dir "manifest" "${MANIFEST_DIR}"
check_target_dir "log" "${LOG_DIR}"

check_disk_free "." "repository"
# The data directory gets its own free-space check only when it was requested
# and exists.
if [[ -n "${DATA_DIR}" && -d "${DATA_DIR}" ]]; then
  check_disk_free "${DATA_DIR}" "data directory"
fi

check_secret_requirements

# Exit 0 only when no check recorded a failure; warnings do not affect it.
printf 'SUMMARY failures=%s warnings=%s\n' "${FAILURES}" "${WARNINGS}"
if [[ "${FAILURES}" -ne 0 ]]; then
  exit 1
fi
exit 0
|
||||
279
scripts/vps_runtime_smoke_check.sh
Executable file
279
scripts/vps_runtime_smoke_check.sh
Executable file
|
|
@ -0,0 +1,279 @@
|
|||
#!/usr/bin/env bash
|
||||
# Unset variables are errors and a pipeline fails if any stage fails.
set -u -o pipefail

# Locations: environment overrides first, built-in defaults otherwise.
APP_DIR="${ORDERBOOKS_APP_DIR:-/opt/orderbooks}"
DATA_DIR="${ORDERBOOKS_DATA_DIR:-/var/lib/orderbooks}"
RAW_DIR="${ORDERBOOKS_OUTPUT_DIR:-${DATA_DIR}/raw_orderbooks}"
MANIFEST_DIR="${ORDERBOOKS_MANIFEST_DIR:-${DATA_DIR}/manifests}"

# systemd units exercised by the smoke check.
COLLECTOR_SERVICE="${ORDERBOOKS_COLLECTOR_SERVICE:-polymarket-orderbook-collector.service}"
UPLOADER_SERVICE="${ORDERBOOKS_UPLOADER_SERVICE:-polymarket-orderbook-uploader.service}"

WAIT_SECONDS="${ORDERBOOKS_SMOKE_WAIT_SECONDS:-900}"

# UTC timestamp that makes the default evidence file name unique per run.
RUN_ID="$(date -u +%Y%m%dT%H%M%SZ)"
EVIDENCE_PATH="${ORDERBOOKS_SMOKE_EVIDENCE_PATH:-${MANIFEST_DIR}/vps_runtime_smoke_${RUN_ID}.json}"
PYTHON_BIN="${ORDERBOOKS_PYTHON:-python3}"
|
||||
|
||||
# Print the command-line help text to stdout.
usage() {
  cat <<'USAGE_TEXT'
Usage: scripts/vps_runtime_smoke_check.sh [options]

Run on the VPS after installing collector/uploader systemd units. The check
records durable JSON evidence, forces one collector service restart, verifies
old raw gzip files still parse and keep their checksum, waits for a later valid
collector cycle, then starts the uploader service and records upload evidence.

Options:
--app-dir DIR App checkout. Default: /opt/orderbooks.
--data-dir DIR Data root. Default: /var/lib/orderbooks.
--raw-dir DIR Raw output dir. Default: DATA_DIR/raw_orderbooks.
--manifest-dir DIR Manifest dir. Default: DATA_DIR/manifests.
--collector-service NAME systemd collector service name.
--uploader-service NAME systemd uploader service name.
--wait-seconds N Max wait for valid cycles. Default: 900.
--evidence-path PATH JSON evidence output path.
--help Show this help.

This script does not delete raw files or manifests. Failures are written to the
evidence JSON and should be preserved for review.
USAGE_TEXT
}
|
||||
|
||||
# Parse command-line options. Every option except --help takes exactly one
# value; a trailing option without a value is reported explicitly instead of
# aborting on an unbound "$2" under `set -u`.
EVIDENCE_PATH_FROM_CLI=0
while [[ $# -gt 0 ]]; do
  case "$1" in
    --help)
      usage
      exit 0
      ;;
    --app-dir|--data-dir|--raw-dir|--manifest-dir|--collector-service|--uploader-service|--wait-seconds|--evidence-path)
      if [[ $# -lt 2 ]]; then
        echo "Missing value for argument: $1" >&2
        usage >&2
        exit 2
      fi
      case "$1" in
        --app-dir) APP_DIR="$2" ;;
        --data-dir)
          # Changing the data root re-derives the raw and manifest dirs,
          # still honoring their environment overrides.
          DATA_DIR="$2"
          RAW_DIR="${ORDERBOOKS_OUTPUT_DIR:-$2/raw_orderbooks}"
          MANIFEST_DIR="${ORDERBOOKS_MANIFEST_DIR:-$2/manifests}"
          ;;
        --raw-dir) RAW_DIR="$2" ;;
        --manifest-dir) MANIFEST_DIR="$2" ;;
        --collector-service) COLLECTOR_SERVICE="$2" ;;
        --uploader-service) UPLOADER_SERVICE="$2" ;;
        --wait-seconds) WAIT_SECONDS="$2" ;;
        --evidence-path)
          EVIDENCE_PATH="$2"
          EVIDENCE_PATH_FROM_CLI=1
          ;;
      esac
      shift 2
      ;;
    *)
      echo "Unknown argument: $1" >&2
      usage >&2
      exit 2
      ;;
  esac
done

# The default evidence path was computed before option parsing. Unless the
# caller chose a path explicitly (option or environment), recompute it so the
# evidence follows the manifest directory selected on the command line.
if [[ "${EVIDENCE_PATH_FROM_CLI}" -eq 0 && -z "${ORDERBOOKS_SMOKE_EVIDENCE_PATH:-}" ]]; then
  EVIDENCE_PATH="${MANIFEST_DIR}/vps_runtime_smoke_${RUN_ID}.json"
fi

mkdir -p "$(dirname "${EVIDENCE_PATH}")"
|
||||
|
||||
PYTHONDONTWRITEBYTECODE=1 "${PYTHON_BIN}" - "$APP_DIR" "$DATA_DIR" "$RAW_DIR" "$MANIFEST_DIR" "$COLLECTOR_SERVICE" "$UPLOADER_SERVICE" "$WAIT_SECONDS" "$EVIDENCE_PATH" <<'PY_SMOKE'
|
||||
import datetime as dt
|
||||
import gzip
|
||||
import hashlib
|
||||
import json
|
||||
import subprocess
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
# Positional arguments supplied by the wrapping shell script, in order.
app_dir = Path(sys.argv[1])
data_dir = Path(sys.argv[2])
raw_dir = Path(sys.argv[3])
manifest_dir = Path(sys.argv[4])
collector_service = sys.argv[5]
uploader_service = sys.argv[6]
wait_seconds = int(sys.argv[7])
evidence_path = Path(sys.argv[8])

# dt.timezone.utc is used instead of dt.UTC, which only exists on Python 3.11+.
started = dt.datetime.now(dt.timezone.utc).replace(microsecond=0)

# Shared accumulators: every command run is appended to `checks`, every
# failure message to `failures`; both are written to the evidence JSON.
checks = []
failures = []
|
||||
|
||||
|
||||
def iso_now():
    """Return the current UTC time as an ISO-8601 string with a 'Z' suffix.

    Microseconds are dropped. dt.timezone.utc is used rather than dt.UTC so
    the function also works on interpreters older than Python 3.11.
    """
    current = dt.datetime.now(dt.timezone.utc).replace(microsecond=0)
    return current.isoformat().replace('+00:00', 'Z')
|
||||
|
||||
|
||||
def run(command):
    """Execute `command`, record the outcome in `checks`, and return the record.

    The record holds the command, its exit code, the last 4000 characters of
    stdout and stderr, and the UTC time at which the record was created.
    """
    completed = subprocess.run(command, text=True, capture_output=True)
    record = dict(
        command=command,
        exit_code=completed.returncode,
        stdout_tail=completed.stdout[-4000:],
        stderr_tail=completed.stderr[-4000:],
        ran_at_utc=iso_now(),
    )
    checks.append(record)
    return record
|
||||
|
||||
|
||||
def sha256(path):
    """Return the hex SHA-256 digest of the file at `path`, read in 1 MiB chunks."""
    hasher = hashlib.sha256()
    with path.open('rb') as stream:
        while block := stream.read(1 << 20):
            hasher.update(block)
    return hasher.hexdigest()
|
||||
|
||||
|
||||
def parse_raw(path):
    """Parse a gzip-compressed JSON-lines file.

    Returns a tuple (row_count, first_row_keys): the number of non-blank lines,
    each of which must be valid JSON, and the sorted keys of the first row
    (an empty list when the file has no rows).
    """
    count = 0
    leading_keys = []
    with gzip.open(path, 'rt', encoding='utf-8') as stream:
        for raw_line in stream:
            if raw_line.strip():
                record = json.loads(raw_line)
                if not count:
                    leading_keys = sorted(record.keys())
                count += 1
    return count, leading_keys
|
||||
|
||||
|
||||
def collector_manifests():
    """Return collector manifest paths in `manifest_dir`, oldest mtime first.

    An empty list is returned when the manifest directory does not exist.
    """
    if manifest_dir.exists():
        found = list(manifest_dir.glob('polymarket_orderbook_collector_*.json'))
        found.sort(key=lambda entry: entry.stat().st_mtime)
        return found
    return []
|
||||
|
||||
|
||||
def validate_collector(path):
    """Load a collector manifest and cross-check each output file it lists.

    Every listed raw file is parsed and hashed; the per-file record compares
    row count and SHA-256 against the manifest and notes whether the file is
    under `raw_dir` and whether its path contains a 'live_sample' component.
    Returns a dict with the manifest path, the manifest, the per-file records,
    and an overall `valid` flag.
    """
    manifest = json.loads(path.read_text(encoding='utf-8'))

    def describe(entry):
        # Parse first, then hash, then stat: the same order for every file.
        raw_path = Path(entry['path'])
        parsed_rows, first_keys = parse_raw(raw_path)
        digest = sha256(raw_path)
        return {
            'path': str(raw_path),
            'bytes': raw_path.stat().st_size,
            'manifest_rows': entry.get('rows'),
            'rows_parsed': parsed_rows,
            'row_count_matches_manifest': parsed_rows == entry.get('rows'),
            'manifest_sha256': entry.get('sha256'),
            'actual_sha256': digest,
            'sha256_matches_manifest': digest == entry.get('sha256'),
            'first_row_keys': first_keys,
            'under_raw_dir': raw_path.resolve().is_relative_to(raw_dir.resolve()),
            'uses_live_sample_path': 'live_sample' in raw_path.parts,
        }

    def file_ok(record):
        return (
            record['rows_parsed'] > 0
            and record['row_count_matches_manifest']
            and record['sha256_matches_manifest']
            and record['under_raw_dir']
            and not record['uses_live_sample_path']
        )

    output_files = [describe(entry) for entry in manifest.get('output_files', [])]

    # The manifest must report a clean PASS with rows written, list at least
    # one output file, and every listed file must pass its own checks.
    valid = (
        manifest.get('gate_status') == 'PASS'
        and manifest.get('rows_written', 0) > 0
        and manifest.get('failure_count') == 0
        and not manifest.get('failures')
        and bool(output_files)
        and all(file_ok(record) for record in output_files)
    )
    return {
        'path': str(path),
        'manifest': manifest,
        'output_files': output_files,
        'valid': valid,
    }
|
||||
|
||||
|
||||
def latest_valid_after(after_mtime=0):
    """Poll for a valid collector manifest modified after `after_mtime`.

    Manifests are examined newest first and the first valid one is returned.
    Polling repeats every 10 seconds until `wait_seconds` have elapsed, after
    which TimeoutError is raised carrying the most recent problem seen.
    """
    give_up_at = time.time() + wait_seconds
    problem = None
    while time.time() <= give_up_at:
        for candidate in collector_manifests()[::-1]:
            if candidate.stat().st_mtime <= after_mtime:
                continue
            try:
                outcome = validate_collector(candidate)
            except Exception as exc:
                # An unreadable or half-written manifest is skipped for this
                # pass; its error is kept for the timeout message.
                problem = str(exc)
                continue
            if outcome['valid']:
                return outcome
            problem = f"latest candidate invalid: {candidate}"
        time.sleep(10)
    raise TimeoutError(problem or f'no valid collector manifest after mtime {after_mtime}')
|
||||
|
||||
|
||||
def latest_upload_after(after_mtime=0):
    """Load the newest upload manifest whose mtime is at or after `after_mtime`.

    Raises FileNotFoundError when no such manifest exists. The returned dict
    contains the manifest path, the manifest, the verified-file count, and a
    `valid` flag that requires a verified upload with zero rclone exit codes
    and at least one verified file.
    """
    by_age = sorted(manifest_dir.glob('upload_archive_*.json'), key=lambda entry: entry.stat().st_mtime)
    recent = [entry for entry in by_age if entry.stat().st_mtime >= after_mtime]
    if not recent:
        raise FileNotFoundError('no upload_archive_*.json manifest found after uploader run')

    newest = recent[-1]
    manifest = json.loads(newest.read_text(encoding='utf-8'))
    # Prefer the explicit counter; fall back to the length of the verified list.
    fallback_count = len(manifest.get('verified_files', []))
    verified_count = manifest.get('counts', {}).get('verified', fallback_count)
    is_valid = (
        manifest.get('operation_status') == 'UPLOAD_VERIFIED'
        and manifest.get('gate_status') == 'PASS'
        and manifest.get('rclone', {}).get('copy_exit_code') == 0
        and manifest.get('rclone', {}).get('check_exit_code') == 0
        and verified_count > 0
    )
    return {
        'path': str(newest),
        'manifest': manifest,
        'verified_count': verified_count,
        'valid': is_valid,
    }
|
||||
|
||||
# Evidence document skeleton. gate_status stays 'ERROR' unless the main flow
# later sets it to PASS or FAIL. `checks` and `failures` are stored by
# reference, so entries appended later appear in the written JSON.
summary = dict(
    schema_name='vps_runtime_smoke_result',
    schema_version=1,
    started_at_utc=started.isoformat().replace('+00:00', 'Z'),
    ended_at_utc=None,
    gate_status='ERROR',
    production_ready=False,
    app_dir=str(app_dir),
    data_dir=str(data_dir),
    raw_dir=str(raw_dir),
    manifest_dir=str(manifest_dir),
    collector_service=collector_service,
    uploader_service=uploader_service,
    wait_seconds=wait_seconds,
    checks=checks,
    failures=failures,
)
|
||||
|
||||
def _collector_systemctl_or_raise(verb, failure_note, error_text):
    # Run `systemctl <verb>` on the collector service; on a non-zero exit,
    # record the failure note and abort the main flow.
    if run(['systemctl', verb, collector_service])['exit_code'] != 0:
        failures.append(failure_note)
        raise RuntimeError(error_text)


# Main flow: check the collector is active, capture a valid pre-restart cycle,
# restart the collector, wait for a newer valid cycle, confirm the old raw
# file is untouched, then run the uploader and inspect its manifest.
try:
    _collector_systemctl_or_raise(
        'is-active',
        'collector service is not active under systemd',
        'collector service not active',
    )

    before = latest_valid_after(0)
    before_mtime = Path(before['path']).stat().st_mtime
    baseline_file = before['output_files'][0]
    baseline_sha = baseline_file['actual_sha256']
    baseline_path = Path(baseline_file['path'])

    _collector_systemctl_or_raise(
        'restart',
        'collector service restart command failed',
        'restart failed',
    )
    _collector_systemctl_or_raise(
        'is-active',
        'collector service is not active after restart',
        'collector inactive after restart',
    )

    after = latest_valid_after(before_mtime)
    rows_now, _ = parse_raw(baseline_path)
    old_file_unchanged = sha256(baseline_path) == baseline_sha and rows_now == baseline_file['rows_parsed']
    if not old_file_unchanged:
        failures.append('raw file from before restart changed or stopped parsing')

    uploader_started_at = time.time()
    if run(['systemctl', 'start', uploader_service])['exit_code'] != 0:
        failures.append('uploader service start failed')
    try:
        # Accept manifests written up to two seconds before the start time.
        upload = latest_upload_after(uploader_started_at - 2)
        if not upload.get('valid'):
            failures.append('uploader did not produce a verified upload manifest with at least one verified file')
    except Exception as exc:
        upload = {'path': None, 'valid': False, 'error': str(exc)}
        failures.append(str(exc))

    # Journal tails are recorded in `checks` by run(); only exit codes are
    # copied into the summary.
    collector_logs = run(['journalctl', '-u', collector_service, '-n', '80', '--no-pager'])
    uploader_logs = run(['journalctl', '-u', uploader_service, '-n', '80', '--no-pager'])

    summary.update({
        'before_restart_collector': before,
        'after_restart_collector': after,
        'old_raw_file_unchanged_after_restart': old_file_unchanged,
        'upload_result': upload,
        'collector_log_check_exit_code': collector_logs['exit_code'],
        'uploader_log_check_exit_code': uploader_logs['exit_code'],
    })
    passed = after['valid'] and old_file_unchanged and upload.get('valid') and not failures
    summary['gate_status'] = 'PASS' if passed else 'FAIL'
except Exception as exc:
    # gate_status keeps its initial 'ERROR' value on this path.
    failures.append(str(exc))
    summary['exception'] = repr(exc)
finally:
    # The evidence file is written whether the flow passed, failed, or raised.
    summary['ended_at_utc'] = iso_now()
    evidence_path.parent.mkdir(parents=True, exist_ok=True)
    evidence_path.write_text(json.dumps(summary, indent=2, sort_keys=True) + '\n', encoding='utf-8')

print(f"SMOKE_EVIDENCE={evidence_path}")
print(f"SMOKE_GATE={summary['gate_status']}")
if summary['gate_status'] != 'PASS':
    sys.exit(1)
|
||||
PY_SMOKE
|
||||
38
systemd/polymarket-orderbook-collector.service
Normal file
38
systemd/polymarket-orderbook-collector.service
Normal file
|
|
@ -0,0 +1,38 @@
|
|||
[Unit]
Description=Polymarket raw order-book collector cycle
Documentation=file:/opt/orderbooks/docs/VPS_DEPLOYMENT.md
# Start only after the network is reported online.
After=network-online.target
Wants=network-online.target
# Start rate limit: at most 20 starts within any 10-minute window.
StartLimitIntervalSec=10min
StartLimitBurst=20

[Service]
Type=simple
User=orderbooks
Group=orderbooks
WorkingDirectory=/opt/orderbooks

# Baseline environment; the optional EnvironmentFile below may override it.
Environment=PYTHONUNBUFFERED=1
Environment=ORDERBOOKS_APP_DIR=/opt/orderbooks
Environment=ORDERBOOKS_DATA_DIR=/var/lib/orderbooks
Environment=ORDERBOOKS_OUTPUT_DIR=/var/lib/orderbooks/raw_orderbooks
Environment=ORDERBOOKS_PYTHON=/opt/orderbooks/.venv/bin/python
Environment=ORDERBOOKS_COLLECTOR_CONFIG=/etc/orderbooks/polymarket_collector.vps.yaml
# The leading '-' makes a missing environment file a non-error.
EnvironmentFile=-/etc/orderbooks/polymarket-orderbook-collector.env

ExecStart=/bin/bash /opt/orderbooks/scripts/run_polymarket_collector_cycle.sh

# The service is restarted 30 seconds after every exit, clean or not.
Restart=always
RestartSec=30s
TimeoutStopSec=90s
KillSignal=SIGTERM
KillMode=control-group

# Logging goes to the journal under a fixed identifier.
StandardOutput=journal
StandardError=journal
SyslogIdentifier=polymarket-orderbook-collector

# Sandboxing: read-only system, no home directories, private /tmp; only the
# state directory under /var/lib/orderbooks is writable.
NoNewPrivileges=true
PrivateTmp=true
ProtectSystem=strict
ProtectHome=true
ReadWritePaths=/var/lib/orderbooks
StateDirectory=orderbooks

[Install]
WantedBy=multi-user.target
|
||||
29
systemd/polymarket-orderbook-uploader.service
Normal file
29
systemd/polymarket-orderbook-uploader.service
Normal file
|
|
@ -0,0 +1,29 @@
|
|||
[Unit]
Description=Orderbooks archive upload via rclone
Documentation=file:/opt/orderbooks/docs/GOOGLE_DRIVE_OFFLOAD.md
# Start only after the network is reported online.
After=network-online.target
Wants=network-online.target

[Service]
# One-shot job: runs the upload script once and exits.
Type=oneshot
User=orderbooks
Group=orderbooks
WorkingDirectory=/opt/orderbooks

# Baseline environment; the optional EnvironmentFile below may override it.
Environment=ORDERBOOKS_UPLOAD_DATA_DIR=/var/lib/orderbooks
Environment=ORDERBOOKS_UPLOAD_MANIFEST_DIR=/var/lib/orderbooks/manifests
Environment=ORDERBOOKS_UPLOAD_RAW_DIR=/var/lib/orderbooks/raw_orderbooks
Environment=ORDERBOOKS_UPLOAD_MIN_AGE_SECONDS=600
Environment=ORDERBOOKS_UPLOAD_RETENTION_DAYS=7
Environment=ORDERBOOKS_RCLONE_BIN=/usr/bin/rclone
# The leading '-' makes a missing environment file a non-error.
EnvironmentFile=-/etc/orderbooks/orderbook-uploader.env

ExecStart=/bin/bash /opt/orderbooks/scripts/upload_archive_rclone.sh --execute

# Logging goes to the journal under a fixed identifier.
StandardOutput=journal
StandardError=journal
SyslogIdentifier=polymarket-orderbook-uploader

# Sandboxing: read-only system, no home directories, private /tmp; only the
# state directory under /var/lib/orderbooks is writable.
NoNewPrivileges=true
PrivateTmp=true
ProtectSystem=strict
ProtectHome=true
ReadWritePaths=/var/lib/orderbooks
StateDirectory=orderbooks
|
||||
|
||||
12
systemd/polymarket-orderbook-uploader.timer
Normal file
12
systemd/polymarket-orderbook-uploader.timer
Normal file
|
|
@ -0,0 +1,12 @@
|
|||
[Unit]
Description=Run orderbooks archive upload periodically
Documentation=file:/opt/orderbooks/docs/GOOGLE_DRIVE_OFFLOAD.md

[Timer]
# Fire hourly, each run delayed by a random amount of up to 10 minutes.
OnCalendar=hourly
RandomizedDelaySec=10min
# Persistent=true: a run missed while the timer was inactive is triggered
# once the timer is activated again.
Persistent=true
Unit=polymarket-orderbook-uploader.service

[Install]
WantedBy=timers.target
|
||||
Loading…
Add table
Reference in a new issue