orderbooks/scripts/normalize_polymarket_orderbooks.py
philipp 284e465588
Some checks failed
deploy / deploy (push) Has been cancelled
Prepare Kubernetes orderbooks deployment
2026-04-18 11:23:28 +02:00

496 lines
19 KiB
Python

#!/usr/bin/env python3
"""Normalize raw Polymarket order-book snapshots from the sample collector.
Checkpoint 5 scope: derive a bounded normalized gzip JSONL sample from the raw
Checkpoint 4 sample. Raw files remain the source of truth; every normalized row
keeps the raw file path and gzip JSONL line number.
"""
from __future__ import annotations
import argparse
import datetime as dt
import gzip
import hashlib
import json
import sys
from decimal import Decimal, InvalidOperation, getcontext
from pathlib import Path
from typing import Any
# Identity of this normalizer (recorded in the run manifest) and of the row
# schema (stamped on every normalized row).
NORMALIZER_NAME = "polymarket_orderbook_normalizer"
NORMALIZER_VERSION = "0.1.0"
SCHEMA_NAME = "normalized_orderbook_snapshot"
SCHEMA_VERSION = 1
# Defaults for the three command-line options defined in parse_args.
DEFAULT_INPUT_MANIFEST = Path("data/manifests/orderbook_collector_sample_manifest.json")
DEFAULT_OUTPUT_DIR = Path("data/normalized_sample")
DEFAULT_MANIFEST_PATH = Path("data/manifests/orderbook_normalization_sample_manifest.json")
# Price distances from the best bid / best ask used for the
# bid_depth_within_<label> / ask_depth_within_<label> row fields; each key is
# used verbatim as the field-name suffix.
CENT_OFFSETS = {
"1c": Decimal("0.01"),
"2c": Decimal("0.02"),
"5c": Decimal("0.05"),
}
# Substrings that scan_for_secret_terms searches for, case-insensitively and
# line by line. Every term is written as adjacent string fragments (implicit
# literal concatenation). NOTE(review): presumably this is because main() also
# scans this script's own source, so a term spelled out on a single line here
# (including inside a comment) would be reported as a match and fail the gate.
# Keep new terms split the same way and never write them out in comments.
SECRET_PATTERNS = (
"set-" "coo" "kie",
"__cf" "_bm",
"cf" "_bm",
"author" "ization",
"private" "_key",
"api" "_secret",
"poly" "_signature",
"poly" "_passphrase",
"poly" "_address",
"bear" "er",
"coo" "kie",
"wallet" " material",
)
# Decimal arithmetic in this module runs with 50 significant digits
# (the standard-library default context uses 28).
getcontext().prec = 50
def utc_now() -> dt.datetime:
return dt.datetime.now(dt.UTC)
def iso_z(value: dt.datetime | None = None) -> str:
value = value or utc_now()
return value.astimezone(dt.UTC).replace(microsecond=0).isoformat().replace("+00:00", "Z")
def compact_timestamp(value: dt.datetime | None = None) -> str:
value = value or utc_now()
return value.astimezone(dt.UTC).strftime("%Y%m%dT%H%M%SZ")
def sha256_file(path: Path) -> str:
    """Return the hexadecimal SHA-256 digest of the file at ``path``.

    The file is read in 1 MiB pieces so it never has to fit in memory.
    """
    piece_size = 1024 * 1024
    hasher = hashlib.sha256()
    with path.open("rb") as stream:
        while piece := stream.read(piece_size):
            hasher.update(piece)
    return hasher.hexdigest()
def decimal_from_raw(value: Any, field_name: str) -> Decimal:
    """Parse a raw string value into a finite ``Decimal``.

    ``field_name`` is only used to label error messages.

    Raises:
        ValueError: if ``value`` is not a ``str``, cannot be parsed as a
            decimal, or parses to a non-finite value (NaN or infinity).
    """
    if isinstance(value, str):
        try:
            number = Decimal(value)
        except InvalidOperation as exc:
            raise ValueError(f"{field_name} is not a decimal: {value!r}") from exc
        if number.is_finite():
            return number
        raise ValueError(f"{field_name} is not finite: {value!r}")
    raise ValueError(f"{field_name} is not a string: {value!r}")
def decimal_to_json(value: Decimal | None) -> str | None:
    """Render a ``Decimal`` as a plain fixed-point string for JSON output.

    ``None`` passes through unchanged. Any zero (including negative zero) is
    rendered as ``"0"``. Other values are normalized, which drops trailing
    zeros, and formatted without exponent notation.
    """
    if value is None:
        return None
    return "0" if value == 0 else f"{value.normalize():f}"
def load_json(path: Path) -> dict[str, Any]:
    """Read a UTF-8 JSON file and return its top-level object.

    Raises:
        ValueError: if the document's top-level value is not a JSON object.
    """
    document = json.loads(path.read_text(encoding="utf-8"))
    if isinstance(document, dict):
        return document
    raise ValueError(f"{path} did not contain a JSON object")
def resolve_repo_path(path_text: str) -> Path:
    """Turn a path string into an absolute ``Path``.

    Absolute inputs are returned as-is; relative inputs are anchored at the
    current working directory.
    """
    candidate = Path(path_text)
    return candidate if candidate.is_absolute() else Path.cwd() / candidate
def normalize_side(levels: Any, side_name: str) -> list[tuple[Decimal, Decimal]]:
    """Validate one side of a raw book and return it as ``(price, size)`` pairs.

    ``side_name`` (for example ``"bids"``) is only used to label error
    messages. The input order of the levels is preserved.

    Raises:
        ValueError: if ``levels`` is not a list, a level is not an object, a
            price or size fails ``decimal_from_raw``, or a size is negative.
    """
    if not isinstance(levels, list):
        raise ValueError(f"raw.{side_name} is not a list")

    def parse_level(position: int, entry: Any) -> tuple[Decimal, Decimal]:
        # Shared prefix so every message points at the offending level.
        where = f"raw.{side_name}[{position}]"
        if not isinstance(entry, dict):
            raise ValueError(f"{where} is not an object")
        price = decimal_from_raw(entry.get("price"), f"{where}.price")
        size = decimal_from_raw(entry.get("size"), f"{where}.size")
        if size < 0:
            raise ValueError(f"{where}.size is negative")
        return price, size

    return [parse_level(position, entry) for position, entry in enumerate(levels)]
def sum_sizes(levels: list[tuple[Decimal, Decimal]]) -> Decimal:
    """Return the total size across ``(price, size)`` levels; zero when empty."""
    total = Decimal("0")
    for _price, size in levels:
        total += size
    return total
def normalize_raw_row(raw_row: dict[str, Any], raw_file: str, raw_line_number: int) -> dict[str, Any]:
    """Derive one normalized snapshot row from one raw collector row.

    The raw row must carry ``raw`` (with ``bids`` and ``asks``), ``market``
    and ``collection`` objects. The result holds the schema identity, market
    identity fields, best bid/ask, spread, midpoint, total depth per side and
    depth within each ``CENT_OFFSETS`` distance of the best price, plus the
    ``raw_file`` / ``raw_line_number`` provenance. All decimals are emitted
    as strings via ``decimal_to_json``; price-derived values are ``None``
    when a side is empty.

    Raises:
        ValueError: if a required section is not an object or a book side
            fails ``normalize_side`` validation.
    """
    raw_book, market, collection = (raw_row.get(key) for key in ("raw", "market", "collection"))
    for key, section in (("raw", raw_book), ("market", market), ("collection", collection)):
        if not isinstance(section, dict):
            raise ValueError(f"{key} is not an object")

    bids = normalize_side(raw_book.get("bids"), "bids")
    asks = normalize_side(raw_book.get("asks"), "asks")

    bid_prices = [price for price, _ in bids]
    ask_prices = [price for price, _ in asks]
    best_bid = max(bid_prices) if bid_prices else None
    best_ask = min(ask_prices) if ask_prices else None

    if best_bid is None or best_ask is None:
        # Spread and midpoint are only defined for a two-sided book.
        spread = midpoint = None
    else:
        spread = best_ask - best_bid
        midpoint = (best_bid + best_ask) / Decimal("2")

    row: dict[str, Any] = {
        "schema_name": SCHEMA_NAME,
        "schema_version": SCHEMA_VERSION,
    }
    for field in ("market_name", "market_slug", "condition_id", "token_id", "outcome"):
        row[field] = market.get(field)
    row["collected_at_utc"] = collection.get("collected_at_utc")
    row["best_bid"] = decimal_to_json(best_bid)
    row["best_ask"] = decimal_to_json(best_ask)
    row["spread"] = decimal_to_json(spread)
    row["midpoint"] = decimal_to_json(midpoint)
    row["bid_depth_total"] = decimal_to_json(sum_sizes(bids))
    row["ask_depth_total"] = decimal_to_json(sum_sizes(asks))
    row["raw_file"] = raw_file
    row["raw_line_number"] = raw_line_number

    for label, offset in CENT_OFFSETS.items():
        if best_bid is None:
            near_bid = Decimal("0")
        else:
            floor = best_bid - offset
            near_bid = sum_sizes([level for level in bids if level[0] >= floor])
        if best_ask is None:
            near_ask = Decimal("0")
        else:
            ceiling = best_ask + offset
            near_ask = sum_sizes([level for level in asks if level[0] <= ceiling])
        row[f"bid_depth_within_{label}"] = decimal_to_json(near_bid)
        row[f"ask_depth_within_{label}"] = decimal_to_json(near_ask)
    return row
def summarize_output(path: Path, rows: int) -> dict[str, Any]:
    """Describe a written output file for the run manifest.

    Args:
        path: Location of the output file; it must exist.
        rows: Number of rows the caller wrote to the file.

    Returns:
        A dict with the display ``path``, ``rows``, size in ``bytes``, the
        file's ``sha256`` and a constant ``status`` of ``"valid"``.

    The display path is made relative to the current working directory when
    the file is an absolute path inside it. An absolute path outside the
    working directory is reported as-is rather than raising ``ValueError``
    from ``relative_to``, so an absolute output directory elsewhere on disk
    no longer aborts the run after the output has been written.
    """
    cwd = Path.cwd()
    if path.is_absolute() and path.is_relative_to(cwd):
        shown = path.relative_to(cwd)
    else:
        shown = path
    return {
        "path": str(shown),
        "rows": rows,
        "bytes": path.stat().st_size,
        "sha256": sha256_file(path),
        "status": "valid",
    }
def build_input_file_summary(manifest: dict[str, Any]) -> list[dict[str, Any]]:
    """Check the raw files listed in a collector manifest and summarize them.

    For every ``output_files`` entry the file is located, hashed, and
    compared with the checksum recorded in the manifest. A checksum mismatch
    does not raise; it is reported through ``checksum_match`` and a
    ``status`` of ``"invalid"``.

    Raises:
        ValueError: if ``output_files`` is missing, empty or not a list, or
            an entry is not an object or lacks a non-empty string ``path``.
        FileNotFoundError: if a listed file does not exist.
    """
    entries = manifest.get("output_files")
    if not (isinstance(entries, list) and entries):
        raise ValueError("input manifest has no output_files")

    def summarize(entry: Any) -> dict[str, Any]:
        if not isinstance(entry, dict):
            raise ValueError("input manifest output_files entry is not an object")
        path_text = entry.get("path")
        if not (isinstance(path_text, str) and path_text):
            raise ValueError("input manifest output_files entry lacks path")
        location = resolve_repo_path(path_text)
        if not location.exists():
            raise FileNotFoundError(location)
        actual_sha = sha256_file(location)
        expected_sha = entry.get("sha256")
        matches = expected_sha == actual_sha
        return {
            "path": path_text,
            "rows_expected": entry.get("rows"),
            "bytes": location.stat().st_size,
            "sha256": actual_sha,
            "input_manifest_sha256": expected_sha,
            "checksum_match": matches,
            "status": "valid" if matches else "invalid",
        }

    return [summarize(entry) for entry in entries]
def read_and_normalize(
    input_files: list[dict[str, Any]],
    output_path: Path,
) -> tuple[int, int, list[dict[str, Any]], dict[str, Any]]:
    """Normalize every raw gzip JSONL row into one gzip JSONL output file.

    Each entry of ``input_files`` must carry a ``path``. A missing raw file
    is recorded as an error and skipped. Every raw line is counted, parsed,
    normalized and written as compact JSON with sorted keys. Any exception
    raised while handling a line is recorded with its file and line number
    and processing continues with the next line.

    Returns:
        ``(raw_rows_read, normalized_rows_written, errors, sanity)`` where
        ``sanity`` holds the per-run sanity flags, the sorted outcomes seen
        and whether both "Up" and "Down" occurred.
    """
    depth_fields = (
        "bid_depth_total",
        "ask_depth_total",
        "bid_depth_within_1c",
        "ask_depth_within_1c",
        "bid_depth_within_2c",
        "ask_depth_within_2c",
        "bid_depth_within_5c",
        "ask_depth_within_5c",
    )
    rows_read = 0
    rows_written = 0
    failures: list[dict[str, Any]] = []
    outcomes: set[str] = set()
    sanity: dict[str, Any] = {
        "raw_file_refs_present": True,
        "raw_files_exist": True,
        "spread_non_negative": True,
        "midpoint_between_bid_ask": True,
        "depth_totals_non_negative": True,
        "outcomes_seen": [],
        "gzip_jsonl_parseable": True,
        "row_count_match": None,
    }

    def as_decimal(text: str | None) -> Decimal | None:
        return None if text is None else Decimal(text)

    def check_row(row: dict[str, Any]) -> None:
        # Flags only ever flip from True to False; one bad row marks the run.
        if not (row.get("raw_file") and row.get("raw_line_number")):
            sanity["raw_file_refs_present"] = False
        if not resolve_repo_path(str(row["raw_file"])).exists():
            sanity["raw_files_exist"] = False
        outcome = row.get("outcome")
        if isinstance(outcome, str):
            outcomes.add(outcome)
        best_bid = as_decimal(row["best_bid"])
        best_ask = as_decimal(row["best_ask"])
        spread = as_decimal(row["spread"])
        midpoint = as_decimal(row["midpoint"])
        if best_bid is not None and best_ask is not None:
            if spread is None or spread < 0:
                sanity["spread_non_negative"] = False
            if midpoint is None or midpoint < best_bid or midpoint > best_ask:
                sanity["midpoint_between_bid_ask"] = False
        for field in depth_fields:
            if Decimal(row[field]) < 0:
                sanity["depth_totals_non_negative"] = False

    output_path.parent.mkdir(parents=True, exist_ok=True)
    with gzip.open(output_path, "wt", encoding="utf-8", compresslevel=9) as sink:
        for entry in input_files:
            raw_file = entry["path"]
            raw_path = resolve_repo_path(raw_file)
            if not raw_path.exists():
                sanity["raw_files_exist"] = False
                failures.append({"raw_file": raw_file, "error": "raw file missing"})
                continue
            with gzip.open(raw_path, "rt", encoding="utf-8") as source:
                for raw_line_number, line in enumerate(source, 1):
                    rows_read += 1
                    try:
                        row = normalize_raw_row(json.loads(line), raw_file, raw_line_number)
                        sink.write(json.dumps(row, sort_keys=True, separators=(",", ":")) + "\n")
                        rows_written += 1
                        check_row(row)
                    except Exception as exc:  # noqa: BLE001 - preserve row-level failure evidence.
                        failures.append(
                            {
                                "raw_file": raw_file,
                                "raw_line_number": raw_line_number,
                                "error": str(exc),
                            }
                        )

    sanity["outcomes_seen"] = sorted(outcomes)
    sanity["has_up_and_down"] = {"Up", "Down"}.issubset(outcomes)
    sanity["row_count_match"] = rows_read == rows_written + len(failures)
    return rows_read, rows_written, failures, sanity
def validate_output_gzip_jsonl(path: Path) -> tuple[bool, int, list[str]]:
    """Re-read a gzip JSONL file and confirm every line parses as JSON.

    Returns:
        ``(ok, parsed_rows, errors)``. Reading stops at the first failure,
        so ``parsed_rows`` counts the lines parsed before it and ``errors``
        holds at most one message. Nothing is raised; failures are reported
        through the return value.
    """
    problems: list[str] = []
    rows_ok = 0
    try:
        with gzip.open(path, "rt", encoding="utf-8") as stream:
            for text in stream:
                json.loads(text)
                rows_ok += 1
    except Exception as exc:  # noqa: BLE001 - validation result belongs in manifest.
        problems.append(str(exc))
    return len(problems) == 0, rows_ok, problems
def scan_for_secret_terms(paths: list[Path]) -> dict[str, Any]:
    """Scan files line by line for any of the ``SECRET_PATTERNS`` terms.

    Matching is a case-insensitive substring test. Files that do not exist
    are skipped; files whose suffix is ``.gz`` are read through gzip, all
    others as plain text. Undecodable bytes are replaced rather than raising.

    Args:
        paths: Files to scan.

    Returns:
        A dict with ``passed`` (true when nothing matched),
        ``checked_term_count`` and ``matches``. Each match records the file
        path, the 1-based line number and the 1-based index of the first
        matching term; the matched text itself is not recorded. At most one
        match is reported per line.

    The reported path is relative to the current working directory when the
    file is an absolute path inside it. An absolute path outside the working
    directory is reported as-is instead of raising ``ValueError`` from
    ``relative_to``, which previously aborted the scan exactly when a match
    was found in such a file.
    """
    needles = tuple(pattern.lower() for pattern in SECRET_PATTERNS)
    cwd = Path.cwd()
    matches: list[dict[str, Any]] = []
    for path in paths:
        if not path.exists():
            continue
        if path.is_absolute() and path.is_relative_to(cwd):
            shown = str(path.relative_to(cwd))
        else:
            shown = str(path)
        opener = gzip.open if path.suffix == ".gz" else open
        with opener(path, "rt", encoding="utf-8", errors="replace") as handle:  # type: ignore[arg-type]
            for line_number, line in enumerate(handle, 1):
                lowered = line.lower()
                for term_index, needle in enumerate(needles, 1):
                    if needle in lowered:
                        matches.append(
                            {
                                "path": shown,
                                "line_number": line_number,
                                "term_index": term_index,
                            }
                        )
                        # One report per line is enough; skip remaining terms.
                        break
    return {
        "passed": not matches,
        "checked_term_count": len(SECRET_PATTERNS),
        "matches": matches,
    }
def parse_args(argv: list[str]) -> argparse.Namespace:
    """Parse the command-line options of the normalizer.

    All three options take a path and fall back to the module-level
    defaults: ``--input-manifest``, ``--output-dir`` and ``--manifest-path``.
    """
    parser = argparse.ArgumentParser(
        description="Normalize Checkpoint 4 raw Polymarket order-book snapshots.",
    )
    path_options = (
        ("--input-manifest", DEFAULT_INPUT_MANIFEST, "Raw collector manifest path"),
        ("--output-dir", DEFAULT_OUTPUT_DIR, "Normalized sample base directory"),
        ("--manifest-path", DEFAULT_MANIFEST_PATH, "Normalization manifest path"),
    )
    for flag, default, label in path_options:
        parser.add_argument(
            flag,
            type=Path,
            default=default,
            help=f"{label}. Default: {default}",
        )
    return parser.parse_args(argv)
def main(argv: list[str]) -> int:
    """Run one normalization pass and write its manifest.

    Steps: load the collector manifest and check the listed raw files,
    normalize them into a run-stamped gzip JSONL file, re-validate that file,
    scan this script and the output for the ``SECRET_PATTERNS`` terms, then
    write the normalization manifest and print a short JSON summary.

    Returns:
        ``0`` when every gate check passed and at least one row was written,
        ``1`` otherwise.
    """
    args = parse_args(argv)
    started = utc_now()
    input_manifest = load_json(args.input_manifest)
    input_files = build_input_file_summary(input_manifest)

    # The run id doubles as the output directory name and the file-name stamp.
    run_id = compact_timestamp(started)
    run_dir = args.output_dir / "polymarket" / "orderbooks" / run_id
    output_path = run_dir / f"polymarket_orderbooks_normalized_{run_id}.jsonl.gz"

    raw_rows_read, normalized_rows_written, row_errors, sanity = read_and_normalize(input_files, output_path)
    gzip_ok, gzip_rows, gzip_errors = validate_output_gzip_jsonl(output_path)
    output_summary = summarize_output(output_path, normalized_rows_written)

    counts_equal = normalized_rows_written == raw_rows_read
    sanity["output_row_count_equals_raw_input_row_count"] = counts_equal and not row_errors
    sanity["gzip_jsonl_decompresses_and_parses"] = gzip_ok
    sanity["gzip_jsonl_rows_parsed"] = gzip_rows
    sanity["gzip_jsonl_errors"] = gzip_errors
    sanity["manifest_checksum_matches_output"] = output_summary["sha256"] == sha256_file(output_path)
    input_checksums_ok = all(entry["checksum_match"] for entry in input_files)
    sanity["all_input_file_checksums_match"] = input_checksums_ok

    secret_scan = scan_for_secret_terms([Path(__file__), output_path])
    sanity["checkpoint5_secret_scan_passed"] = secret_scan["passed"]

    gate_checks = (
        counts_equal,
        not row_errors,
        sanity["raw_file_refs_present"],
        sanity["raw_files_exist"],
        sanity["spread_non_negative"],
        sanity["midpoint_between_bid_ask"],
        sanity["depth_totals_non_negative"],
        sanity["has_up_and_down"],
        gzip_ok,
        sanity["manifest_checksum_matches_output"],
        secret_scan["passed"],
        input_checksums_ok,
    )
    # An empty output never passes, even if every individual check holds.
    if all(gate_checks) and normalized_rows_written > 0:
        gate_status = "PASS"
    else:
        gate_status = "FAIL"

    ended = utc_now()
    skipped_rows = len(row_errors)
    manifest = {
        "schema_name": "orderbook_normalization_sample_manifest",
        "schema_version": 1,
        "checkpoint_id": 5,
        "checkpoint_name": "Normalized Snapshot Extract",
        "normalizer": {
            "name": NORMALIZER_NAME,
            "version": NORMALIZER_VERSION,
        },
        "started_at_utc": iso_z(started),
        "ended_at_utc": iso_z(ended),
        "run_duration_seconds": round((ended - started).total_seconds(), 3),
        "command": "scripts/normalize_polymarket_orderbooks.py",
        "input_manifest": {
            "path": str(args.input_manifest),
            "sha256": sha256_file(args.input_manifest),
            "collector_manifest_schema_name": input_manifest.get("schema_name"),
            "collector_gate_status": input_manifest.get("gate_status"),
        },
        "input_files": input_files,
        "output_files": [output_summary],
        "raw_rows_read": raw_rows_read,
        "normalized_rows_written": normalized_rows_written,
        "skipped_rows": skipped_rows,
        "error_rows": row_errors,
        "numeric_encoding": "Exact decimal values are emitted as JSON strings; missing price-derived values are null.",
        "sanity_checks": sanity,
        "secret_scan": secret_scan,
        "warnings": [],
        "known_gaps": [
            "This is a derived sample extract only; raw gzip JSONL remains the source of truth.",
            "No upload, daemon runtime, systemd unit, dashboard, database, strategy, backtest, or trading behavior is included.",
            "The sample proves normalization logic on one bounded raw run, not long-run schema stability.",
        ],
        "fake_progress_risk": "A clean normalized sample can hide raw collection gaps and endpoint schema drift; every row is therefore traceable to raw_file and raw_line_number, and reliability remains gated on later soak testing.",
        "next_step": "Checkpoint 6 should package the raw collector for a VPS runtime, or the orchestrator can request review of this normalized sample first.",
        "gate_status": gate_status,
    }
    args.manifest_path.parent.mkdir(parents=True, exist_ok=True)
    manifest_text = json.dumps(manifest, indent=2, sort_keys=True) + "\n"
    args.manifest_path.write_text(manifest_text, encoding="utf-8")

    console_summary = {
        "gate_status": gate_status,
        "manifest_path": str(args.manifest_path),
        "output_path": str(output_path),
        "raw_rows_read": raw_rows_read,
        "normalized_rows_written": normalized_rows_written,
        "skipped_rows": skipped_rows,
        "sha256": output_summary["sha256"],
    }
    print(json.dumps(console_summary, indent=2, sort_keys=True))
    return 0 if gate_status == "PASS" else 1
# Script entry point: the process exit status is main()'s return value.
if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))