#!/usr/bin/env python3 """Normalize raw Polymarket order-book snapshots from the sample collector. Checkpoint 5 scope: derive a bounded normalized gzip JSONL sample from the raw Checkpoint 4 sample. Raw files remain the source of truth; every normalized row keeps the raw file path and gzip JSONL line number. """ from __future__ import annotations import argparse import datetime as dt import gzip import hashlib import json import sys from decimal import Decimal, InvalidOperation, getcontext from pathlib import Path from typing import Any NORMALIZER_NAME = "polymarket_orderbook_normalizer" NORMALIZER_VERSION = "0.1.0" SCHEMA_NAME = "normalized_orderbook_snapshot" SCHEMA_VERSION = 1 DEFAULT_INPUT_MANIFEST = Path("data/manifests/orderbook_collector_sample_manifest.json") DEFAULT_OUTPUT_DIR = Path("data/normalized_sample") DEFAULT_MANIFEST_PATH = Path("data/manifests/orderbook_normalization_sample_manifest.json") CENT_OFFSETS = { "1c": Decimal("0.01"), "2c": Decimal("0.02"), "5c": Decimal("0.05"), } SECRET_PATTERNS = ( "set-" "coo" "kie", "__cf" "_bm", "cf" "_bm", "author" "ization", "private" "_key", "api" "_secret", "poly" "_signature", "poly" "_passphrase", "poly" "_address", "bear" "er", "coo" "kie", "wallet" " material", ) getcontext().prec = 50 def utc_now() -> dt.datetime: return dt.datetime.now(dt.UTC) def iso_z(value: dt.datetime | None = None) -> str: value = value or utc_now() return value.astimezone(dt.UTC).replace(microsecond=0).isoformat().replace("+00:00", "Z") def compact_timestamp(value: dt.datetime | None = None) -> str: value = value or utc_now() return value.astimezone(dt.UTC).strftime("%Y%m%dT%H%M%SZ") def sha256_file(path: Path) -> str: digest = hashlib.sha256() with path.open("rb") as handle: for chunk in iter(lambda: handle.read(1024 * 1024), b""): digest.update(chunk) return digest.hexdigest() def decimal_from_raw(value: Any, field_name: str) -> Decimal: if not isinstance(value, str): raise ValueError(f"{field_name} is not a string: {value!r}") try: parsed = Decimal(value) except InvalidOperation as exc: raise ValueError(f"{field_name} is not a decimal: {value!r}") from exc if not parsed.is_finite(): raise ValueError(f"{field_name} is not finite: {value!r}") return parsed def decimal_to_json(value: Decimal | None) -> str | None: if value is None: return None if value == 0: return "0" return format(value.normalize(), "f") def load_json(path: Path) -> dict[str, Any]: with path.open("r", encoding="utf-8") as handle: data = json.load(handle) if not isinstance(data, dict): raise ValueError(f"{path} did not contain a JSON object") return data def resolve_repo_path(path_text: str) -> Path: path = Path(path_text) if path.is_absolute(): return path return Path.cwd() / path def normalize_side(levels: Any, side_name: str) -> list[tuple[Decimal, Decimal]]: if not isinstance(levels, list): raise ValueError(f"raw.{side_name} is not a list") normalized: list[tuple[Decimal, Decimal]] = [] for index, level in enumerate(levels): if not isinstance(level, dict): raise ValueError(f"raw.{side_name}[{index}] is not an object") price = decimal_from_raw(level.get("price"), f"raw.{side_name}[{index}].price") size = decimal_from_raw(level.get("size"), f"raw.{side_name}[{index}].size") if size < 0: raise ValueError(f"raw.{side_name}[{index}].size is negative") normalized.append((price, size)) return normalized def sum_sizes(levels: list[tuple[Decimal, Decimal]]) -> Decimal: return sum((size for _, size in levels), Decimal("0")) def normalize_raw_row(raw_row: dict[str, Any], raw_file: str, raw_line_number: int) -> dict[str, Any]: raw_book = raw_row.get("raw") market = raw_row.get("market") collection = raw_row.get("collection") if not isinstance(raw_book, dict): raise ValueError("raw is not an object") if not isinstance(market, dict): raise ValueError("market is not an object") if not isinstance(collection, dict): raise ValueError("collection is not an object") bids = normalize_side(raw_book.get("bids"), "bids") asks = normalize_side(raw_book.get("asks"), "asks") best_bid = max((price for price, _ in bids), default=None) best_ask = min((price for price, _ in asks), default=None) spread = None midpoint = None if best_bid is not None and best_ask is not None: spread = best_ask - best_bid midpoint = (best_bid + best_ask) / Decimal("2") bid_depth_total = sum_sizes(bids) ask_depth_total = sum_sizes(asks) row: dict[str, Any] = { "schema_name": SCHEMA_NAME, "schema_version": SCHEMA_VERSION, "market_name": market.get("market_name"), "market_slug": market.get("market_slug"), "condition_id": market.get("condition_id"), "token_id": market.get("token_id"), "outcome": market.get("outcome"), "collected_at_utc": collection.get("collected_at_utc"), "best_bid": decimal_to_json(best_bid), "best_ask": decimal_to_json(best_ask), "spread": decimal_to_json(spread), "midpoint": decimal_to_json(midpoint), "bid_depth_total": decimal_to_json(bid_depth_total), "ask_depth_total": decimal_to_json(ask_depth_total), "raw_file": raw_file, "raw_line_number": raw_line_number, } for label, offset in CENT_OFFSETS.items(): bid_depth = Decimal("0") if best_bid is not None: threshold = best_bid - offset bid_depth = sum((size for price, size in bids if price >= threshold), Decimal("0")) ask_depth = Decimal("0") if best_ask is not None: threshold = best_ask + offset ask_depth = sum((size for price, size in asks if price <= threshold), Decimal("0")) row[f"bid_depth_within_{label}"] = decimal_to_json(bid_depth) row[f"ask_depth_within_{label}"] = decimal_to_json(ask_depth) return row def summarize_output(path: Path, rows: int) -> dict[str, Any]: return { "path": str(path.relative_to(Path.cwd()) if path.is_absolute() else path), "rows": rows, "bytes": path.stat().st_size, "sha256": sha256_file(path), "status": "valid", } def build_input_file_summary(manifest: dict[str, Any]) -> list[dict[str, Any]]: files = manifest.get("output_files") if not isinstance(files, list) or not files: raise ValueError("input manifest has no output_files") summaries: list[dict[str, Any]] = [] for file_entry in files: if not isinstance(file_entry, dict): raise ValueError("input manifest output_files entry is not an object") path_text = file_entry.get("path") if not isinstance(path_text, str) or not path_text: raise ValueError("input manifest output_files entry lacks path") path = resolve_repo_path(path_text) if not path.exists(): raise FileNotFoundError(path) actual_sha = sha256_file(path) expected_sha = file_entry.get("sha256") checksum_match = expected_sha == actual_sha summaries.append( { "path": path_text, "rows_expected": file_entry.get("rows"), "bytes": path.stat().st_size, "sha256": actual_sha, "input_manifest_sha256": expected_sha, "checksum_match": checksum_match, "status": "valid" if checksum_match else "invalid", } ) return summaries def read_and_normalize( input_files: list[dict[str, Any]], output_path: Path, ) -> tuple[int, int, list[dict[str, Any]], dict[str, Any]]: raw_rows_read = 0 normalized_rows_written = 0 errors: list[dict[str, Any]] = [] sanity = { "raw_file_refs_present": True, "raw_files_exist": True, "spread_non_negative": True, "midpoint_between_bid_ask": True, "depth_totals_non_negative": True, "outcomes_seen": [], "gzip_jsonl_parseable": True, "row_count_match": None, } outcomes_seen: set[str] = set() output_path.parent.mkdir(parents=True, exist_ok=True) with gzip.open(output_path, "wt", encoding="utf-8", compresslevel=9) as output: for file_entry in input_files: raw_file = file_entry["path"] raw_path = resolve_repo_path(raw_file) if not raw_path.exists(): sanity["raw_files_exist"] = False errors.append({"raw_file": raw_file, "error": "raw file missing"}) continue with gzip.open(raw_path, "rt", encoding="utf-8") as raw_handle: for raw_line_number, line in enumerate(raw_handle, 1): raw_rows_read += 1 try: raw_row = json.loads(line) normalized = normalize_raw_row(raw_row, raw_file, raw_line_number) output.write(json.dumps(normalized, sort_keys=True, separators=(",", ":")) + "\n") normalized_rows_written += 1 if not normalized.get("raw_file") or not normalized.get("raw_line_number"): sanity["raw_file_refs_present"] = False if not resolve_repo_path(str(normalized["raw_file"])).exists(): sanity["raw_files_exist"] = False outcome = normalized.get("outcome") if isinstance(outcome, str): outcomes_seen.add(outcome) best_bid = Decimal(normalized["best_bid"]) if normalized["best_bid"] is not None else None best_ask = Decimal(normalized["best_ask"]) if normalized["best_ask"] is not None else None spread = Decimal(normalized["spread"]) if normalized["spread"] is not None else None midpoint = Decimal(normalized["midpoint"]) if normalized["midpoint"] is not None else None if best_bid is not None and best_ask is not None: if spread is None or spread < 0: sanity["spread_non_negative"] = False if midpoint is None or midpoint < best_bid or midpoint > best_ask: sanity["midpoint_between_bid_ask"] = False depth_fields = [ "bid_depth_total", "ask_depth_total", "bid_depth_within_1c", "ask_depth_within_1c", "bid_depth_within_2c", "ask_depth_within_2c", "bid_depth_within_5c", "ask_depth_within_5c", ] for field in depth_fields: if Decimal(normalized[field]) < 0: sanity["depth_totals_non_negative"] = False except Exception as exc: # noqa: BLE001 - preserve row-level failure evidence. errors.append( { "raw_file": raw_file, "raw_line_number": raw_line_number, "error": str(exc), } ) sanity["outcomes_seen"] = sorted(outcomes_seen) sanity["has_up_and_down"] = {"Up", "Down"}.issubset(outcomes_seen) sanity["row_count_match"] = raw_rows_read == normalized_rows_written + len(errors) return raw_rows_read, normalized_rows_written, errors, sanity def validate_output_gzip_jsonl(path: Path) -> tuple[bool, int, list[str]]: errors: list[str] = [] parsed_rows = 0 try: with gzip.open(path, "rt", encoding="utf-8") as handle: for line_number, line in enumerate(handle, 1): json.loads(line) parsed_rows = line_number except Exception as exc: # noqa: BLE001 - validation result belongs in manifest. errors.append(str(exc)) return not errors, parsed_rows, errors def scan_for_secret_terms(paths: list[Path]) -> dict[str, Any]: matches: list[dict[str, Any]] = [] lowered_patterns = tuple(pattern.lower() for pattern in SECRET_PATTERNS) for path in paths: if not path.exists(): continue if path.suffix == ".gz": opener = gzip.open else: opener = open with opener(path, "rt", encoding="utf-8", errors="replace") as handle: # type: ignore[arg-type] for line_number, line in enumerate(handle, 1): lower = line.lower() for pattern_index, pattern in enumerate(lowered_patterns, 1): if pattern in lower: matches.append( { "path": str(path.relative_to(Path.cwd()) if path.is_absolute() else path), "line_number": line_number, "term_index": pattern_index, } ) break return { "passed": not matches, "checked_term_count": len(SECRET_PATTERNS), "matches": matches, } def parse_args(argv: list[str]) -> argparse.Namespace: parser = argparse.ArgumentParser( description="Normalize Checkpoint 4 raw Polymarket order-book snapshots.", ) parser.add_argument( "--input-manifest", type=Path, default=DEFAULT_INPUT_MANIFEST, help=f"Raw collector manifest path. Default: {DEFAULT_INPUT_MANIFEST}", ) parser.add_argument( "--output-dir", type=Path, default=DEFAULT_OUTPUT_DIR, help=f"Normalized sample base directory. Default: {DEFAULT_OUTPUT_DIR}", ) parser.add_argument( "--manifest-path", type=Path, default=DEFAULT_MANIFEST_PATH, help=f"Normalization manifest path. Default: {DEFAULT_MANIFEST_PATH}", ) return parser.parse_args(argv) def main(argv: list[str]) -> int: args = parse_args(argv) started = utc_now() input_manifest = load_json(args.input_manifest) input_files = build_input_file_summary(input_manifest) run_id = compact_timestamp(started) output_path = ( args.output_dir / "polymarket" / "orderbooks" / run_id / f"polymarket_orderbooks_normalized_{run_id}.jsonl.gz" ) raw_rows_read, normalized_rows_written, row_errors, sanity = read_and_normalize(input_files, output_path) gzip_ok, gzip_rows, gzip_errors = validate_output_gzip_jsonl(output_path) output_summary = summarize_output(output_path, normalized_rows_written) sanity.update( { "output_row_count_equals_raw_input_row_count": normalized_rows_written == raw_rows_read if not row_errors else False, "gzip_jsonl_decompresses_and_parses": gzip_ok, "gzip_jsonl_rows_parsed": gzip_rows, "gzip_jsonl_errors": gzip_errors, "manifest_checksum_matches_output": output_summary["sha256"] == sha256_file(output_path), "all_input_file_checksums_match": all(file_entry["checksum_match"] for file_entry in input_files), } ) secret_scan = scan_for_secret_terms([Path(__file__), output_path]) sanity["checkpoint5_secret_scan_passed"] = secret_scan["passed"] gate_checks = [ normalized_rows_written == raw_rows_read, not row_errors, sanity["raw_file_refs_present"], sanity["raw_files_exist"], sanity["spread_non_negative"], sanity["midpoint_between_bid_ask"], sanity["depth_totals_non_negative"], sanity["has_up_and_down"], gzip_ok, sanity["manifest_checksum_matches_output"], secret_scan["passed"], all(file_entry["checksum_match"] for file_entry in input_files), ] gate_status = "PASS" if all(gate_checks) and normalized_rows_written > 0 else "FAIL" ended = utc_now() manifest = { "schema_name": "orderbook_normalization_sample_manifest", "schema_version": 1, "checkpoint_id": 5, "checkpoint_name": "Normalized Snapshot Extract", "normalizer": { "name": NORMALIZER_NAME, "version": NORMALIZER_VERSION, }, "started_at_utc": iso_z(started), "ended_at_utc": iso_z(ended), "run_duration_seconds": round((ended - started).total_seconds(), 3), "command": "scripts/normalize_polymarket_orderbooks.py", "input_manifest": { "path": str(args.input_manifest), "sha256": sha256_file(args.input_manifest), "collector_manifest_schema_name": input_manifest.get("schema_name"), "collector_gate_status": input_manifest.get("gate_status"), }, "input_files": input_files, "output_files": [output_summary], "raw_rows_read": raw_rows_read, "normalized_rows_written": normalized_rows_written, "skipped_rows": len(row_errors), "error_rows": row_errors, "numeric_encoding": "Exact decimal values are emitted as JSON strings; missing price-derived values are null.", "sanity_checks": sanity, "secret_scan": secret_scan, "warnings": [], "known_gaps": [ "This is a derived sample extract only; raw gzip JSONL remains the source of truth.", "No upload, daemon runtime, systemd unit, dashboard, database, strategy, backtest, or trading behavior is included.", "The sample proves normalization logic on one bounded raw run, not long-run schema stability.", ], "fake_progress_risk": "A clean normalized sample can hide raw collection gaps and endpoint schema drift; every row is therefore traceable to raw_file and raw_line_number, and reliability remains gated on later soak testing.", "next_step": "Checkpoint 6 should package the raw collector for a VPS runtime, or the orchestrator can request review of this normalized sample first.", "gate_status": gate_status, } args.manifest_path.parent.mkdir(parents=True, exist_ok=True) args.manifest_path.write_text(json.dumps(manifest, indent=2, sort_keys=True) + "\n", encoding="utf-8") print( json.dumps( { "gate_status": gate_status, "manifest_path": str(args.manifest_path), "output_path": str(output_path), "raw_rows_read": raw_rows_read, "normalized_rows_written": normalized_rows_written, "skipped_rows": len(row_errors), "sha256": output_summary["sha256"], }, indent=2, sort_keys=True, ) ) return 0 if gate_status == "PASS" else 1 if __name__ == "__main__": raise SystemExit(main(sys.argv[1:]))