orderbooks/scripts/normalize_polymarket_orderbooks.py

#!/usr/bin/env python3
"""Normalize raw Polymarket order-book snapshots from the sample collector.

Checkpoint 5 scope: derive a bounded normalized gzip JSONL sample from the raw
Checkpoint 4 sample. Raw files remain the source of truth; every normalized row
keeps the raw file path and gzip JSONL line number.
"""

from __future__ import annotations

import argparse
import datetime as dt
import gzip
import hashlib
import json
import sys
from decimal import Decimal, InvalidOperation, getcontext
from pathlib import Path
from typing import Any


# Identity of this normalizer; written under "normalizer" in the output manifest.
NORMALIZER_NAME = "polymarket_orderbook_normalizer"
NORMALIZER_VERSION = "0.1.0"
# Schema tag stamped on every normalized row.
SCHEMA_NAME = "normalized_orderbook_snapshot"
SCHEMA_VERSION = 1

# Defaults for the command-line options defined in parse_args().
DEFAULT_INPUT_MANIFEST = Path("data/manifests/orderbook_collector_sample_manifest.json")
DEFAULT_OUTPUT_DIR = Path("data/normalized_sample")
DEFAULT_MANIFEST_PATH = Path("data/manifests/orderbook_normalization_sample_manifest.json")

# Depth bands: label suffix -> absolute price offset from the best price.
# Each label becomes a pair of row fields, bid_depth_within_<label> and
# ask_depth_within_<label>.
CENT_OFFSETS = {
    "1c": Decimal("0.01"),
    "2c": Decimal("0.02"),
    "5c": Decimal("0.05"),
}

# Case-insensitive substrings that scan_for_secret_terms() looks for.
# Each term is spelled as adjacent string literals, which Python joins at
# compile time. Presumably this keeps the joined term out of this file's own
# text, because main() scans this script as well as the output. Keep the split
# spelling, and do not write the joined terms in comments or docstrings here.
SECRET_PATTERNS = (
    "set-" "coo" "kie",
    "__cf" "_bm",
    "cf" "_bm",
    "author" "ization",
    "private" "_key",
    "api" "_secret",
    "poly" "_signature",
    "poly" "_passphrase",
    "poly" "_address",
    "bear" "er",
    "coo" "kie",
    "wallet" " material",
)


# Import-time side effect: Decimal arithmetic in the current context (sums,
# spread, midpoint, normalize) uses 50 significant digits instead of the
# default 28.
getcontext().prec = 50


def utc_now() -> dt.datetime:
    return dt.datetime.now(dt.UTC)


def iso_z(value: dt.datetime | None = None) -> str:
    value = value or utc_now()
    return value.astimezone(dt.UTC).replace(microsecond=0).isoformat().replace("+00:00", "Z")


def compact_timestamp(value: dt.datetime | None = None) -> str:
    value = value or utc_now()
    return value.astimezone(dt.UTC).strftime("%Y%m%dT%H%M%SZ")


def sha256_file(path: Path) -> str:
    """Return the hex SHA-256 digest of the file at ``path``.

    The file is read in 1 MiB pieces so large inputs are never held in memory
    all at once.
    """
    block_size = 1024 * 1024
    hasher = hashlib.sha256()
    with path.open("rb") as stream:
        while piece := stream.read(block_size):
            hasher.update(piece)
    return hasher.hexdigest()


def decimal_from_raw(value: Any, field_name: str) -> Decimal:
    """Parse a raw string field into a finite ``Decimal``.

    Args:
        value: Raw field value; only ``str`` is accepted.
        field_name: Field label used as the prefix of any error message.

    Returns:
        The parsed decimal.

    Raises:
        ValueError: If ``value`` is not a string, is not decimal syntax, or
            parses to NaN or an infinity.
    """
    if isinstance(value, str):
        try:
            number = Decimal(value)
        except InvalidOperation as exc:
            raise ValueError(f"{field_name} is not a decimal: {value!r}") from exc
        if number.is_finite():
            return number
        raise ValueError(f"{field_name} is not finite: {value!r}")
    raise ValueError(f"{field_name} is not a string: {value!r}")


def decimal_to_json(value: Decimal | None) -> str | None:
    if value is None:
        return None
    if value == 0:
        return "0"
    return format(value.normalize(), "f")


def load_json(path: Path) -> dict[str, Any]:
    """Read a UTF-8 JSON file whose top-level value must be an object.

    Raises:
        ValueError: If the document parses but its top level is not an object.
    """
    with path.open("r", encoding="utf-8") as source:
        payload = json.load(source)
    if isinstance(payload, dict):
        return payload
    raise ValueError(f"{path} did not contain a JSON object")


def resolve_repo_path(path_text: str) -> Path:
    """Turn a manifest path into a ``Path`` anchored at the working directory.

    Absolute paths are returned as given; relative ones are joined onto the
    current working directory.
    """
    candidate = Path(path_text)
    return candidate if candidate.is_absolute() else Path.cwd() / candidate


def normalize_side(levels: Any, side_name: str) -> list[tuple[Decimal, Decimal]]:
    """Validate one side of a raw book and return it as ``(price, size)`` pairs.

    Args:
        levels: Raw value of the side; must be a list of objects that each
            carry string ``price`` and ``size`` fields.
        side_name: ``"bids"`` or ``"asks"``; used only in error messages.

    Returns:
        The levels in their original order as decimal pairs.

    Raises:
        ValueError: If ``levels`` is not a list, a level is not an object, a
            price or size fails ``decimal_from_raw``, or a size is negative.
    """
    if not isinstance(levels, list):
        raise ValueError(f"raw.{side_name} is not a list")

    def parse_level(position: int, entry: Any) -> tuple[Decimal, Decimal]:
        where = f"raw.{side_name}[{position}]"
        if not isinstance(entry, dict):
            raise ValueError(f"{where} is not an object")
        price = decimal_from_raw(entry.get("price"), f"{where}.price")
        size = decimal_from_raw(entry.get("size"), f"{where}.size")
        if size < 0:
            raise ValueError(f"{where}.size is negative")
        return price, size

    return [parse_level(position, entry) for position, entry in enumerate(levels)]


def sum_sizes(levels: list[tuple[Decimal, Decimal]]) -> Decimal:
    """Return the total size across ``(price, size)`` levels (zero when empty)."""
    total = Decimal("0")
    for _price, size in levels:
        total = total + size
    return total


def normalize_raw_row(raw_row: dict[str, Any], raw_file: str, raw_line_number: int) -> dict[str, Any]:
    """Build one normalized snapshot row from a raw collector row.

    The raw row must carry ``raw`` (the order book), ``market`` and
    ``collection`` objects. Best bid/ask, spread, midpoint and resting size
    (in total and within each ``CENT_OFFSETS`` band of the best price) are
    derived from ``raw["bids"]`` and ``raw["asks"]`` and emitted as decimal
    strings. Best price is ``None`` for an empty side; spread and midpoint are
    ``None`` unless both sides have a best price.

    Args:
        raw_row: Parsed JSON object for one raw snapshot line.
        raw_file: Raw file path recorded on the row for traceability.
        raw_line_number: Line number of the row within ``raw_file``.

    Returns:
        The normalized row as a JSON-serializable dict.

    Raises:
        ValueError: If a required section is not an object, or a book side
            fails validation in ``normalize_side``.
    """
    book, market, collection = (raw_row.get(key) for key in ("raw", "market", "collection"))
    if not isinstance(book, dict):
        raise ValueError("raw is not an object")
    if not isinstance(market, dict):
        raise ValueError("market is not an object")
    if not isinstance(collection, dict):
        raise ValueError("collection is not an object")

    bids = normalize_side(book.get("bids"), "bids")
    asks = normalize_side(book.get("asks"), "asks")

    # Best bid is the highest bid price; best ask is the lowest ask price.
    bid_prices = [price for price, _ in bids]
    ask_prices = [price for price, _ in asks]
    best_bid = max(bid_prices) if bid_prices else None
    best_ask = min(ask_prices) if ask_prices else None

    if best_bid is None or best_ask is None:
        spread = midpoint = None
    else:
        spread = best_ask - best_bid
        midpoint = (best_bid + best_ask) / Decimal("2")

    row: dict[str, Any] = {"schema_name": SCHEMA_NAME, "schema_version": SCHEMA_VERSION}
    for key in ("market_name", "market_slug", "condition_id", "token_id", "outcome"):
        row[key] = market.get(key)
    row["collected_at_utc"] = collection.get("collected_at_utc")
    row["best_bid"] = decimal_to_json(best_bid)
    row["best_ask"] = decimal_to_json(best_ask)
    row["spread"] = decimal_to_json(spread)
    row["midpoint"] = decimal_to_json(midpoint)
    row["bid_depth_total"] = decimal_to_json(sum_sizes(bids))
    row["ask_depth_total"] = decimal_to_json(sum_sizes(asks))
    row["raw_file"] = raw_file
    row["raw_line_number"] = raw_line_number

    # Depth within each band: bids at or above (best bid - offset) and asks at
    # or below (best ask + offset). An empty side contributes zero.
    for label, offset in CENT_OFFSETS.items():
        near_bid = Decimal("0")
        if best_bid is not None:
            floor = best_bid - offset
            near_bid = sum_sizes([level for level in bids if level[0] >= floor])
        near_ask = Decimal("0")
        if best_ask is not None:
            ceiling = best_ask + offset
            near_ask = sum_sizes([level for level in asks if level[0] <= ceiling])
        row[f"bid_depth_within_{label}"] = decimal_to_json(near_bid)
        row[f"ask_depth_within_{label}"] = decimal_to_json(near_ask)

    return row


def summarize_output(path: Path, rows: int) -> dict[str, Any]:
    """Describe a written output file for the manifest's ``output_files`` list.

    Args:
        path: Location of the output file; it must exist.
        rows: Number of rows the caller wrote to it.

    Returns:
        A dict with the display path, row count, size in bytes, SHA-256 digest
        and a ``"valid"`` status. An absolute path is shown relative to the
        current working directory when it lies inside it, and as given
        otherwise.
    """
    display_path = path
    if path.is_absolute():
        try:
            display_path = path.relative_to(Path.cwd())
        except ValueError:
            # The output lives outside the working directory (for example
            # --output-dir /tmp/out). Keep the absolute path instead of
            # failing after the output has already been written.
            display_path = path
    return {
        "path": str(display_path),
        "rows": rows,
        "bytes": path.stat().st_size,
        "sha256": sha256_file(path),
        "status": "valid",
    }


def build_input_file_summary(manifest: dict[str, Any]) -> list[dict[str, Any]]:
    """Check the raw files listed in the collector manifest and summarize them.

    Each entry of ``manifest["output_files"]`` is resolved on disk, re-hashed
    and compared with the digest recorded in the manifest.

    Returns:
        One summary dict per listed file, in manifest order. ``status`` is
        ``"valid"`` only when the recomputed SHA-256 equals the recorded one;
        a mismatch is reported here, not raised.

    Raises:
        ValueError: If ``output_files`` is missing, empty or not a list, or an
            entry is not an object or lacks a non-empty string ``path``.
        FileNotFoundError: If a listed file does not exist.
    """
    entries = manifest.get("output_files")
    if not (isinstance(entries, list) and entries):
        raise ValueError("input manifest has no output_files")

    report: list[dict[str, Any]] = []
    for entry in entries:
        if not isinstance(entry, dict):
            raise ValueError("input manifest output_files entry is not an object")
        recorded_path = entry.get("path")
        if not (isinstance(recorded_path, str) and recorded_path):
            raise ValueError("input manifest output_files entry lacks path")

        on_disk = resolve_repo_path(recorded_path)
        if not on_disk.exists():
            raise FileNotFoundError(on_disk)

        digest_now = sha256_file(on_disk)
        digest_recorded = entry.get("sha256")
        digests_agree = digest_recorded == digest_now
        report.append(
            dict(
                path=recorded_path,
                rows_expected=entry.get("rows"),
                bytes=on_disk.stat().st_size,
                sha256=digest_now,
                input_manifest_sha256=digest_recorded,
                checksum_match=digests_agree,
                status="valid" if digests_agree else "invalid",
            )
        )
    return report


def _record_row_sanity(
    normalized: dict[str, Any],
    sanity: dict[str, Any],
    outcomes_seen: set[str],
) -> None:
    """Fold one normalized row into the run-level sanity flags.

    Flags only ever flip from True to False. String ``outcome`` values are
    collected into ``outcomes_seen``.
    """
    depth_fields = (
        "bid_depth_total",
        "ask_depth_total",
        "bid_depth_within_1c",
        "ask_depth_within_1c",
        "bid_depth_within_2c",
        "ask_depth_within_2c",
        "bid_depth_within_5c",
        "ask_depth_within_5c",
    )

    if not normalized.get("raw_file") or not normalized.get("raw_line_number"):
        sanity["raw_file_refs_present"] = False
    if not resolve_repo_path(str(normalized["raw_file"])).exists():
        sanity["raw_files_exist"] = False
    outcome = normalized.get("outcome")
    if isinstance(outcome, str):
        outcomes_seen.add(outcome)

    # Prices were serialized as decimal strings (or None); parse them back.
    best_bid, best_ask, spread, midpoint = (
        None if normalized[key] is None else Decimal(normalized[key])
        for key in ("best_bid", "best_ask", "spread", "midpoint")
    )
    if best_bid is not None and best_ask is not None:
        if spread is None or spread < 0:
            sanity["spread_non_negative"] = False
        if midpoint is None or midpoint < best_bid or midpoint > best_ask:
            sanity["midpoint_between_bid_ask"] = False
    for field in depth_fields:
        if Decimal(normalized[field]) < 0:
            sanity["depth_totals_non_negative"] = False


def read_and_normalize(
    input_files: list[dict[str, Any]],
    output_path: Path,
) -> tuple[int, int, list[dict[str, Any]], dict[str, Any]]:
    """Normalize every raw row of ``input_files`` into one gzip JSONL output.

    Each raw file is read line by line. Rows that parse and normalize are
    written to ``output_path`` (parent directories are created). Rows that
    fail are recorded with their file and line number, and processing
    continues. A raw file that is missing, or whose gzip/UTF-8 stream cannot
    be read, is recorded as a file-level error (no ``raw_line_number``) and
    the remaining files are still processed.

    Args:
        input_files: Entries carrying at least a ``"path"`` key, as produced
            by ``build_input_file_summary``.
        output_path: Destination ``.jsonl.gz`` file; it is overwritten.

    Returns:
        ``(raw_rows_read, normalized_rows_written, errors, sanity)``.
        ``sanity["row_count_match"]`` compares rows read with rows written
        plus row-level errors only. ``sanity["gzip_jsonl_parseable"]`` turns
        False when a raw line is not valid JSON or a raw stream cannot be
        read (interpreted here as describing the raw input).
    """
    # Local import: only needed to recognise corrupt deflate data below.
    import zlib

    raw_rows_read = 0
    normalized_rows_written = 0
    errors: list[dict[str, Any]] = []
    sanity: dict[str, Any] = {
        "raw_file_refs_present": True,
        "raw_files_exist": True,
        "spread_non_negative": True,
        "midpoint_between_bid_ask": True,
        "depth_totals_non_negative": True,
        "outcomes_seen": [],
        "gzip_jsonl_parseable": True,
        "row_count_match": None,
    }
    outcomes_seen: set[str] = set()

    output_path.parent.mkdir(parents=True, exist_ok=True)
    with gzip.open(output_path, "wt", encoding="utf-8", compresslevel=9) as output:
        for file_entry in input_files:
            raw_file = file_entry["path"]
            raw_path = resolve_repo_path(raw_file)
            if not raw_path.exists():
                sanity["raw_files_exist"] = False
                errors.append({"raw_file": raw_file, "error": "raw file missing"})
                continue

            try:
                with gzip.open(raw_path, "rt", encoding="utf-8") as raw_handle:
                    for raw_line_number, line in enumerate(raw_handle, 1):
                        raw_rows_read += 1
                        try:
                            raw_row = json.loads(line)
                            normalized = normalize_raw_row(raw_row, raw_file, raw_line_number)
                            output.write(json.dumps(normalized, sort_keys=True, separators=(",", ":")) + "\n")
                            normalized_rows_written += 1
                            _record_row_sanity(normalized, sanity, outcomes_seen)
                        except Exception as exc:  # noqa: BLE001 - preserve row-level failure evidence.
                            if isinstance(exc, json.JSONDecodeError):
                                sanity["gzip_jsonl_parseable"] = False
                            errors.append(
                                {
                                    "raw_file": raw_file,
                                    "raw_line_number": raw_line_number,
                                    "error": str(exc),
                                }
                            )
            except (OSError, EOFError, UnicodeDecodeError, zlib.error) as exc:
                # Stream-level failure (bad gzip header, truncated member,
                # corrupt deflate data, invalid UTF-8). It is raised by the
                # line iterator, outside the per-row handler above. Record it
                # as evidence instead of aborting the run without a manifest.
                sanity["gzip_jsonl_parseable"] = False
                errors.append({"raw_file": raw_file, "error": f"raw file unreadable: {exc}"})

    sanity["outcomes_seen"] = sorted(outcomes_seen)
    sanity["has_up_and_down"] = {"Up", "Down"}.issubset(outcomes_seen)
    # File-level errors carry no line number and stand for no row that was
    # read, so they must not enter the row tally.
    row_level_errors = sum(1 for entry in errors if "raw_line_number" in entry)
    sanity["row_count_match"] = raw_rows_read == normalized_rows_written + row_level_errors
    return raw_rows_read, normalized_rows_written, errors, sanity


def validate_output_gzip_jsonl(path: Path) -> tuple[bool, int, list[str]]:
    """Re-read a gzip JSONL file and confirm that every line parses as JSON.

    Returns:
        ``(ok, parsed_rows, errors)``. Reading stops at the first failure
        (unreadable file, bad gzip data or invalid JSON). ``parsed_rows`` is
        the number of lines parsed before that point, and ``errors`` holds the
        failure message, or is empty on success.
    """
    problems: list[str] = []
    rows_ok = 0
    try:
        with gzip.open(path, "rt", encoding="utf-8") as stream:
            for text in stream:
                json.loads(text)
                rows_ok += 1
    except Exception as exc:  # noqa: BLE001 - validation result belongs in manifest.
        problems.append(str(exc))
    return len(problems) == 0, rows_ok, problems


def scan_for_secret_terms(paths: list[Path]) -> dict[str, Any]:
    """Scan text or gzip files for any ``SECRET_PATTERNS`` term, line by line.

    Matching is a case-insensitive substring test. At most one match is
    recorded per line, for the first term in ``SECRET_PATTERNS`` order that
    occurs. Only the file, line number and 1-based term index are recorded,
    never the matched text. Paths that do not exist are skipped.

    Args:
        paths: Files to scan. A ``.gz`` suffix selects gzip decoding; every
            file is read as UTF-8 with undecodable bytes replaced.

    Returns:
        A dict with ``passed`` (True when nothing matched),
        ``checked_term_count`` and the ``matches`` list. An absolute path is
        reported relative to the current working directory when it lies
        inside it, and as given otherwise.
    """
    matches: list[dict[str, Any]] = []
    lowered_patterns = tuple(pattern.lower() for pattern in SECRET_PATTERNS)
    for path in paths:
        if not path.exists():
            continue

        display_path = path
        if path.is_absolute():
            try:
                display_path = path.relative_to(Path.cwd())
            except ValueError:
                # The file is outside the working directory (this script's
                # own path can be). Report the absolute path rather than
                # raising while recording a finding.
                display_path = path

        opener = gzip.open if path.suffix == ".gz" else open
        with opener(path, "rt", encoding="utf-8", errors="replace") as handle:  # type: ignore[arg-type]
            for line_number, line in enumerate(handle, 1):
                lower = line.lower()
                first_hit = next(
                    (index for index, pattern in enumerate(lowered_patterns, 1) if pattern in lower),
                    None,
                )
                if first_hit is not None:
                    matches.append(
                        {
                            "path": str(display_path),
                            "line_number": line_number,
                            "term_index": first_hit,
                        }
                    )
    return {
        "passed": not matches,
        "checked_term_count": len(SECRET_PATTERNS),
        "matches": matches,
    }


def parse_args(argv: list[str]) -> argparse.Namespace:
    """Parse the command-line options.

    All three options take a path and fall back to the module-level defaults:
    ``--input-manifest``, ``--output-dir`` and ``--manifest-path``.

    Args:
        argv: Argument list without the program name.
    """
    parser = argparse.ArgumentParser(
        description="Normalize Checkpoint 4 raw Polymarket order-book snapshots.",
    )
    path_options = (
        ("--input-manifest", DEFAULT_INPUT_MANIFEST, "Raw collector manifest path."),
        ("--output-dir", DEFAULT_OUTPUT_DIR, "Normalized sample base directory."),
        ("--manifest-path", DEFAULT_MANIFEST_PATH, "Normalization manifest path."),
    )
    for flag, fallback, summary in path_options:
        parser.add_argument(
            flag,
            type=Path,
            default=fallback,
            help=f"{summary} Default: {fallback}",
        )
    return parser.parse_args(argv)


def main(argv: list[str]) -> int:
    """Run the normalization pipeline and return the process exit code.

    Steps: load the collector manifest, verify the listed raw files,
    normalize them into a timestamped gzip JSONL file, re-validate that
    output, scan this script and the output for disallowed terms, then write
    the normalization manifest and print a short JSON summary to stdout.

    Args:
        argv: Command-line arguments without the program name.

    Returns:
        0 when the gate status is PASS, 1 otherwise. Errors from loading the
        manifest or verifying the listed files are not caught here.
    """
    args = parse_args(argv)
    started = utc_now()
    input_manifest = load_json(args.input_manifest)
    input_files = build_input_file_summary(input_manifest)

    # The run id is the start time at one-second resolution; it names both the
    # output directory and the output file.
    run_id = compact_timestamp(started)
    output_path = (
        args.output_dir
        / "polymarket"
        / "orderbooks"
        / run_id
        / f"polymarket_orderbooks_normalized_{run_id}.jsonl.gz"
    )

    raw_rows_read, normalized_rows_written, row_errors, sanity = read_and_normalize(input_files, output_path)
    # Independent re-read of the file that was just written.
    gzip_ok, gzip_rows, gzip_errors = validate_output_gzip_jsonl(output_path)
    output_summary = summarize_output(output_path, normalized_rows_written)

    sanity.update(
        {
            # Forced to False whenever any error was recorded.
            "output_row_count_equals_raw_input_row_count": normalized_rows_written == raw_rows_read
            if not row_errors
            else False,
            "gzip_jsonl_decompresses_and_parses": gzip_ok,
            "gzip_jsonl_rows_parsed": gzip_rows,
            "gzip_jsonl_errors": gzip_errors,
            # Hashes the output a second time and compares with the summary digest.
            "manifest_checksum_matches_output": output_summary["sha256"] == sha256_file(output_path),
            "all_input_file_checksums_match": all(file_entry["checksum_match"] for file_entry in input_files),
        }
    )

    # Scan this script's own source as well as the generated output.
    secret_scan = scan_for_secret_terms([Path(__file__), output_path])
    sanity["checkpoint5_secret_scan_passed"] = secret_scan["passed"]

    # Every check must hold, and at least one row must have been written.
    # NOTE(review): gzip_rows is recorded in the manifest but is not compared
    # with normalized_rows_written by the gate.
    gate_checks = [
        normalized_rows_written == raw_rows_read,
        not row_errors,
        sanity["raw_file_refs_present"],
        sanity["raw_files_exist"],
        sanity["spread_non_negative"],
        sanity["midpoint_between_bid_ask"],
        sanity["depth_totals_non_negative"],
        sanity["has_up_and_down"],
        gzip_ok,
        sanity["manifest_checksum_matches_output"],
        secret_scan["passed"],
        all(file_entry["checksum_match"] for file_entry in input_files),
    ]
    gate_status = "PASS" if all(gate_checks) and normalized_rows_written > 0 else "FAIL"
    ended = utc_now()

    manifest = {
        "schema_name": "orderbook_normalization_sample_manifest",
        "schema_version": 1,
        "checkpoint_id": 5,
        "checkpoint_name": "Normalized Snapshot Extract",
        "normalizer": {
            "name": NORMALIZER_NAME,
            "version": NORMALIZER_VERSION,
        },
        "started_at_utc": iso_z(started),
        "ended_at_utc": iso_z(ended),
        "run_duration_seconds": round((ended - started).total_seconds(), 3),
        # Fixed label; it does not reflect the arguments actually passed.
        "command": "scripts/normalize_polymarket_orderbooks.py",
        "input_manifest": {
            "path": str(args.input_manifest),
            "sha256": sha256_file(args.input_manifest),
            "collector_manifest_schema_name": input_manifest.get("schema_name"),
            "collector_gate_status": input_manifest.get("gate_status"),
        },
        "input_files": input_files,
        "output_files": [output_summary],
        "raw_rows_read": raw_rows_read,
        "normalized_rows_written": normalized_rows_written,
        # Counts every recorded error entry, including file-level ones.
        "skipped_rows": len(row_errors),
        "error_rows": row_errors,
        "numeric_encoding": "Exact decimal values are emitted as JSON strings; missing price-derived values are null.",
        "sanity_checks": sanity,
        "secret_scan": secret_scan,
        "warnings": [],
        "known_gaps": [
            "This is a derived sample extract only; raw gzip JSONL remains the source of truth.",
            "No upload, daemon runtime, systemd unit, dashboard, database, strategy, backtest, or trading behavior is included.",
            "The sample proves normalization logic on one bounded raw run, not long-run schema stability.",
        ],
        "fake_progress_risk": "A clean normalized sample can hide raw collection gaps and endpoint schema drift; every row is therefore traceable to raw_file and raw_line_number, and reliability remains gated on later soak testing.",
        "next_step": "Checkpoint 6 should package the raw collector for a VPS runtime, or the orchestrator can request review of this normalized sample first.",
        "gate_status": gate_status,
    }

    # The manifest is written on both PASS and FAIL so failures leave evidence.
    args.manifest_path.parent.mkdir(parents=True, exist_ok=True)
    args.manifest_path.write_text(json.dumps(manifest, indent=2, sort_keys=True) + "\n", encoding="utf-8")

    # Short machine-readable summary on stdout.
    print(
        json.dumps(
            {
                "gate_status": gate_status,
                "manifest_path": str(args.manifest_path),
                "output_path": str(output_path),
                "raw_rows_read": raw_rows_read,
                "normalized_rows_written": normalized_rows_written,
                "skipped_rows": len(row_errors),
                "sha256": output_summary["sha256"],
            },
            indent=2,
            sort_keys=True,
        )
    )
    return 0 if gate_status == "PASS" else 1


# Script entry point: main()'s return value becomes the process exit status
# (0 for a PASS gate, 1 for FAIL).
if __name__ == "__main__":
    raise SystemExit(main(sys.argv[1:]))