496 lines
19 KiB
Python
496 lines
19 KiB
Python
#!/usr/bin/env python3
|
|
"""Normalize raw Polymarket order-book snapshots from the sample collector.
|
|
|
|
Checkpoint 5 scope: derive a bounded normalized gzip JSONL sample from the raw
|
|
Checkpoint 4 sample. Raw files remain the source of truth; every normalized row
|
|
keeps the raw file path and gzip JSONL line number.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import datetime as dt
|
|
import gzip
|
|
import hashlib
|
|
import json
|
|
import sys
|
|
from decimal import Decimal, InvalidOperation, getcontext
|
|
from pathlib import Path
|
|
from typing import Any
|
|
|
|
|
|
# Identity of this tool; both values are recorded under "normalizer" in the
# normalization manifest.
NORMALIZER_NAME = "polymarket_orderbook_normalizer"
NORMALIZER_VERSION = "0.1.0"

# Schema tag stamped on every normalized row.
SCHEMA_NAME = "normalized_orderbook_snapshot"
SCHEMA_VERSION = 1

# Default values for the --input-manifest, --output-dir and --manifest-path
# command-line options.
DEFAULT_INPUT_MANIFEST = Path("data/manifests/orderbook_collector_sample_manifest.json")
DEFAULT_OUTPUT_DIR = Path("data/normalized_sample")
DEFAULT_MANIFEST_PATH = Path("data/manifests/orderbook_normalization_sample_manifest.json")

# Price distances from the best quote used for the "depth within N" columns.
# Each key becomes the suffix of a bid_depth_within_* / ask_depth_within_*
# field on the normalized row.
CENT_OFFSETS = {
    "1c": Decimal("0.01"),
    "2c": Decimal("0.02"),
    "5c": Decimal("0.05"),
}

# Substrings (matched case-insensitively) that must not appear in scanned files.
# NOTE: every term is written as adjacent string fragments that Python joins at
# compile time. The scanner is also run over this source file, so spelling a
# term out in one piece here -- or in any comment or docstring in this file --
# would make the scan flag the script itself. Keep the fragments split.
SECRET_PATTERNS = (
    "set-" "coo" "kie",
    "__cf" "_bm",
    "cf" "_bm",
    "author" "ization",
    "private" "_key",
    "api" "_secret",
    "poly" "_signature",
    "poly" "_passphrase",
    "poly" "_address",
    "bear" "er",
    "coo" "kie",
    "wallet" " material",
)

# Raise the process-wide decimal context to 50 significant digits (the library
# default is 28) for the price and size arithmetic done below.
getcontext().prec = 50
|
|
|
|
|
|
def utc_now() -> dt.datetime:
|
|
return dt.datetime.now(dt.UTC)
|
|
|
|
|
|
def iso_z(value: dt.datetime | None = None) -> str:
|
|
value = value or utc_now()
|
|
return value.astimezone(dt.UTC).replace(microsecond=0).isoformat().replace("+00:00", "Z")
|
|
|
|
|
|
def compact_timestamp(value: dt.datetime | None = None) -> str:
|
|
value = value or utc_now()
|
|
return value.astimezone(dt.UTC).strftime("%Y%m%dT%H%M%SZ")
|
|
|
|
|
|
def sha256_file(path: Path) -> str:
    """Return the hexadecimal SHA-256 digest of the file at ``path``.

    The file is read in 1 MiB pieces so it is never loaded into memory whole.
    """
    piece_size = 1024 * 1024
    hasher = hashlib.sha256()
    with path.open("rb") as stream:
        # read() returns b"" at end of file, which ends the loop.
        while piece := stream.read(piece_size):
            hasher.update(piece)
    return hasher.hexdigest()
|
|
|
|
|
|
def decimal_from_raw(value: Any, field_name: str) -> Decimal:
    """Parse a raw string field into a finite ``Decimal``.

    Args:
        value: The raw field; only ``str`` is accepted.
        field_name: Dotted field label used as the prefix of error messages.

    Returns:
        The parsed decimal value.

    Raises:
        ValueError: If ``value`` is not a string, cannot be parsed as a
            decimal, or parses to NaN or an infinity.
    """
    if isinstance(value, str):
        try:
            number = Decimal(value)
        except InvalidOperation as exc:
            raise ValueError(f"{field_name} is not a decimal: {value!r}") from exc
        if number.is_finite():
            return number
        raise ValueError(f"{field_name} is not finite: {value!r}")
    raise ValueError(f"{field_name} is not a string: {value!r}")
|
|
|
|
|
|
def decimal_to_json(value: Decimal | None) -> str | None:
|
|
if value is None:
|
|
return None
|
|
if value == 0:
|
|
return "0"
|
|
return format(value.normalize(), "f")
|
|
|
|
|
|
def load_json(path: Path) -> dict[str, Any]:
    """Read a UTF-8 JSON file whose top-level value must be an object.

    Args:
        path: File to read.

    Returns:
        The decoded JSON object as a dict.

    Raises:
        ValueError: If the file decodes to anything other than a JSON object.
    """
    payload = json.loads(path.read_text(encoding="utf-8"))
    if isinstance(payload, dict):
        return payload
    raise ValueError(f"{path} did not contain a JSON object")
|
|
|
|
|
|
def resolve_repo_path(path_text: str) -> Path:
    """Turn a path string into an absolute path.

    Absolute inputs are returned unchanged; relative inputs are anchored at
    the current working directory.
    """
    candidate = Path(path_text)
    return candidate if candidate.is_absolute() else Path.cwd() / candidate
|
|
|
|
|
|
def normalize_side(levels: Any, side_name: str) -> list[tuple[Decimal, Decimal]]:
    """Convert one side of a raw order book into ``(price, size)`` decimals.

    Args:
        levels: The raw ``bids`` or ``asks`` value; must be a list of objects
            that each carry string ``price`` and ``size`` fields.
        side_name: ``"bids"`` or ``"asks"``; only used to label errors.

    Returns:
        One ``(price, size)`` tuple per level, in input order.

    Raises:
        ValueError: If ``levels`` is not a list, a level is not an object, a
            price or size fails decimal parsing, or a size is negative.
    """
    if not isinstance(levels, list):
        raise ValueError(f"raw.{side_name} is not a list")

    pairs: list[tuple[Decimal, Decimal]] = []
    for index, entry in enumerate(levels):
        if not isinstance(entry, dict):
            raise ValueError(f"raw.{side_name}[{index}] is not an object")
        # Price is parsed before size, so a level with both fields broken
        # reports the price problem.
        level_price = decimal_from_raw(entry.get("price"), f"raw.{side_name}[{index}].price")
        level_size = decimal_from_raw(entry.get("size"), f"raw.{side_name}[{index}].size")
        if level_size < 0:
            raise ValueError(f"raw.{side_name}[{index}].size is negative")
        pairs.append((level_price, level_size))
    return pairs
|
|
|
|
|
|
def sum_sizes(levels: list[tuple[Decimal, Decimal]]) -> Decimal:
    """Return the total size across ``(price, size)`` levels (zero if empty)."""
    total = Decimal("0")
    for _price, size in levels:
        total = total + size
    return total
|
|
|
|
|
|
def normalize_raw_row(raw_row: dict[str, Any], raw_file: str, raw_line_number: int) -> dict[str, Any]:
    """Derive one normalized snapshot row from one raw collector row.

    Args:
        raw_row: Decoded raw JSONL row; its ``raw``, ``market`` and
            ``collection`` values must all be objects.
        raw_file: Raw file path, copied onto the row for traceability.
        raw_line_number: Line number of the row inside the raw file.

    Returns:
        A dict holding the schema tag, market identity fields, collection
        time, best bid/ask, spread, midpoint, total depth per side, depth
        within each ``CENT_OFFSETS`` distance of the best quote, and the raw
        file reference. Decimals are rendered as strings; price-derived
        values are ``None`` when the side they depend on is empty.

    Raises:
        ValueError: If a required section is not an object or a book level is
            malformed.
    """
    raw_book = raw_row.get("raw")
    market = raw_row.get("market")
    collection = raw_row.get("collection")
    if not isinstance(raw_book, dict):
        raise ValueError("raw is not an object")
    if not isinstance(market, dict):
        raise ValueError("market is not an object")
    if not isinstance(collection, dict):
        raise ValueError("collection is not an object")

    bids = normalize_side(raw_book.get("bids"), "bids")
    asks = normalize_side(raw_book.get("asks"), "asks")

    # Levels are not assumed to be sorted: the best bid is the highest bid
    # price and the best ask is the lowest ask price.
    bid_prices = [price for price, _ in bids]
    ask_prices = [price for price, _ in asks]
    best_bid = max(bid_prices) if bid_prices else None
    best_ask = min(ask_prices) if ask_prices else None

    if best_bid is None or best_ask is None:
        # Spread and midpoint need both sides of the book.
        spread = None
        midpoint = None
    else:
        spread = best_ask - best_bid
        midpoint = (best_bid + best_ask) / Decimal("2")

    row: dict[str, Any] = {
        "schema_name": SCHEMA_NAME,
        "schema_version": SCHEMA_VERSION,
        "market_name": market.get("market_name"),
        "market_slug": market.get("market_slug"),
        "condition_id": market.get("condition_id"),
        "token_id": market.get("token_id"),
        "outcome": market.get("outcome"),
        "collected_at_utc": collection.get("collected_at_utc"),
        "best_bid": decimal_to_json(best_bid),
        "best_ask": decimal_to_json(best_ask),
        "spread": decimal_to_json(spread),
        "midpoint": decimal_to_json(midpoint),
        "bid_depth_total": decimal_to_json(sum_sizes(bids)),
        "ask_depth_total": decimal_to_json(sum_sizes(asks)),
        "raw_file": raw_file,
        "raw_line_number": raw_line_number,
    }

    for label, offset in CENT_OFFSETS.items():
        # Bid depth: sizes priced at or above (best bid - offset).
        if best_bid is None:
            near_bid_depth = Decimal("0")
        else:
            bid_floor = best_bid - offset
            near_bid_depth = sum((size for price, size in bids if price >= bid_floor), Decimal("0"))

        # Ask depth: sizes priced at or below (best ask + offset).
        if best_ask is None:
            near_ask_depth = Decimal("0")
        else:
            ask_ceiling = best_ask + offset
            near_ask_depth = sum((size for price, size in asks if price <= ask_ceiling), Decimal("0"))

        row[f"bid_depth_within_{label}"] = decimal_to_json(near_bid_depth)
        row[f"ask_depth_within_{label}"] = decimal_to_json(near_ask_depth)

    return row
|
|
|
|
|
|
def summarize_output(path: Path, rows: int) -> dict[str, Any]:
    """Describe a written output file for the manifest.

    Args:
        path: Output file; it must exist because its size and checksum are read.
        rows: Number of rows written to the file.

    Returns:
        A dict with ``path``, ``rows``, ``bytes``, ``sha256`` and a ``status``
        of ``"valid"``. An absolute ``path`` is reported relative to the
        current working directory when it lies inside it, and unchanged
        otherwise; a relative ``path`` is reported as given.
    """
    display_path = path
    if path.is_absolute():
        try:
            display_path = path.relative_to(Path.cwd())
        except ValueError:
            # relative_to() raises when the file is outside the working
            # directory (e.g. an absolute --output-dir elsewhere). Report the
            # absolute path instead of aborting after the output was written.
            display_path = path
    return {
        "path": str(display_path),
        "rows": rows,
        "bytes": path.stat().st_size,
        "sha256": sha256_file(path),
        "status": "valid",
    }
|
|
|
|
|
|
def build_input_file_summary(manifest: dict[str, Any]) -> list[dict[str, Any]]:
    """Verify the raw files listed in the collector manifest.

    Args:
        manifest: Decoded collector manifest; its ``output_files`` value must
            be a non-empty list of objects that each carry a ``path`` string.

    Returns:
        One summary per listed file holding the manifest path text, the
        expected row count, the size on disk, the recomputed SHA-256, the
        manifest's SHA-256, whether the two match, and a ``status`` of
        ``"valid"`` or ``"invalid"`` accordingly.

    Raises:
        ValueError: If ``output_files`` is missing, empty, or has a malformed
            entry.
        FileNotFoundError: If a listed file does not exist.
    """
    entries = manifest.get("output_files")
    if not (isinstance(entries, list) and entries):
        raise ValueError("input manifest has no output_files")

    summaries: list[dict[str, Any]] = []
    for entry in entries:
        if not isinstance(entry, dict):
            raise ValueError("input manifest output_files entry is not an object")
        path_text = entry.get("path")
        if not (isinstance(path_text, str) and path_text):
            raise ValueError("input manifest output_files entry lacks path")

        raw_path = resolve_repo_path(path_text)
        if not raw_path.exists():
            raise FileNotFoundError(raw_path)

        # A checksum mismatch is recorded, not raised; the caller decides what
        # to do with it.
        recomputed_sha = sha256_file(raw_path)
        manifest_sha = entry.get("sha256")
        shas_agree = manifest_sha == recomputed_sha
        summaries.append(
            {
                "path": path_text,
                "rows_expected": entry.get("rows"),
                "bytes": raw_path.stat().st_size,
                "sha256": recomputed_sha,
                "input_manifest_sha256": manifest_sha,
                "checksum_match": shas_agree,
                "status": "valid" if shas_agree else "invalid",
            }
        )
    return summaries
|
|
|
|
|
|
def read_and_normalize(
    input_files: list[dict[str, Any]],
    output_path: Path,
) -> tuple[int, int, list[dict[str, Any]], dict[str, Any]]:
    """Normalize every raw row of ``input_files`` into one gzip JSONL file.

    Each input file is read line by line as gzip JSONL. Every line is decoded,
    normalized and appended to ``output_path`` as compact, key-sorted JSON. A
    line that fails at any step is recorded in the error list and processing
    continues with the next line. A listed file that does not exist is
    recorded as a file-level error and skipped.

    Args:
        input_files: File summaries; only each entry's ``"path"`` is read here.
        output_path: Destination file. Parent directories are created and any
            existing file is overwritten.

    Returns:
        ``(raw_rows_read, normalized_rows_written, errors, sanity)`` where
        ``errors`` holds one dict per failure and ``sanity`` holds the
        row-level check flags, the sorted outcomes seen, ``has_up_and_down``
        and ``row_count_match``.
    """
    # Hoisted out of the row loop: the depth columns verified on every row.
    depth_fields = (
        "bid_depth_total",
        "ask_depth_total",
        "bid_depth_within_1c",
        "ask_depth_within_1c",
        "bid_depth_within_2c",
        "ask_depth_within_2c",
        "bid_depth_within_5c",
        "ask_depth_within_5c",
    )

    raw_rows_read = 0
    normalized_rows_written = 0
    # Rows that raised before being written. Kept apart from len(errors)
    # because errors also holds file-level entries that correspond to no row.
    rows_failed = 0
    errors: list[dict[str, Any]] = []
    sanity = {
        "raw_file_refs_present": True,
        "raw_files_exist": True,
        "spread_non_negative": True,
        "midpoint_between_bid_ask": True,
        "depth_totals_non_negative": True,
        "outcomes_seen": [],
        # Never modified in this function; emitted as-is.
        "gzip_jsonl_parseable": True,
        "row_count_match": None,
    }
    outcomes_seen: set[str] = set()

    output_path.parent.mkdir(parents=True, exist_ok=True)
    with gzip.open(output_path, "wt", encoding="utf-8", compresslevel=9) as output:
        for file_entry in input_files:
            raw_file = file_entry["path"]
            raw_path = resolve_repo_path(raw_file)
            if not raw_path.exists():
                sanity["raw_files_exist"] = False
                errors.append({"raw_file": raw_file, "error": "raw file missing"})
                continue

            with gzip.open(raw_path, "rt", encoding="utf-8") as raw_handle:
                for raw_line_number, line in enumerate(raw_handle, 1):
                    raw_rows_read += 1
                    written = False
                    try:
                        raw_row = json.loads(line)
                        normalized = normalize_raw_row(raw_row, raw_file, raw_line_number)
                        output.write(json.dumps(normalized, sort_keys=True, separators=(",", ":")) + "\n")
                        normalized_rows_written += 1
                        written = True

                        if not normalized.get("raw_file") or not normalized.get("raw_line_number"):
                            sanity["raw_file_refs_present"] = False
                        if not resolve_repo_path(str(normalized["raw_file"])).exists():
                            sanity["raw_files_exist"] = False
                        outcome = normalized.get("outcome")
                        if isinstance(outcome, str):
                            outcomes_seen.add(outcome)

                        # Re-parse the emitted strings so the checks apply to
                        # exactly what was written.
                        best_bid = Decimal(normalized["best_bid"]) if normalized["best_bid"] is not None else None
                        best_ask = Decimal(normalized["best_ask"]) if normalized["best_ask"] is not None else None
                        spread = Decimal(normalized["spread"]) if normalized["spread"] is not None else None
                        midpoint = Decimal(normalized["midpoint"]) if normalized["midpoint"] is not None else None
                        if best_bid is not None and best_ask is not None:
                            if spread is None or spread < 0:
                                sanity["spread_non_negative"] = False
                            if midpoint is None or midpoint < best_bid or midpoint > best_ask:
                                sanity["midpoint_between_bid_ask"] = False
                        for field in depth_fields:
                            if Decimal(normalized[field]) < 0:
                                sanity["depth_totals_non_negative"] = False
                    except Exception as exc:  # noqa: BLE001 - preserve row-level failure evidence.
                        errors.append(
                            {
                                "raw_file": raw_file,
                                "raw_line_number": raw_line_number,
                                "error": str(exc),
                            }
                        )
                        if not written:
                            rows_failed += 1

    sanity["outcomes_seen"] = sorted(outcomes_seen)
    sanity["has_up_and_down"] = {"Up", "Down"}.issubset(outcomes_seen)
    # Every row read is counted exactly once, as written or as failed.
    sanity["row_count_match"] = raw_rows_read == normalized_rows_written + rows_failed
    return raw_rows_read, normalized_rows_written, errors, sanity
|
|
|
|
|
|
def validate_output_gzip_jsonl(path: Path) -> tuple[bool, int, list[str]]:
    """Check that ``path`` decompresses and that every line is valid JSON.

    Reading stops at the first failure of any kind (missing file, bad gzip
    data, undecodable text, invalid JSON).

    Returns:
        ``(ok, parsed_rows, errors)`` where ``parsed_rows`` counts the lines
        parsed before any failure and ``errors`` holds at most one message.
    """
    rows_ok = 0
    problems: list[str] = []
    try:
        with gzip.open(path, "rt", encoding="utf-8") as stream:
            for text in stream:
                json.loads(text)
                rows_ok += 1
    except Exception as exc:  # noqa: BLE001 - validation result belongs in manifest.
        problems.append(str(exc))
    return len(problems) == 0, rows_ok, problems
|
|
|
|
|
|
def scan_for_secret_terms(paths: list[Path]) -> dict[str, Any]:
    """Scan files line by line for any ``SECRET_PATTERNS`` term.

    Matching is a case-insensitive substring test. Files ending in ``.gz``
    are read through gzip, everything else as plain text; undecodable bytes
    are replaced rather than raising. Paths that do not exist are skipped.

    Args:
        paths: Files to scan.

    Returns:
        A dict with ``passed`` (true when nothing matched),
        ``checked_term_count`` and ``matches``. Each match records the file
        path, the line number and the 1-based index of the first term found
        on that line -- the index, not the term text, so the report never
        repeats a flagged term. At most one match is recorded per line.
    """
    matches: list[dict[str, Any]] = []
    lowered_patterns = tuple(pattern.lower() for pattern in SECRET_PATTERNS)
    for path in paths:
        if not path.exists():
            continue

        # Resolve the reported path once per file. relative_to() raises
        # ValueError for an absolute path outside the working directory; fall
        # back to the absolute path so a match there is reported, not fatal.
        display_path = path
        if path.is_absolute():
            try:
                display_path = path.relative_to(Path.cwd())
            except ValueError:
                display_path = path

        opener = gzip.open if path.suffix == ".gz" else open
        with opener(path, "rt", encoding="utf-8", errors="replace") as handle:  # type: ignore[arg-type]
            for line_number, line in enumerate(handle, 1):
                lower = line.lower()
                for pattern_index, pattern in enumerate(lowered_patterns, 1):
                    if pattern in lower:
                        matches.append(
                            {
                                "path": str(display_path),
                                "line_number": line_number,
                                "term_index": pattern_index,
                            }
                        )
                        # One report per line is enough.
                        break
    return {
        "passed": not matches,
        "checked_term_count": len(SECRET_PATTERNS),
        "matches": matches,
    }
|
|
|
|
|
|
def parse_args(argv: list[str]) -> argparse.Namespace:
    """Parse the command-line options of the normalizer.

    Args:
        argv: Argument strings, without the program name.

    Returns:
        A namespace with ``input_manifest``, ``output_dir`` and
        ``manifest_path``, each a ``Path`` that falls back to its module-level
        default.
    """
    cli = argparse.ArgumentParser(
        description="Normalize Checkpoint 4 raw Polymarket order-book snapshots.",
    )
    cli.add_argument(
        "--input-manifest",
        default=DEFAULT_INPUT_MANIFEST,
        type=Path,
        help=f"Raw collector manifest path. Default: {DEFAULT_INPUT_MANIFEST}",
    )
    cli.add_argument(
        "--output-dir",
        default=DEFAULT_OUTPUT_DIR,
        type=Path,
        help=f"Normalized sample base directory. Default: {DEFAULT_OUTPUT_DIR}",
    )
    cli.add_argument(
        "--manifest-path",
        default=DEFAULT_MANIFEST_PATH,
        type=Path,
        help=f"Normalization manifest path. Default: {DEFAULT_MANIFEST_PATH}",
    )
    namespace = cli.parse_args(argv)
    return namespace
|
|
|
|
|
|
def main(argv: list[str]) -> int:
    """Run the Checkpoint 5 normalization and write its manifest.

    Steps: load and verify the collector manifest, normalize all raw rows
    into a run-stamped gzip JSONL file under the output directory, re-read
    that file to confirm it parses, scan this script and the output for
    flagged terms, evaluate the gate, write the manifest and print a short
    JSON summary to stdout.

    Args:
        argv: Command-line arguments, without the program name.

    Returns:
        ``0`` when the gate status is ``"PASS"``, otherwise ``1``.
    """
    args = parse_args(argv)
    started = utc_now()
    input_manifest = load_json(args.input_manifest)
    input_files = build_input_file_summary(input_manifest)

    # The run id (start time) names both the run directory and the file.
    run_id = compact_timestamp(started)
    run_dir = args.output_dir / "polymarket" / "orderbooks" / run_id
    output_path = run_dir / f"polymarket_orderbooks_normalized_{run_id}.jsonl.gz"

    raw_rows_read, normalized_rows_written, row_errors, sanity = read_and_normalize(input_files, output_path)
    gzip_ok, gzip_rows, gzip_errors = validate_output_gzip_jsonl(output_path)
    output_summary = summarize_output(output_path, normalized_rows_written)

    rows_match = normalized_rows_written == raw_rows_read
    sanity["output_row_count_equals_raw_input_row_count"] = rows_match and not row_errors
    sanity["gzip_jsonl_decompresses_and_parses"] = gzip_ok
    sanity["gzip_jsonl_rows_parsed"] = gzip_rows
    sanity["gzip_jsonl_errors"] = gzip_errors
    # Hash the output a second time and compare with the summary's checksum.
    sanity["manifest_checksum_matches_output"] = output_summary["sha256"] == sha256_file(output_path)
    inputs_checksums_ok = all(file_entry["checksum_match"] for file_entry in input_files)
    sanity["all_input_file_checksums_match"] = inputs_checksums_ok

    secret_scan = scan_for_secret_terms([Path(__file__), output_path])
    sanity["checkpoint5_secret_scan_passed"] = secret_scan["passed"]

    # Every check must hold and at least one row must have been written.
    gate_checks = (
        rows_match,
        not row_errors,
        sanity["raw_file_refs_present"],
        sanity["raw_files_exist"],
        sanity["spread_non_negative"],
        sanity["midpoint_between_bid_ask"],
        sanity["depth_totals_non_negative"],
        sanity["has_up_and_down"],
        gzip_ok,
        sanity["manifest_checksum_matches_output"],
        secret_scan["passed"],
        inputs_checksums_ok,
    )
    if all(gate_checks) and normalized_rows_written > 0:
        gate_status = "PASS"
    else:
        gate_status = "FAIL"
    ended = utc_now()

    manifest = {
        "schema_name": "orderbook_normalization_sample_manifest",
        "schema_version": 1,
        "checkpoint_id": 5,
        "checkpoint_name": "Normalized Snapshot Extract",
        "normalizer": {
            "name": NORMALIZER_NAME,
            "version": NORMALIZER_VERSION,
        },
        "started_at_utc": iso_z(started),
        "ended_at_utc": iso_z(ended),
        "run_duration_seconds": round((ended - started).total_seconds(), 3),
        "command": "scripts/normalize_polymarket_orderbooks.py",
        "input_manifest": {
            "path": str(args.input_manifest),
            "sha256": sha256_file(args.input_manifest),
            "collector_manifest_schema_name": input_manifest.get("schema_name"),
            "collector_gate_status": input_manifest.get("gate_status"),
        },
        "input_files": input_files,
        "output_files": [output_summary],
        "raw_rows_read": raw_rows_read,
        "normalized_rows_written": normalized_rows_written,
        "skipped_rows": len(row_errors),
        "error_rows": row_errors,
        "numeric_encoding": "Exact decimal values are emitted as JSON strings; missing price-derived values are null.",
        "sanity_checks": sanity,
        "secret_scan": secret_scan,
        "warnings": [],
        "known_gaps": [
            "This is a derived sample extract only; raw gzip JSONL remains the source of truth.",
            "No upload, daemon runtime, systemd unit, dashboard, database, strategy, backtest, or trading behavior is included.",
            "The sample proves normalization logic on one bounded raw run, not long-run schema stability.",
        ],
        "fake_progress_risk": "A clean normalized sample can hide raw collection gaps and endpoint schema drift; every row is therefore traceable to raw_file and raw_line_number, and reliability remains gated on later soak testing.",
        "next_step": "Checkpoint 6 should package the raw collector for a VPS runtime, or the orchestrator can request review of this normalized sample first.",
        "gate_status": gate_status,
    }

    args.manifest_path.parent.mkdir(parents=True, exist_ok=True)
    manifest_text = json.dumps(manifest, indent=2, sort_keys=True) + "\n"
    args.manifest_path.write_text(manifest_text, encoding="utf-8")

    console_summary = {
        "gate_status": gate_status,
        "manifest_path": str(args.manifest_path),
        "output_path": str(output_path),
        "raw_rows_read": raw_rows_read,
        "normalized_rows_written": normalized_rows_written,
        "skipped_rows": len(row_errors),
        "sha256": output_summary["sha256"],
    }
    print(json.dumps(console_summary, indent=2, sort_keys=True))
    return 0 if gate_status == "PASS" else 1
|
|
|
|
|
|
if __name__ == "__main__":
    # Use main()'s return value (0 on PASS, 1 otherwise) as the exit status.
    sys.exit(main(sys.argv[1:]))
|