685 lines
34 KiB
Python
Executable file
685 lines
34 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
"""Offline Polymarket websocket book reconstruction and REST comparison.
|
|
|
|
Checkpoint 10C scope: read raw 10B sample files, derive local per-token order
|
|
book state, and compare against REST /books checkpoints. Raw files remain the
|
|
source of truth and are not modified.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import datetime as dt
|
|
import gzip
|
|
import hashlib
|
|
import json
|
|
from copy import deepcopy
|
|
from decimal import Decimal, InvalidOperation
|
|
from pathlib import Path
|
|
from typing import Any
|
|
|
|
|
|
# Identity of this tool; both values are recorded in the output manifest.
RECONSTRUCTOR_NAME = "polymarket_ws_book_reconstructor"
RECONSTRUCTOR_VERSION = "0.1.1"

# Base directories for the default paths below (relative to the working directory).
_MANIFEST_DIR = Path("data") / "manifests"
_DOCS_DIR = Path("docs")

# Default CLI paths: the 10B input manifest, then the 10C manifest, report,
# generated docs, and the root directory for derived gzip JSONL outputs.
DEFAULT_INPUT_MANIFEST = _MANIFEST_DIR / "checkpoint_010b_ws_raw_sample.json"
DEFAULT_MANIFEST_PATH = _MANIFEST_DIR / "checkpoint_010c_book_reconstruction_sample.json"
DEFAULT_REPORT_PATH = Path("reports") / "checkpoints" / "checkpoint_010c_book_reconstruction_sample.md"
DEFAULT_SCHEMA_DOC = _DOCS_DIR / "POLYMARKET_WEBSOCKET_SCHEMA.md"
DEFAULT_RECON_DOC = _DOCS_DIR / "BOOK_RECONSTRUCTION.md"
DEFAULT_OUTPUT_ROOT = Path("data") / "reconstruction_sample"
|
|
|
|
|
|
class BookState:
    """Mutable per-token order book rebuilt from websocket events.

    Price levels are stored as ``{price_string: size}`` maps for each side,
    together with counters describing which events touched this token.
    """

    def __init__(self, token_meta: dict[str, Any]) -> None:
        """Create an empty, uninitialized book for the token described by ``token_meta``."""
        self.token_meta = token_meta
        # Price string -> size, one map per side.
        self.bids: dict[str, Decimal] = {}
        self.asks: dict[str, Decimal] = {}
        # Becomes True once a full book snapshot has been applied.
        self.initialized = False
        self.messages_applied = 0
        self.messages_skipped = 0
        self.unknown_messages = 0
        self.last_update_received_at_utc: str | None = None
        # Per-event-type counters.
        self.book_message_count = 0
        self.price_change_count = 0
        self.best_bid_ask_count = 0
        self.last_trade_price_count = 0
        self.warnings: list[str] = []

    def clone_summary(self, top_n: int) -> dict[str, Any]:
        """Return a point-in-time summary of the book.

        The summary holds counters, best bid/ask, spread, and the ``top_n``
        levels per side (bids highest first, asks lowest first).

        The ``token`` and ``warnings`` entries are copies, so a summary taken
        now does not change when this state is updated later. Summaries are
        collected per REST checkpoint and serialized only after all messages
        have been applied, so returning the live objects would leak later
        warnings into earlier snapshots.
        """
        bids = sorted(self.bids.items(), key=lambda item: Decimal(item[0]), reverse=True)
        asks = sorted(self.asks.items(), key=lambda item: Decimal(item[0]))
        best_bid = bids[0][0] if bids else None
        best_ask = asks[0][0] if asks else None
        spread = dec_to_str(Decimal(best_ask) - Decimal(best_bid)) if best_bid and best_ask else None
        return {
            "token": deepcopy(self.token_meta),
            "initialized": self.initialized,
            "messages_applied": self.messages_applied,
            "messages_skipped": self.messages_skipped,
            "unknown_messages": self.unknown_messages,
            "last_update_received_at_utc": self.last_update_received_at_utc,
            "state_quality": self.state_quality(),
            "bid_level_count": len(self.bids),
            "ask_level_count": len(self.asks),
            "best_bid": best_bid,
            "best_ask": best_ask,
            "spread": spread,
            "top_bids": [{"price": price, "size": dec_to_str(size)} for price, size in bids[:top_n]],
            "top_asks": [{"price": price, "size": dec_to_str(size)} for price, size in asks[:top_n]],
            "event_counters": {
                "book": self.book_message_count,
                "price_change": self.price_change_count,
                "best_bid_ask": self.best_bid_ask_count,
                "last_trade_price": self.last_trade_price_count,
            },
            "warnings": list(self.warnings),
        }

    def state_quality(self) -> str:
        """Classify the book.

        Returns ``"insufficient_events"`` before any snapshot,
        ``"initialized_and_updated"`` once at least one price change has been
        applied, and ``"snapshot_only"`` otherwise.
        """
        if not self.initialized:
            return "insufficient_events"
        if self.price_change_count > 0:
            return "initialized_and_updated"
        return "snapshot_only"
|
|
|
|
|
|
def utc_now() -> dt.datetime:
|
|
return dt.datetime.now(dt.UTC)
|
|
|
|
|
|
def iso_z(value: dt.datetime | None = None) -> str:
|
|
value = value or utc_now()
|
|
return value.astimezone(dt.UTC).replace(microsecond=0).isoformat().replace("+00:00", "Z")
|
|
|
|
|
|
def parse_iso(value: str | None) -> dt.datetime | None:
|
|
if not value:
|
|
return None
|
|
text = value[:-1] + "+00:00" if value.endswith("Z") else value
|
|
try:
|
|
parsed = dt.datetime.fromisoformat(text)
|
|
except ValueError:
|
|
return None
|
|
if parsed.tzinfo is None:
|
|
parsed = parsed.replace(tzinfo=dt.UTC)
|
|
return parsed.astimezone(dt.UTC)
|
|
|
|
|
|
def sha256_file(path: Path) -> str:
    """Return the hex SHA-256 digest of the file at ``path``, read in 1 MiB chunks."""
    hasher = hashlib.sha256()
    block_size = 1024 * 1024
    with path.open("rb") as stream:
        # An empty read signals end of file.
        while chunk := stream.read(block_size):
            hasher.update(chunk)
    return hasher.hexdigest()
|
|
|
|
|
|
def dec(value: Any) -> Decimal:
    """Convert ``value`` to a finite ``Decimal``, falling back to zero.

    ``None``, unparseable input, and non-finite values (``NaN``,
    ``Infinity``) all yield ``Decimal("0")``. Non-finite values are rejected
    because ordering comparisons on a NaN ``Decimal`` raise
    ``InvalidOperation``, which would break the price sorts that consume
    these values.
    """
    if value is None:
        return Decimal("0")
    try:
        # Going through str() avoids binary-float artifacts for float input.
        parsed = Decimal(str(value))
    except InvalidOperation:
        return Decimal("0")
    return parsed if parsed.is_finite() else Decimal("0")
|
|
|
|
|
|
def dec_to_str(value: Decimal) -> str:
    """Render ``value`` in plain fixed-point notation without trailing fractional zeros."""
    rendered = f"{value:f}"
    if "." not in rendered:
        return rendered or "0"
    # Drop trailing zeros first, then a dangling decimal point ("1.50" -> "1.5", "2.0" -> "2").
    trimmed = rendered.rstrip("0").rstrip(".")
    return trimmed or "0"
|
|
|
|
|
|
def level_map(levels: Any) -> dict[str, Decimal]:
    """Build a ``{normalized_price: size}`` map from a list of level objects.

    Input that is not a list yields an empty map. Entries that are not dicts
    are ignored, and so are entries with no ``price``: previously a missing
    price was stringified to ``"None"`` and collapsed into a phantom level at
    price ``"0"``. Prices are normalized through ``dec``/``dec_to_str`` so
    equivalent spellings (``"0.50"`` and ``"0.5"``) share one key. When a
    price appears more than once, the last entry wins.

    Note: a price that is present but unparseable still normalizes to ``"0"``
    because ``dec`` maps invalid input to zero.
    """
    result: dict[str, Decimal] = {}
    if not isinstance(levels, list):
        return result
    for item in levels:
        if not isinstance(item, dict):
            continue
        raw_price = item.get("price")
        if raw_price is None:
            # A level without a price cannot be placed in the book.
            continue
        result[dec_to_str(dec(str(raw_price)))] = dec(item.get("size"))
    return result
|
|
|
|
|
|
def classify_item(item: Any) -> str:
    """Return the event label for a websocket payload item.

    Non-dict items are labelled with their Python type name. Dicts with a
    truthy ``event_type`` use that value as a string. Dicts without one that
    carry all book-snapshot keys are labelled ``"book_without_event_type"``.
    Everything else is ``"unknown_object"``.
    """
    if not isinstance(item, dict):
        return type(item).__name__
    if declared := item.get("event_type"):
        return str(declared)
    snapshot_keys = {"market", "asset_id", "bids", "asks", "timestamp"}
    if snapshot_keys <= item.keys():
        return "book_without_event_type"
    return "unknown_object"
|
|
|
|
|
|
def raw_event_items(row: dict[str, Any]) -> list[dict[str, Any]]:
    """Return the dict items in ``row["json"]``, whether it holds one object or a list of them."""
    payload = row.get("json")
    if isinstance(payload, dict):
        return [payload]
    if isinstance(payload, list):
        return [entry for entry in payload if isinstance(entry, dict)]
    # Missing payloads and scalars carry no event objects.
    return []
|
|
|
|
|
|
def apply_book_item(state: BookState, item: dict[str, Any], received_at_utc: str) -> None:
    """Replace both sides of ``state`` with the snapshot in ``item`` and mark it initialized."""
    # A snapshot replaces the whole book; nothing is merged with prior levels.
    for side in ("bids", "asks"):
        setattr(state, side, level_map(item.get(side)))
    state.initialized = True
    state.messages_applied += 1
    state.book_message_count += 1
    state.last_update_received_at_utc = received_at_utc
|
|
|
|
|
|
def apply_price_change(state_by_token: dict[str, BookState], item: dict[str, Any], received_at_utc: str, warnings: list[str]) -> None:
|
|
changes = item.get("price_changes")
|
|
if not isinstance(changes, list):
|
|
warnings.append("price_change event without price_changes list")
|
|
return
|
|
for change in changes:
|
|
if not isinstance(change, dict):
|
|
continue
|
|
token_id = str(change.get("asset_id") or "")
|
|
state = state_by_token.get(token_id)
|
|
if state is None:
|
|
continue
|
|
if not state.initialized:
|
|
state.messages_skipped += 1
|
|
state.warnings.append("price_change skipped before initial book snapshot")
|
|
continue
|
|
side = str(change.get("side") or "").upper()
|
|
price = dec_to_str(dec(change.get("price")))
|
|
size = dec(change.get("size"))
|
|
if side == "BUY":
|
|
book_side = state.bids
|
|
elif side == "SELL":
|
|
book_side = state.asks
|
|
else:
|
|
state.messages_skipped += 1
|
|
state.warnings.append(f"unsupported price_change side {side!r}")
|
|
continue
|
|
if size == 0:
|
|
book_side.pop(price, None)
|
|
else:
|
|
book_side[price] = size
|
|
state.messages_applied += 1
|
|
state.price_change_count += 1
|
|
state.last_update_received_at_utc = received_at_utc
|
|
|
|
|
|
def apply_ws_row(
    row: dict[str, Any],
    state_by_token: dict[str, BookState],
    event_type_counts: dict[str, int],
    unsupported_counts: dict[str, int],
    warnings: list[str],
) -> None:
    """Apply every event item in one raw websocket row to the tracked books.

    Book snapshots and price changes mutate book state. ``best_bid_ask`` and
    ``last_trade_price`` are only counted. Any other event type is counted as
    unsupported and also increments ``unknown_messages`` on every tracked
    token.
    """

    def bump(counter: dict[str, int], key: str) -> None:
        counter[key] = counter.get(key, 0) + 1

    # Event types that are counted per token but never change the level maps.
    passive_counters = {
        "best_bid_ask": "best_bid_ask_count",
        "last_trade_price": "last_trade_price_count",
    }

    received_at_utc = row.get("received_at_utc")
    for item in raw_event_items(row):
        event_type = classify_item(item)
        bump(event_type_counts, event_type)

        if event_type in ("book", "book_without_event_type"):
            state = state_by_token.get(str(item.get("asset_id") or ""))
            if state is None:
                bump(unsupported_counts, "book_for_untracked_token")
            else:
                apply_book_item(state, item, received_at_utc)
        elif event_type == "price_change":
            apply_price_change(state_by_token, item, received_at_utc, warnings)
        elif event_type in passive_counters:
            state = state_by_token.get(str(item.get("asset_id") or ""))
            if state is not None:
                attribute = passive_counters[event_type]
                setattr(state, attribute, getattr(state, attribute) + 1)
                state.messages_skipped += 1
            bump(unsupported_counts, event_type)
        else:
            bump(unsupported_counts, event_type)
            for tracked in state_by_token.values():
                tracked.unknown_messages += 1
|
|
|
|
|
|
def top_levels(book: dict[str, Decimal], side: str, top_n: int) -> list[tuple[str, Decimal]]:
    """Return the first ``top_n`` ``(price, size)`` pairs, best price first.

    ``side == "bids"`` sorts prices high to low; any other value sorts low to high.
    """
    descending = side == "bids"
    ordered = sorted(book.items(), key=lambda level: Decimal(level[0]), reverse=descending)
    return ordered[:top_n]
|
|
|
|
|
|
def rest_book_from_item(item: dict[str, Any]) -> dict[str, Any]:
    """Convert one REST book object into ``{"bids": level_map, "asks": level_map}``."""
    return {side: level_map(item.get(side)) for side in ("bids", "asks")}
|
|
|
|
|
|
def book_summary_from_maps(bids: dict[str, Decimal], asks: dict[str, Decimal], top_n: int) -> dict[str, Any]:
    """Summarize a pair of level maps: best prices, spread, level counts, and top-N levels."""

    def render(levels: list[tuple[str, Decimal]]) -> list[dict[str, str]]:
        return [{"price": price, "size": dec_to_str(size)} for price, size in levels]

    bid_levels = top_levels(bids, "bids", top_n)
    ask_levels = top_levels(asks, "asks", top_n)
    best_bid = bid_levels[0][0] if bid_levels else None
    best_ask = ask_levels[0][0] if ask_levels else None

    # A spread needs a best price on both sides.
    spread = None
    if best_bid and best_ask:
        spread = dec_to_str(Decimal(best_ask) - Decimal(best_bid))

    return {
        "best_bid": best_bid,
        "best_ask": best_ask,
        "spread": spread,
        "bid_level_count": len(bids),
        "ask_level_count": len(asks),
        "top_bids": render(bid_levels),
        "top_asks": render(ask_levels),
    }
|
|
|
|
|
|
def compare_side(local: dict[str, Decimal], rest: dict[str, Decimal], side: str, top_n: int) -> dict[str, Any]:
    """Diff the top-N levels of one book side between local state and REST.

    Returns prices found only in the REST top-N (``missing_prices``), prices
    found only in the local top-N (``extra_prices``), and a size delta for
    each shared price whose sizes differ. Every list is ordered best price
    first for the given side.
    """
    descending = side == "bids"
    local_top = dict(top_levels(local, side, top_n))
    rest_top = dict(top_levels(rest, side, top_n))
    local_prices = set(local_top)
    rest_prices = set(rest_top)

    def ordered(prices: set[str]) -> list[str]:
        return sorted(prices, key=Decimal, reverse=descending)

    size_deltas = [
        {
            "price": price,
            "local_size": dec_to_str(local_top[price]),
            "rest_size": dec_to_str(rest_top[price]),
            "delta": dec_to_str(delta),
        }
        for price in ordered(local_prices & rest_prices)
        if (delta := local_top[price] - rest_top[price]) != 0
    ]
    return {
        "missing_prices": ordered(rest_prices - local_prices),
        "extra_prices": ordered(local_prices - rest_prices),
        "size_deltas": size_deltas,
    }
|
|
|
|
|
|
def compare_books(state: BookState, rest_item: dict[str, Any], top_n: int) -> dict[str, Any]:
    """Compare the local book in ``state`` with one REST book object.

    ``comparison_status`` is ``"match"`` only when best bid and best ask agree
    and neither side has missing prices, extra prices, or size deltas within
    the top-N. Otherwise it is ``"divergent"``.
    """
    rest_maps = rest_book_from_item(rest_item)
    local_summary = book_summary_from_maps(state.bids, state.asks, top_n)
    rest_summary = book_summary_from_maps(rest_maps["bids"], rest_maps["asks"], top_n)
    bid_cmp = compare_side(state.bids, rest_maps["bids"], "bids", top_n)
    ask_cmp = compare_side(state.asks, rest_maps["asks"], "asks", top_n)

    def same(field: str) -> bool:
        return local_summary[field] == rest_summary[field]

    best_bid_match = same("best_bid")
    best_ask_match = same("best_ask")
    top_n_clean = not any(
        diff[part]
        for diff in (bid_cmp, ask_cmp)
        for part in ("missing_prices", "extra_prices", "size_deltas")
    )
    status = "match" if best_bid_match and best_ask_match and top_n_clean else "divergent"

    return {
        "comparison_status": status,
        "best_bid_match": best_bid_match,
        "best_ask_match": best_ask_match,
        "spread_match": same("spread"),
        "level_count_match": same("bid_level_count") and same("ask_level_count"),
        "local": local_summary,
        "rest": rest_summary,
        "bid_top_n_diff": bid_cmp,
        "ask_top_n_diff": ask_cmp,
    }
|
|
|
|
|
|
def read_gzip_jsonl(path: Path) -> list[tuple[int, dict[str, Any]]]:
    """Read a gzip JSONL file into ``(line_number, parsed_json)`` pairs.

    Line numbers are 1-based and count blank lines, which are otherwise skipped.
    """
    with gzip.open(path, "rt", encoding="utf-8") as handle:
        return [
            (number, json.loads(text))
            for number, text in enumerate(handle, start=1)
            if text.strip()
        ]
|
|
|
|
|
|
def write_gzip_jsonl(path: Path, rows: list[dict[str, Any]]) -> None:
    """Write ``rows`` to ``path`` as gzip JSONL, creating parent directories as needed.

    Each row is one compact JSON object per line with its keys sorted.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    with gzip.open(path, "wt", encoding="utf-8") as handle:
        handle.writelines(
            json.dumps(row, separators=(",", ":"), sort_keys=True) + "\n"
            for row in rows
        )
|
|
|
|
|
|
def summarize_file(path: Path, rows: int, kind: str) -> dict[str, Any]:
    """Describe an output file for the manifest: path, kind, size, row count, hash, status.

    ``status`` is ``"valid"`` for an existing non-empty file and
    ``"missing_or_empty"`` otherwise. A missing file is reported with
    ``bytes`` 0 and ``sha256`` ``None`` instead of raising. Previously the
    size and hash were computed before the existence check, so the
    missing-file case could never be reported.
    """
    try:
        size = path.stat().st_size
    except FileNotFoundError:
        return {
            "path": path.as_posix(),
            "kind": kind,
            "bytes": 0,
            "rows": rows,
            "sha256": None,
            "status": "missing_or_empty",
        }
    return {
        "path": path.as_posix(),
        "kind": kind,
        "bytes": size,
        "rows": rows,
        "sha256": sha256_file(path),
        "status": "valid" if size else "missing_or_empty",
    }
|
|
|
|
|
|
def write_schema_docs(path: Path, schema_summary: dict[str, Any]) -> None:
    """Write the observed-schema markdown document to ``path``.

    ``schema_summary`` maps each event type to a dict with ``count``,
    ``fields``, and optionally ``level_fields`` and ``notes``. Event types are
    listed in sorted order. Parent directories are created as needed.
    """
    out: list[str] = [
        "# Polymarket Websocket Schema Observed In Checkpoint 10B",
        "",
        "This document summarizes observed public market websocket message shapes from the bounded 10B BTC sample. It does not include full raw payload dumps; raw payloads remain in the gzip JSONL sample files.",
        "",
        "## Observed Event Types",
        "",
    ]
    for event_type in sorted(schema_summary):
        info = schema_summary[event_type]
        out += [
            f"### {event_type}",
            "",
            f"Count: `{info['count']}`",
            "",
            f"Observed top-level fields: `{', '.join(info['fields'])}`",
            "",
        ]
        if info.get("level_fields"):
            out += [f"Nested level/change fields: `{', '.join(info['level_fields'])}`", ""]
        out += [info.get("notes") or "No additional notes.", ""]

    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text("\n".join(out), encoding="utf-8")
|
|
|
|
|
|
def write_reconstruction_doc(path: Path) -> None:
    """Write the static book-reconstruction method document to ``path``.

    The text is fixed and does not depend on the current run. Parent
    directories are created as needed.
    """
    applied_events = "\n".join(
        (
            "- `book` and `book_without_event_type` messages initialize or replace the full per-token bid/ask maps.",
            "- `price_change` messages are applied after initialization. Observed `side=BUY` updates bids and `side=SELL` updates asks.",
            "- Observed `size=0` is treated as level removal. Non-zero size replaces the level size at that price.",
            "- `best_bid_ask`, `last_trade_price`, and unrelated `new_market` messages are preserved and counted but do not mutate the book map.",
        )
    )
    # Each entry is one markdown paragraph; paragraphs are separated by a blank line.
    paragraphs = (
        "# Book Reconstruction Method",
        "Checkpoint 10C reconstructs order-book state from raw Polymarket market websocket messages captured in Checkpoint 10B.",
        "## Source Of Truth",
        "Raw websocket and REST checkpoint gzip JSONL files are immutable source evidence. Reconstruction outputs are derived and reference the input file paths, line numbers, websocket message sequence spans, and REST checkpoint sequences.",
        "## Applied Events",
        applied_events,
        "## Comparison",
        "For each REST checkpoint, the reconstructor compares REST `/books` payloads with local websocket state after applying all websocket messages received at or before the REST checkpoint receive time. The comparison includes best bid, best ask, spread, bid/ask level counts, and top 10 levels by default.",
        "## Limits",
        "The sample is short and network timing can produce REST-vs-websocket divergences. Divergence rows include raw websocket and REST references so follow-up can inspect whether differences are timing, feed semantics, or reconstruction defects.",
        "## Checkpoint 10C Divergence Result",
        "The accepted 10C sample produced 20 REST comparison rows: 8 exact top-10 matches and 12 divergent rows. In every divergent row, best bid, best ask, spread, level counts, and top-N price membership matched. The observed divergences were size-only deltas within shared top-N price levels.",
        "Size-only divergence still matters. It can change depth, fillability assumptions, queue-size estimates, and any later answer about whether a hypothetical trade was observable and reproducible from the archived feed.",
        "This result is useful evidence for the websocket path, but it is not production readiness. The sample is bounded, the timing relationship between REST checkpoints and websocket delivery is imperfect, and long-running reconnect, stale-feed, rotation, upload, and alert behavior still need their own checkpoint before deployment.",
    )
    document = "\n\n".join(paragraphs) + "\n"
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(document, encoding="utf-8")
|
|
|
|
|
|
def write_report(path: Path, manifest: dict[str, Any]) -> None:
    """Render the checkpoint 10C markdown report from ``manifest`` and write it to ``path``.

    Reads the gate status, input details, event counts, per-token
    reconstruction statuses, comparison totals, output files, warnings, and
    readiness finding from the manifest. Parent directories are created as
    needed.
    """
    report: list[str] = []
    add = report.extend

    add([
        "# Checkpoint 10C Book Reconstruction Sample",
        "",
        f"Status: {manifest['gate_status']} ",
        f"Created: {manifest['ended_at_utc']} ",
        "Production ready: no ",
        "Live Kubernetes collector modified: no",
        "",
    ])

    source = manifest["input"]
    add([
        "## Input",
        "",
        f"- 10B manifest: `{source['manifest_path']}`.",
        f"- 10B gate: `{source['gate_status']}`.",
        f"- Run id: `{manifest['run_id']}`.",
        "",
    ])

    add([
        "## Observed Websocket Events",
        "",
        f"Event type counts: `{json.dumps(manifest['event_type_counts'], sort_keys=True)}`.",
        f"Unsupported/non-mutating counts: `{json.dumps(manifest['unsupported_event_counts'], sort_keys=True)}`.",
        "",
    ])

    add(["## Reconstruction Status", ""])
    for token_id, status in manifest["token_reconstruction_statuses"].items():
        meta = status.get("token", {})
        report.append(
            f"- `{token_id}` ({meta.get('market_slug')} {meta.get('outcome')}): `{status['state_quality']}`, initialized `{status['initialized']}`, applied `{status['messages_applied']}`, skipped `{status['messages_skipped']}`, unknown `{status['unknown_messages']}`."
        )

    totals = manifest["comparison_summary"]
    add([
        "",
        "## REST Comparison",
        "",
        f"- Comparison rows: `{totals['comparison_count']}`.",
        f"- Matches: `{totals['match_count']}`.",
        f"- Divergences: `{totals['divergent_count']}`.",
        f"- No-state rows: `{totals['no_state_count']}`.",
        "",
        "Divergence samples are preserved in the machine-readable manifest with raw websocket and REST references.",
        "",
    ])

    add(["## Output Files", ""])
    add(
        f"- `{output['path']}`: `{output['kind']}`, rows `{output['rows']}`, bytes `{output['bytes']}`, sha256 `{output['sha256']}`"
        for output in manifest["output_files"]
    )

    add(["", "## Assumptions And Warnings", ""])
    recorded_warnings = manifest["warnings"]
    if recorded_warnings:
        add(f"- {warning}" for warning in recorded_warnings)
    else:
        report.append("- None.")

    add([
        "",
        "## Gate",
        "",
        manifest["gate_status"],
        "",
        "## Full-Fidelity Readiness Finding",
        "",
        manifest["readiness_finding"],
        "",
        "## Strongest Fake Progress Risk",
        "",
        "A reconstruction script can look correct while silently ignoring unsupported message semantics. This sample records unsupported event counts and comparison divergences with raw references so the next deployment step has audit evidence.",
        "",
        "## Next Smallest Step",
        "",
        "If combined 10B/10C passes, plan 10D: convert this proven sample path into a long-running Kubernetes websocket recorder with rotation, reconnect/stale-feed evidence, REST checkpoint recovery, upload cleanup, and migration from REST-only collection.",
        "",
    ])

    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text("\n".join(report), encoding="utf-8")
|
|
|
|
|
|
def _require_output_entry(input_manifest: dict[str, Any], kind: str) -> dict[str, Any]:
    """Return the first 10B ``output_files`` entry of ``kind``, or raise ``RuntimeError``."""
    for entry in input_manifest["output_files"]:
        if entry["kind"] == kind:
            return entry
    raise RuntimeError(f"10B manifest lists no output file of kind {kind!r}")


def run_reconstruction(args: argparse.Namespace) -> dict[str, Any]:
    """Rebuild per-token books from the 10B sample and compare them with REST checkpoints.

    Steps:
      1. Load the 10B manifest and require gate ``WS_RAW_SAMPLE_PASS``.
      2. Read the raw websocket and REST checkpoint gzip JSONL files.
      3. Summarize the observed websocket message shapes.
      4. Walk the REST checkpoints in receive-time order. Before each one,
         apply every websocket row received at or before it, then record one
         comparison row per tracked token and one state snapshot per token.
      5. Apply the remaining websocket rows, write the derived gzip JSONL
         outputs, the 10C manifest, the two docs, and the report.

    Args:
        args: Parsed CLI namespace providing ``input_manifest``,
            ``output_root``, ``manifest_path``, ``report_path``,
            ``schema_doc_path``, ``reconstruction_doc_path`` and ``top_n``.

    Returns:
        The 10C manifest dict that was written to ``args.manifest_path``.

    Raises:
        RuntimeError: If the 10B gate is not ``WS_RAW_SAMPLE_PASS`` or the 10B
            manifest lists no websocket / REST output file.
    """
    started_at_utc = iso_z()
    input_manifest_path = args.input_manifest
    input_manifest = json.loads(input_manifest_path.read_text(encoding="utf-8"))
    if input_manifest.get("gate_status") != "WS_RAW_SAMPLE_PASS":
        raise RuntimeError("10C requires 10B gate_status WS_RAW_SAMPLE_PASS")
    run_id = input_manifest["run_id"]
    # Look each entry up once; a missing kind is a clear error, not a bare StopIteration.
    ws_entry = _require_output_entry(input_manifest, "raw_websocket_messages")
    rest_entry = _require_output_entry(input_manifest, "rest_books_checkpoints")
    ws_file = Path(ws_entry["path"])
    rest_file = Path(rest_entry["path"])
    ws_rows = read_gzip_jsonl(ws_file)
    rest_rows = read_gzip_jsonl(rest_file)
    token_meta = {token["token_id"]: token for token in input_manifest["tokens_tracked"]}
    state_by_token = {token_id: BookState(meta) for token_id, meta in token_meta.items()}
    event_type_counts: dict[str, int] = {}
    unsupported_counts: dict[str, int] = {}
    warnings: list[str] = ["Observed price_change side semantics are assumed as BUY->bids and SELL->asks.", "Observed size=0 is treated as level removal."]
    schema_summary: dict[str, dict[str, Any]] = {}

    # Built once instead of on every observed item.
    schema_notes = {
        "book": "Full per-token book snapshot used to initialize or replace local state.",
        "book_without_event_type": "Full per-token book snapshot without event_type; treated like book if observed.",
        "price_change": "Incremental price/size updates applied after a token has an initialized book.",
        "best_bid_ask": "Best quote summary; counted but not applied to level maps.",
        "last_trade_price": "Trade print summary; counted but not applied to level maps.",
        "new_market": "Market metadata broadcast; preserved and counted but unrelated to selected BTC token state in this sample.",
    }

    def observe_schema(item: dict[str, Any], event_type: str) -> None:
        """Record the field names seen on ``item`` under its event type."""
        info = schema_summary.setdefault(event_type, {"count": 0, "fields": set(), "level_fields": set(), "notes": ""})
        info["count"] += 1
        info["fields"].update(str(k) for k in item.keys())
        for key in ("bids", "asks", "price_changes"):
            values = item.get(key)
            if isinstance(values, list):
                # Sampling the first 20 nested entries is enough to learn the field names.
                for nested in values[:20]:
                    if isinstance(nested, dict):
                        info["level_fields"].update(str(k) for k in nested.keys())
        info["notes"] = schema_notes.get(event_type, "Unsupported/unknown shape preserved and counted.")

    for _line, row in ws_rows:
        for item in raw_event_items(row):
            observe_schema(item, classify_item(item))
    for info in schema_summary.values():
        # Sets are not JSON-serializable; store sorted lists in the manifest.
        info["fields"] = sorted(info["fields"])
        info["level_fields"] = sorted(info["level_fields"])

    comparisons: list[dict[str, Any]] = []
    state_snapshots: list[dict[str, Any]] = []
    ws_index = 0
    first_applied_seq: int | None = None
    last_applied_seq: int | None = None
    first_applied_line: int | None = None
    last_applied_line: int | None = None
    last_applied_received_at_utc: str | None = None
    # Rows without a parseable timestamp sort first.
    earliest = dt.datetime.min.replace(tzinfo=dt.UTC)
    rest_sorted = sorted(rest_rows, key=lambda item: parse_iso(item[1].get("received_at_utc")) or earliest)
    ws_sorted = sorted(ws_rows, key=lambda item: (parse_iso(item[1].get("received_at_utc")) or earliest, item[1].get("global_message_sequence") or 0))
    for rest_line, rest_row in rest_sorted:
        rest_time = parse_iso(rest_row.get("received_at_utc"))
        # Catch the local books up to this checkpoint's receive time.
        while ws_index < len(ws_sorted):
            ws_line, ws_row = ws_sorted[ws_index]
            ws_time = parse_iso(ws_row.get("received_at_utc"))
            if rest_time is not None and ws_time is not None and ws_time > rest_time:
                break
            apply_ws_row(ws_row, state_by_token, event_type_counts, unsupported_counts, warnings)
            seq = ws_row.get("global_message_sequence")
            if isinstance(seq, int):
                first_applied_seq = seq if first_applied_seq is None else min(first_applied_seq, seq)
                last_applied_seq = seq if last_applied_seq is None else max(last_applied_seq, seq)
            first_applied_line = ws_line if first_applied_line is None else min(first_applied_line, ws_line)
            last_applied_line = ws_line
            last_applied_received_at_utc = ws_row.get("received_at_utc")
            ws_index += 1
        rest_payload = (rest_row.get("response") or {}).get("raw_response_json")
        if not isinstance(rest_payload, list):
            warnings.append(f"REST checkpoint {rest_row.get('checkpoint_sequence')} payload was not a list")
            continue
        for rest_item in rest_payload:
            if not isinstance(rest_item, dict):
                continue
            token_id = str(rest_item.get("asset_id") or "")
            if token_id not in state_by_token:
                continue
            state = state_by_token[token_id]
            base = {
                "run_id": run_id,
                "token_id": token_id,
                "market": state.token_meta,
                "rest_checkpoint_sequence": rest_row.get("checkpoint_sequence"),
                "rest_checkpoint_received_at_utc": rest_row.get("received_at_utc"),
                "rest_checkpoint_file": rest_file.as_posix(),
                "rest_checkpoint_line": rest_line,
                "raw_websocket_file": ws_file.as_posix(),
                "applied_ws_message_count": ws_index,
                "applied_ws_line_span": [first_applied_line, last_applied_line],
                "applied_ws_global_sequence_span": [first_applied_seq, last_applied_seq],
                "last_applied_ws_line": last_applied_line,
                "last_applied_ws_received_at_utc": last_applied_received_at_utc,
                "last_local_update_received_at_utc": state.last_update_received_at_utc,
                "state_quality": state.state_quality(),
            }
            if not state.initialized:
                comp = {**base, "comparison_status": "no_state", "reason": "token not initialized by preceding websocket book event"}
            else:
                comp = {**base, **compare_books(state, rest_item, args.top_n)}
            comparisons.append(comp)
        for token_id, state in state_by_token.items():
            state_snapshots.append({
                "run_id": run_id,
                "snapshot_basis": "after_websocket_messages_preceding_rest_checkpoint",
                "rest_checkpoint_sequence": rest_row.get("checkpoint_sequence"),
                "rest_checkpoint_received_at_utc": rest_row.get("received_at_utc"),
                "raw_websocket_file": ws_file.as_posix(),
                "applied_ws_message_count": ws_index,
                "applied_ws_line_span": [first_applied_line, last_applied_line],
                "applied_ws_global_sequence_span": [first_applied_seq, last_applied_seq],
                "last_applied_ws_line": last_applied_line,
                "last_applied_ws_received_at_utc": last_applied_received_at_utc,
                **state.clone_summary(args.top_n),
            })

    # Apply remaining websocket messages for final token statuses.
    while ws_index < len(ws_sorted):
        _ws_line, ws_row = ws_sorted[ws_index]
        apply_ws_row(ws_row, state_by_token, event_type_counts, unsupported_counts, warnings)
        ws_index += 1

    output_dir = args.output_root / "polymarket" / "books" / run_id
    comparison_dir = args.output_root / "polymarket" / "comparisons" / run_id
    state_file = output_dir / f"polymarket_reconstructed_books_{run_id}.jsonl.gz"
    comparison_file = comparison_dir / f"polymarket_rest_comparison_{run_id}.jsonl.gz"
    write_gzip_jsonl(state_file, state_snapshots)
    write_gzip_jsonl(comparison_file, comparisons)

    statuses = {token_id: state.clone_summary(args.top_n) for token_id, state in state_by_token.items()}
    market_token_init: dict[str, list[bool]] = {}
    for state in state_by_token.values():
        market_token_init.setdefault(str(state.token_meta.get("condition_id")), []).append(state.initialized)
    any_market_both_initialized = any(len(values) >= 2 and all(values[:2]) for values in market_token_init.values())
    match_count = sum(1 for row in comparisons if row.get("comparison_status") == "match")
    divergent_count = sum(1 for row in comparisons if row.get("comparison_status") == "divergent")
    no_state_count = sum(1 for row in comparisons if row.get("comparison_status") == "no_state")
    if not any_market_both_initialized:
        gate = "BLOCKED_INSUFFICIENT_WS_EVENTS"
    elif not comparisons:
        gate = "BLOCKED_REST_COMPARISON"
    elif no_state_count == len(comparisons):
        gate = "BLOCKED_INSUFFICIENT_WS_EVENTS"
    else:
        gate = "BOOK_RECONSTRUCTION_SAMPLE_PASS"
    if divergent_count:
        warnings.append("REST comparison divergences were observed and are preserved with raw references; timing differences are possible in this short live sample.")

    # Hash the raw inputs and flag any drift from the hashes recorded by 10B.
    # The manifest already stores both values; comparing them makes a changed
    # or replaced raw file visible instead of silently trusted.
    raw_inputs: list[dict[str, Any]] = []
    for raw_path, entry in ((ws_file, ws_entry), (rest_file, rest_entry)):
        actual_sha256 = sha256_file(raw_path)
        expected_sha256 = entry["sha256"]
        if actual_sha256 != expected_sha256:
            warnings.append(f"Raw input {raw_path.as_posix()} sha256 does not match the 10B manifest; the file may have changed since capture.")
        raw_inputs.append({"path": raw_path.as_posix(), "kind": entry["kind"], "sha256": actual_sha256, "expected_sha256": expected_sha256})

    readiness_finding = (
        "The sample proves that observed websocket `book` snapshots can initialize local state and `price_change` messages can update it offline. REST comparisons executed with raw references; divergences require review before a live websocket recorder replaces REST-only collection."
        if gate == "BOOK_RECONSTRUCTION_SAMPLE_PASS"
        else "The sample did not prove enough websocket reconstruction behavior for a live recorder migration."
    )
    manifest = {
        "schema_name": "checkpoint_010c_book_reconstruction_sample",
        "schema_version": 1,
        "checkpoint_id": "10C",
        "checkpoint_name": "Offline Book Reconstruction And REST Comparison Sample",
        "gate_status": gate,
        "production_ready": False,
        "live_kubernetes_collector_modified": False,
        "reconstructor": {"name": RECONSTRUCTOR_NAME, "version": RECONSTRUCTOR_VERSION},
        "started_at_utc": started_at_utc,
        "ended_at_utc": iso_z(),
        "run_id": run_id,
        "top_n": args.top_n,
        "input": {
            "manifest_path": input_manifest_path.as_posix(),
            "manifest_sha256": sha256_file(input_manifest_path),
            "gate_status": input_manifest.get("gate_status"),
            "raw_files": raw_inputs,
        },
        "output_files": [summarize_file(state_file, len(state_snapshots), "reconstructed_book_state_snapshots"), summarize_file(comparison_file, len(comparisons), "rest_comparison_rows")],
        "event_type_counts": dict(sorted(event_type_counts.items())),
        "observed_schema_summary": schema_summary,
        "unsupported_event_counts": dict(sorted(unsupported_counts.items())),
        "token_reconstruction_statuses": statuses,
        "comparison_summary": {"comparison_count": len(comparisons), "match_count": match_count, "divergent_count": divergent_count, "no_state_count": no_state_count, "divergence_samples": [row for row in comparisons if row.get("comparison_status") == "divergent"][:10]},
        "assumptions": ["BUY price_change updates bids; SELL price_change updates asks.", "size=0 removes a level; non-zero size replaces that price level.", "REST checkpoint comparison uses websocket state after messages received at or before REST checkpoint received_at_utc."],
        "warnings": sorted(set(warnings)),
        "readiness_finding": readiness_finding,
        "strongest_fake_progress_risk": "Ignoring unsupported websocket events or REST divergences would overstate full-fidelity readiness.",
        "next_step": "If combined 10B/10C passes, plan 10D long-running Kubernetes websocket recorder with rotation, reconnect/stale-feed evidence, REST checkpoint recovery, upload cleanup, and migration plan.",
    }
    args.manifest_path.parent.mkdir(parents=True, exist_ok=True)
    args.manifest_path.write_text(json.dumps(manifest, indent=2, sort_keys=True) + "\n", encoding="utf-8")
    write_schema_docs(args.schema_doc_path, schema_summary)
    write_reconstruction_doc(args.reconstruction_doc_path)
    write_report(args.report_path, manifest)
    return manifest
|
|
|
|
|
|
def parse_args() -> argparse.Namespace:
|
|
parser = argparse.ArgumentParser(description="Reconstruct Polymarket websocket books and compare to REST checkpoints.")
|
|
parser.add_argument("--input-manifest", type=Path, default=DEFAULT_INPUT_MANIFEST)
|
|
parser.add_argument("--output-root", type=Path, default=DEFAULT_OUTPUT_ROOT)
|
|
parser.add_argument("--manifest-path", type=Path, default=DEFAULT_MANIFEST_PATH)
|
|
parser.add_argument("--report-path", type=Path, default=DEFAULT_REPORT_PATH)
|
|
parser.add_argument("--schema-doc-path", type=Path, default=DEFAULT_SCHEMA_DOC)
|
|
parser.add_argument("--reconstruction-doc-path", type=Path, default=DEFAULT_RECON_DOC)
|
|
parser.add_argument("--top-n", type=int, default=10)
|
|
return parser.parse_args()
|
|
|
|
|
|
def main() -> int:
    """Run the reconstruction from the command line.

    Prints the manifest path, report path, and gate status. Returns 0 when the
    gate is a pass or needs-review status, 1 otherwise.
    """
    args = parse_args()
    manifest = run_reconstruction(args)

    locations = {
        "RECONSTRUCTION_MANIFEST": args.manifest_path,
        "RECONSTRUCTION_REPORT": args.report_path,
    }
    for label, location in locations.items():
        print(f"{label}={location}")

    gate = manifest["gate_status"]
    print(f"RECONSTRUCTION_GATE={gate}")

    acceptable = {"BOOK_RECONSTRUCTION_SAMPLE_PASS", "BOOK_RECONSTRUCTION_NEEDS_REVIEW"}
    return int(gate not in acceptable)


if __name__ == "__main__":
    raise SystemExit(main())
|