# orderbooks/scripts/analyze_polymarket_ws_divergences.py

#!/usr/bin/env python3
"""Analyze Checkpoint 10C REST-vs-websocket divergence rows.

This is an offline evidence tool for Checkpoint 10D0. It reads existing raw
websocket, REST checkpoint, and comparison artifacts. It does not contact
Kubernetes or Polymarket and does not modify raw inputs.
"""

from __future__ import annotations

import argparse
import datetime as dt
import gzip
import hashlib
import json
from bisect import bisect_right
from collections import Counter
from pathlib import Path
from typing import Any


# Identity of this tool; both values are recorded in the "analyzer" section of the output manifest.
ANALYZER_NAME = "polymarket_ws_divergence_analyzer"
ANALYZER_VERSION = "0.1.0"
# Default locations of the input manifests (relative paths; used as argparse defaults).
DEFAULT_10B_MANIFEST = Path("data/manifests/checkpoint_010b_ws_raw_sample.json")
DEFAULT_10C_MANIFEST = Path("data/manifests/checkpoint_010c_book_reconstruction_sample.json")
DEFAULT_10BC_MANIFEST = Path("data/manifests/checkpoint_010bc_full_fidelity_sample_and_reconstruction.json")
DEFAULT_ORCHESTRATOR_REVIEW = Path("data/manifests/checkpoint_010bc_orchestrator_review.json")
# Default locations of the two artifacts this script writes: the JSON manifest and the Markdown report.
DEFAULT_OUTPUT_MANIFEST = Path("data/manifests/checkpoint_010d0_ws_divergence_analysis.json")
DEFAULT_OUTPUT_REPORT = Path("reports/checkpoints/checkpoint_010d0_ws_divergence_analysis.md")


def utc_now() -> dt.datetime:
    return dt.datetime.now(dt.UTC)


def iso_z(value: dt.datetime | None = None) -> str:
    value = value or utc_now()
    return value.astimezone(dt.UTC).replace(microsecond=0).isoformat().replace("+00:00", "Z")


def parse_iso(value: str | None) -> dt.datetime | None:
    if not value:
        return None
    text = value[:-1] + "+00:00" if value.endswith("Z") else value
    try:
        parsed = dt.datetime.fromisoformat(text)
    except ValueError:
        return None
    if parsed.tzinfo is None:
        parsed = parsed.replace(tzinfo=dt.UTC)
    return parsed.astimezone(dt.UTC)


def sha256_file(path: Path) -> str:
    """Return the hexadecimal SHA-256 digest of the file at *path*.

    The file is read in 1 MiB pieces so it is never held in memory whole.
    """
    piece_size = 1024 * 1024
    hasher = hashlib.sha256()
    with path.open("rb") as stream:
        while piece := stream.read(piece_size):
            hasher.update(piece)
    return hasher.hexdigest()


def read_json(path: Path) -> dict[str, Any]:
    """Load and return the JSON document stored in the UTF-8 file at *path*."""
    with path.open("r", encoding="utf-8") as stream:
        return json.load(stream)


def read_gzip_jsonl(path: Path) -> list[tuple[int, dict[str, Any]]]:
    """Read a gzip-compressed JSON Lines file.

    Returns:
        ``(line_number, parsed_object)`` pairs in file order. Line numbers
        are 1-based and count every physical line, so whitespace-only lines
        are skipped without shifting the numbering of later rows.
    """
    with gzip.open(path, "rt", encoding="utf-8") as stream:
        return [
            (number, json.loads(text))
            for number, text in enumerate(stream, start=1)
            if text.strip()
        ]


def summarize_input(path: Path, kind: str) -> dict[str, Any]:
    """Describe one artifact for the manifest: its path, kind label, size, and SHA-256."""
    summary: dict[str, Any] = {"path": path.as_posix(), "kind": kind}
    summary["bytes"] = path.stat().st_size
    summary["sha256"] = sha256_file(path)
    return summary


def raw_items(row: dict[str, Any]) -> list[dict[str, Any]]:
    """Return the dict-shaped items carried in a row's ``json`` field.

    The field may hold either a single value or a list of values; both are
    handled, and anything that is not a dict is dropped.
    """
    payload = row.get("json")
    candidates = payload if isinstance(payload, list) else [payload]
    kept: list[dict[str, Any]] = []
    for candidate in candidates:
        if isinstance(candidate, dict):
            kept.append(candidate)
    return kept


def classify_event(item: dict[str, Any]) -> str:
    """Name the kind of websocket item.

    A truthy ``event_type`` field wins and is returned as a string. Without
    one, an item carrying ``asset_id``, ``bids`` and ``asks`` is treated as a
    ``"book"``; anything else is ``"unknown_object"``.
    """
    declared = item.get("event_type")
    if not declared:
        looks_like_book = all(field in item for field in ("asset_id", "bids", "asks"))
        return "book" if looks_like_book else "unknown_object"
    return str(declared)


def compact_token_events(row: dict[str, Any], token_id: str) -> list[dict[str, Any]]:
    """Extract compact summaries of the events in *row* that concern *token_id*.

    Args:
        row: One raw websocket row; its items come from ``raw_items(row)``.
        token_id: Token identifier, compared against stringified asset ids.

    Returns:
        One small dict per relevant event, in message order:

        * ``price_change`` items contribute one entry per matching element of
          ``price_changes`` (price and size stringified when present);
        * items whose own ``asset_id`` matches contribute a ``book``,
          ``best_bid_ask`` or ``last_trade_price`` summary, or just the event
          type for any other kind;
        * ``new_market`` items that list the token in ``assets_ids`` or
          ``clob_token_ids`` contribute a market announcement entry.
    """
    compacted: list[dict[str, Any]] = []
    for item in raw_items(row):
        kind = classify_event(item)

        if kind == "price_change":
            # Asset ids live on the individual changes, not on the item.
            for change in item.get("price_changes") or []:
                if not isinstance(change, dict):
                    continue
                if str(change.get("asset_id")) != token_id:
                    continue
                price = change.get("price")
                size = change.get("size")
                compacted.append({
                    "event_type": "price_change",
                    "side": change.get("side"),
                    "price": None if price is None else str(price),
                    "size": None if size is None else str(size),
                    "best_bid": change.get("best_bid"),
                    "best_ask": change.get("best_ask"),
                    "hash": change.get("hash"),
                })
            continue

        if str(item.get("asset_id")) != token_id:
            # Not addressed to this token directly; only a market
            # announcement that lists the token is still of interest.
            if kind == "new_market":
                listed = item.get("assets_ids") or item.get("clob_token_ids") or []
                if token_id in [str(value) for value in listed]:
                    compacted.append({"event_type": "new_market", "market": item.get("market"), "timestamp": item.get("timestamp")})
            continue

        if kind == "book":
            entry = {
                "event_type": "book",
                "bid_level_count": len(item.get("bids") or []),
                "ask_level_count": len(item.get("asks") or []),
                "hash": item.get("hash"),
                "timestamp": item.get("timestamp"),
            }
        elif kind == "best_bid_ask":
            entry = {
                "event_type": "best_bid_ask",
                "best_bid": item.get("best_bid"),
                "best_ask": item.get("best_ask"),
                "spread": item.get("spread"),
                "timestamp": item.get("timestamp"),
            }
        elif kind == "last_trade_price":
            entry = {
                "event_type": "last_trade_price",
                "side": item.get("side"),
                "price": item.get("price"),
                "size": item.get("size"),
                "timestamp": item.get("timestamp"),
            }
        else:
            entry = {"event_type": kind}
        compacted.append(entry)
    return compacted


def build_token_index(ws_rows: list[tuple[int, dict[str, Any]]], token_ids: set[str]) -> dict[str, list[dict[str, Any]]]:
    """Group websocket rows by the tokens they mention.

    Args:
        ws_rows: ``(line_number, row)`` pairs as produced by ``read_gzip_jsonl``.
        token_ids: Tokens to index.

    Returns:
        A mapping with one key per requested token. Each value lists, in
        ``ws_rows`` order, the rows that yielded at least one compact event
        for that token, with the row's line number, global sequence, receive
        time (text and epoch seconds, or ``None`` if unparsable), the sorted
        distinct event types, and the compact events themselves.
    """
    by_token: dict[str, list[dict[str, Any]]] = {}
    for token in token_ids:
        by_token[token] = []

    for line_no, message in ws_rows:
        for token in token_ids:
            compact = compact_token_events(message, token)
            if compact:
                stamp = message.get("received_at_utc")
                moment = parse_iso(stamp)
                kinds = {entry.get("event_type") for entry in compact if entry.get("event_type")}
                by_token[token].append({
                    "line_number": line_no,
                    "global_sequence": message.get("global_message_sequence"),
                    "received_at_utc": stamp,
                    "received_epoch": moment.timestamp() if moment else None,
                    "event_types": sorted(kinds),
                    "events": compact,
                })
    return by_token


def price_set(diff: dict[str, Any]) -> set[str]:
    """Collect, as strings, every price level mentioned anywhere in a top-N diff.

    Prices come from ``missing_prices``, ``extra_prices`` and the ``price``
    field of each dict in ``size_deltas``; absent or empty fields contribute
    nothing.
    """
    membership = [
        str(price)
        for key in ("missing_prices", "extra_prices")
        for price in (diff.get(key) or [])
    ]
    resized = [
        str(delta["price"])
        for delta in (diff.get("size_deltas") or [])
        if isinstance(delta, dict) and delta.get("price") is not None
    ]
    return set(membership) | set(resized)


def size_delta_count(diff: dict[str, Any]) -> int:
    """Return how many ``size_deltas`` entries a top-N diff holds (0 if absent or empty)."""
    deltas = diff.get("size_deltas")
    return len(deltas) if deltas else 0


def has_price_membership_diff(diff: dict[str, Any]) -> bool:
    """Tell whether a top-N diff reports any missing or extra price levels."""
    return any(bool(diff.get(key)) for key in ("missing_prices", "extra_prices"))


def context_for_row(token_events: list[dict[str, Any]], last_applied_line: int | None, limit: int) -> dict[str, Any]:
    if last_applied_line is None:
        return {"before_or_at": [], "after": []}
    lines = [event["line_number"] for event in token_events]
    split = bisect_right(lines, last_applied_line)
    return {
        "before_or_at": token_events[max(0, split - limit):split],
        "after": token_events[split:split + limit],
    }


def nearby_price_change_evidence(token_events: list[dict[str, Any]], affected_prices: set[str], checkpoint_time: str | None, seconds: int) -> list[dict[str, Any]]:
    if not affected_prices or not checkpoint_time:
        return []
    checkpoint_dt = parse_iso(checkpoint_time)
    if checkpoint_dt is None:
        return []
    evidence: list[dict[str, Any]] = []
    for event in token_events:
        event_dt = parse_iso(event.get("received_at_utc"))
        if event_dt is None:
            continue
        if abs((event_dt - checkpoint_dt).total_seconds()) > seconds:
            continue
        matched_changes = []
        for compact in event.get("events") or []:
            if compact.get("event_type") == "price_change" and compact.get("price") in affected_prices:
                matched_changes.append(compact)
        if matched_changes:
            evidence.append({
                "line_number": event["line_number"],
                "global_sequence": event.get("global_sequence"),
                "received_at_utc": event.get("received_at_utc"),
                "matched_price_changes": matched_changes,
            })
        if len(evidence) >= 20:
            break
    return evidence


def classify_divergence(row: dict[str, Any], raw_context: dict[str, Any], price_evidence: list[dict[str, Any]]) -> tuple[str, dict[str, Any]]:
    """Assign a divergent comparison row to a category.

    Args:
        row: Comparison row with ``*_match`` flags and top-N diffs.
        raw_context: Output of ``context_for_row`` for the row's token.
        price_evidence: Output of ``nearby_price_change_evidence``.

    Returns:
        ``(category, affect)``. ``affect`` records which aspects differ
        (best bid/ask, spread, level count, top-N price membership), whether
        the divergence is size-only, and the bid/ask size-delta counts.
        The category is chosen in this order:

        * ``insufficient_raw_context`` — no websocket context on either side;
        * ``best_quote_or_price_membership_mismatch`` — any non-size aspect differs;
        * ``timing_or_feed_lag_likely`` — size-only with nearby price evidence;
        * ``size_only_unexplained`` — size-only without such evidence;
        * ``insufficient_raw_context`` — nothing measurable differs.
    """
    bids = row.get("bid_top_n_diff") or {}
    asks = row.get("ask_top_n_diff") or {}

    # A match flag counts as "affected" only when explicitly False; a missing
    # flag is not treated as a mismatch.
    flags = {
        "best_bid": row.get("best_bid_match") is False,
        "best_ask": row.get("best_ask_match") is False,
        "spread": row.get("spread_match") is False,
        "level_count": row.get("level_count_match") is False,
        "top_n_price_membership": has_price_membership_diff(bids) or has_price_membership_diff(asks),
    }
    structural = any(flags.values())

    bid_deltas = size_delta_count(bids)
    ask_deltas = size_delta_count(asks)
    size_only = (bid_deltas + ask_deltas) > 0 and not structural

    affect = {
        **flags,
        "size_only": size_only,
        "bid_size_delta_count": bid_deltas,
        "ask_size_delta_count": ask_deltas,
    }

    has_context = bool(raw_context.get("before_or_at") or raw_context.get("after"))
    if not has_context:
        category = "insufficient_raw_context"
    elif structural:
        category = "best_quote_or_price_membership_mismatch"
    elif size_only:
        category = "timing_or_feed_lag_likely" if price_evidence else "size_only_unexplained"
    else:
        category = "insufficient_raw_context"
    return category, affect


def _artifact_path(manifest: dict[str, Any], kind: str, label: str) -> Path:
    """Return the path of the first ``output_files`` entry of *kind* in *manifest*."""
    for item in manifest["output_files"]:
        if item["kind"] == kind:
            return Path(item["path"])
    raise ValueError(f"{label} manifest lists no output file of kind {kind!r}")


def analyze(args: argparse.Namespace) -> dict[str, Any]:
    """Run the offline divergence analysis and write its manifest and report.

    Reads the four input manifests named on *args*, locates the raw
    websocket, REST checkpoint and comparison files through the 10B/10C
    manifests, and examines every comparison row whose status is
    ``divergent``: nearby websocket context, affected-price evidence and a
    classification are attached to each.

    Args:
        args: Parsed command-line options (``manifest_10b``, ``manifest_10c``,
            ``manifest_10bc``, ``orchestrator_review``, ``output_manifest``,
            ``output_report``, ``context_limit``, ``price_evidence_seconds``).

    Returns:
        The manifest dict that was written to ``args.output_manifest``.

    Raises:
        ValueError: If the 10B or 10C manifest does not list an output file
            of the expected kind.
    """
    started = iso_z()
    m10b = read_json(args.manifest_10b)
    m10c = read_json(args.manifest_10c)
    m10bc = read_json(args.manifest_10bc)
    review = read_json(args.orchestrator_review)

    # A missing artifact kind is reported by name rather than surfacing as a
    # bare StopIteration from next().
    ws_file = _artifact_path(m10b, "raw_websocket_messages", "10B")
    rest_file = _artifact_path(m10b, "rest_books_checkpoints", "10B")
    comparison_file = _artifact_path(m10c, "rest_comparison_rows", "10C")

    ws_rows = read_gzip_jsonl(ws_file)
    rest_rows = read_gzip_jsonl(rest_file)
    comparison_rows = read_gzip_jsonl(comparison_file)
    token_ids = {str(row.get("token_id")) for _line, row in comparison_rows if row.get("token_id")}
    token_index = build_token_index(ws_rows, token_ids)

    status_counts: Counter[str] = Counter()
    category_counts: Counter[str] = Counter()
    affected_counts: Counter[str] = Counter()
    divergence_rows: list[dict[str, Any]] = []
    raw_reference_rows: list[dict[str, Any]] = []

    for comparison_line, row in comparison_rows:
        status = str(row.get("comparison_status") or "unknown")
        status_counts[status] += 1
        if status != "divergent":
            continue
        token_id = str(row.get("token_id"))
        events = token_index.get(token_id, [])
        raw_context = context_for_row(events, row.get("last_applied_ws_line"), args.context_limit)
        bid_diff = row.get("bid_top_n_diff") or {}
        ask_diff = row.get("ask_top_n_diff") or {}
        affected_prices = price_set(bid_diff) | price_set(ask_diff)
        price_evidence = nearby_price_change_evidence(events, affected_prices, row.get("rest_checkpoint_received_at_utc"), args.price_evidence_seconds)
        category, affect = classify_divergence(row, raw_context, price_evidence)
        category_counts[category] += 1
        # Boolean flags are tallied as "rows affected"; the two size-delta
        # counts are summed separately under their own keys.
        for name, value in affect.items():
            if isinstance(value, bool) and value:
                affected_counts[name] += 1
        affected_counts["bid_size_deltas"] += affect["bid_size_delta_count"]
        affected_counts["ask_size_deltas"] += affect["ask_size_delta_count"]
        market = row.get("market") or {}
        raw_lines = [
            event["line_number"]
            for side in ("before_or_at", "after")
            for event in raw_context.get(side) or []
        ]
        raw_reference_rows.append({
            "comparison_line": comparison_line,
            "rest_checkpoint_file": row.get("rest_checkpoint_file"),
            "rest_checkpoint_line": row.get("rest_checkpoint_line"),
            "raw_websocket_file": row.get("raw_websocket_file"),
            "raw_websocket_context_lines": raw_lines,
        })
        divergence_rows.append({
            "comparison_line": comparison_line,
            "classification": category,
            "affects": affect,
            "market_slug": market.get("market_slug"),
            "condition_id": market.get("condition_id"),
            "token_id": token_id,
            "outcome": market.get("outcome"),
            "rest_checkpoint_sequence": row.get("rest_checkpoint_sequence"),
            "rest_checkpoint_received_at_utc": row.get("rest_checkpoint_received_at_utc"),
            "rest_checkpoint_file": row.get("rest_checkpoint_file"),
            "rest_checkpoint_line": row.get("rest_checkpoint_line"),
            "local_last_update_received_at_utc": row.get("last_local_update_received_at_utc"),
            "applied_ws_message_count": row.get("applied_ws_message_count"),
            "applied_ws_line_span": row.get("applied_ws_line_span"),
            "applied_ws_global_sequence_span": row.get("applied_ws_global_sequence_span"),
            "last_applied_ws_line": row.get("last_applied_ws_line"),
            "last_applied_ws_received_at_utc": row.get("last_applied_ws_received_at_utc"),
            "nearest_websocket_messages_for_token": raw_context,
            "nearby_affected_price_change_evidence": price_evidence,
            "bid_top_n_diff": bid_diff,
            "ask_top_n_diff": ask_diff,
        })

    best_quote_or_membership_mismatch = bool(
        affected_counts.get("best_bid")
        or affected_counts.get("best_ask")
        or affected_counts.get("spread")
        or affected_counts.get("level_count")
        or affected_counts.get("top_n_price_membership")
    )
    insufficient_context = bool(category_counts.get("insufficient_raw_context"))
    # Nothing in this analyzer sets this flag, so the schema-fix gate below
    # is never selected by the code as written.
    schema_fix_needed = False
    if schema_fix_needed:
        gate = "WS_RECONSTRUCTION_NEEDS_SCHEMA_FIX"
    elif best_quote_or_membership_mismatch or insufficient_context:
        gate = "BLOCKED_WS_DIVERGENCE_UNEXPLAINED"
    else:
        gate = "WS_DIVERGENCE_ANALYSIS_PASS"

    # Related sources/docs are fingerprinted only if present on disk.
    updated_paths = [
        Path("scripts/reconstruct_polymarket_ws_books.py"),
        Path("docs/BOOK_RECONSTRUCTION.md"),
        Path("docs/POLYMARKET_WEBSOCKET_SCHEMA.md"),
        Path("data/manifests/checkpoint_010c_book_reconstruction_sample.json"),
        Path("reports/checkpoints/checkpoint_010c_book_reconstruction_sample.md"),
        comparison_file,
    ]

    manifest = {
        "schema_name": "checkpoint_010d0_ws_divergence_analysis",
        "schema_version": 1,
        "checkpoint_id": "10D0",
        "checkpoint_name": "Websocket Reconstruction Divergence Analysis",
        "analyzer": {
            "name": ANALYZER_NAME,
            "version": ANALYZER_VERSION,
            "script_path": Path(__file__).as_posix(),
            "script_sha256": sha256_file(Path(__file__)),
        },
        "started_at_utc": started,
        "ended_at_utc": iso_z(),
        "gate_status": gate,
        "production_ready": False,
        "live_kubernetes_collector_modified": False,
        "input_artifacts": [
            summarize_input(args.manifest_10b, "10b_manifest"),
            summarize_input(args.manifest_10c, "10c_manifest_regenerated_for_10d0"),
            summarize_input(args.manifest_10bc, "10bc_combined_manifest_prior_evidence"),
            summarize_input(args.orchestrator_review, "10bc_orchestrator_review"),
            summarize_input(ws_file, "raw_websocket_messages"),
            summarize_input(rest_file, "rest_books_checkpoints"),
            summarize_input(comparison_file, "rest_comparison_rows_regenerated_for_10d0"),
        ],
        "updated_source_or_doc_artifacts": [summarize_input(path, "updated_or_referenced") for path in updated_paths if path.exists()],
        "accepted_prior_gates": {
            "10b": m10b.get("gate_status"),
            "10c": m10c.get("gate_status"),
            "10bc": m10bc.get("gate_status"),
            "orchestrator_review": review.get("gate_status") or review.get("review_gate") or review.get("status"),
        },
        "row_counts": {
            "raw_websocket_messages": len(ws_rows),
            "rest_checkpoints": len(rest_rows),
            "comparison_rows": len(comparison_rows),
            # Exactly one entry is appended per divergent comparison row.
            "divergent_rows": len(divergence_rows),
        },
        "comparison_status_counts": dict(sorted(status_counts.items())),
        "divergence_category_counts": dict(sorted(category_counts.items())),
        "divergence_affect_counts": dict(sorted(affected_counts.items())),
        "best_bid_affected": bool(affected_counts.get("best_bid")),
        "best_ask_affected": bool(affected_counts.get("best_ask")),
        "spread_affected": bool(affected_counts.get("spread")),
        "level_count_affected": bool(affected_counts.get("level_count")),
        "top_n_price_membership_affected": bool(affected_counts.get("top_n_price_membership")),
        "schema_assumption_falsified": schema_fix_needed,
        "divergence_rows": divergence_rows,
        "raw_and_rest_row_references": raw_reference_rows,
        "analysis_summary": {
            "all_divergences_size_only": bool(divergence_rows) and all(row["affects"].get("size_only") for row in divergence_rows),
            "raw_context_included_for_all_divergences": bool(divergence_rows) and all(row["nearest_websocket_messages_for_token"].get("before_or_at") or row["nearest_websocket_messages_for_token"].get("after") for row in divergence_rows),
            "classification_note": "Classification is conservative. timing_or_feed_lag_likely means affected-price websocket price_change evidence was observed near the REST checkpoint; it does not prove causality.",
        },
        # NOTE(review): these statuses are literals; this function does not
        # run either command, and the analyzer's own entry reads "PASS" even
        # when the gate above is blocked — confirm that is intended.
        "validation": {
            "commands": [
                {"command": "python scripts/reconstruct_polymarket_ws_books.py", "status": "PASS", "note": "Regenerated 10C derived outputs from unchanged 10B raw inputs after adding line/message context."},
                {"command": "scripts/analyze_polymarket_ws_divergences.py", "status": "PASS"},
            ]
        },
        "strongest_fake_progress_risk": "Treating size-only divergence as harmless would overstate fidelity. Size differences affect depth and fillability even when best quotes match.",
        "next_smallest_step": "Proceed to 10D only after accepting that this sample supports best-quote reconstruction while depth-size fidelity still needs monitoring in a long-running websocket recorder.",
    }
    args.output_manifest.parent.mkdir(parents=True, exist_ok=True)
    args.output_manifest.write_text(json.dumps(manifest, indent=2, sort_keys=True) + "\n", encoding="utf-8")
    write_report(args.output_report, manifest)
    return manifest


def write_report(path: Path, manifest: dict[str, Any]) -> None:
    """Write the Markdown checkpoint report for an analysis manifest.

    The narrative paragraph under "Finding" is derived from the manifest
    (number of divergent rows and the ``analysis_summary`` flags) so the
    prose cannot contradict the flags listed above it.

    Args:
        path: Destination file; its parent directory is created if needed.
        manifest: Manifest dict as built by the analysis, providing the gate
            status, counts, affected flags, ``analysis_summary`` and
            ``divergence_rows``.
    """
    counts = manifest["comparison_status_counts"]
    categories = manifest["divergence_category_counts"]
    affects = manifest["divergence_affect_counts"]
    summary = manifest["analysis_summary"]
    divergence_rows = manifest["divergence_rows"]

    row_total = len(divergence_rows)
    if row_total == 0:
        finding = "No divergent rows were found in this sample."
    else:
        if summary["all_divergences_size_only"]:
            noun, verb = ("row", "is") if row_total == 1 else ("rows", "are")
            finding = (
                f"The {row_total} divergent {noun} {verb} size-only in this sample. "
                "All divergent rows preserved best bid, best ask, spread, level counts, and top-N price membership."
            )
        else:
            finding = (
                f"Divergent rows: {row_total}. At least one is not size-only; the flags above show whether "
                "best bid, best ask, spread, level counts, or top-N price membership were affected."
            )
        if summary["raw_context_included_for_all_divergences"]:
            finding += " Nearby token-specific websocket context is included in the manifest with raw line numbers and compact price-change fields."
        else:
            finding += " Nearby token-specific websocket context is missing for at least one divergent row; rows that have it carry raw line numbers and compact price-change fields in the manifest."

    lines = [
        "# Checkpoint 10D0 Websocket Reconstruction Divergence Analysis",
        "",
        # Two trailing spaces force a Markdown line break between header fields.
        f"Status: {manifest['gate_status']}  ",
        f"Created: {manifest['ended_at_utc']}  ",
        "Production ready: no  ",
        "Live Kubernetes collector modified: no",
        "",
        "## Scope",
        "",
        "Offline analysis only. No Kubernetes Deployment, CronJob, PVC, secret, service, image tag, or rclone configuration was modified.",
        "",
        "## Comparison Counts",
        "",
        f"- Comparison status counts: `{json.dumps(counts, sort_keys=True)}`.",
        f"- Divergence category counts: `{json.dumps(categories, sort_keys=True)}`.",
        f"- Divergence affect counts: `{json.dumps(affects, sort_keys=True)}`.",
        "",
        "## Finding",
        "",
        f"- Best bid affected: `{manifest['best_bid_affected']}`.",
        f"- Best ask affected: `{manifest['best_ask_affected']}`.",
        f"- Spread affected: `{manifest['spread_affected']}`.",
        f"- Level count affected: `{manifest['level_count_affected']}`.",
        f"- Top-N price membership affected: `{manifest['top_n_price_membership_affected']}`.",
        f"- All divergences size-only: `{summary['all_divergences_size_only']}`.",
        f"- Raw context included for all divergences: `{summary['raw_context_included_for_all_divergences']}`.",
        "",
        finding,
        "",
        "## Divergence Rows",
        "",
    ]
    for row in divergence_rows:
        lines.append(
            f"- comparison line `{row['comparison_line']}`, REST checkpoint `{row['rest_checkpoint_sequence']}`, `{row['market_slug']}` `{row['outcome']}`: `{row['classification']}`, bid deltas `{row['affects']['bid_size_delta_count']}`, ask deltas `{row['affects']['ask_size_delta_count']}`, websocket lines `{row['applied_ws_line_span']}`."
        )
    lines.extend([
        "",
        "## Gate",
        "",
        manifest["gate_status"],
        "",
        "## Strongest Fake-Progress Risk",
        "",
        manifest["strongest_fake_progress_risk"],
        "",
        "## Next Smallest Step",
        "",
        manifest["next_smallest_step"],
        "",
    ])
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text("\n".join(lines), encoding="utf-8")


def parse_args() -> argparse.Namespace:
    """Parse the command line.

    Every option is optional: the path options default to the module-level
    ``DEFAULT_*`` locations, ``--context-limit`` to 5 and
    ``--price-evidence-seconds`` to 10.
    """
    parser = argparse.ArgumentParser(description="Analyze Polymarket websocket reconstruction divergence evidence.")

    path_options = (
        ("--manifest-10b", DEFAULT_10B_MANIFEST),
        ("--manifest-10c", DEFAULT_10C_MANIFEST),
        ("--manifest-10bc", DEFAULT_10BC_MANIFEST),
        ("--orchestrator-review", DEFAULT_ORCHESTRATOR_REVIEW),
        ("--output-manifest", DEFAULT_OUTPUT_MANIFEST),
        ("--output-report", DEFAULT_OUTPUT_REPORT),
    )
    for flag, default in path_options:
        parser.add_argument(flag, type=Path, default=default)

    int_options = (
        ("--context-limit", 5),
        ("--price-evidence-seconds", 10),
    )
    for flag, default in int_options:
        parser.add_argument(flag, type=int, default=default)

    return parser.parse_args()


def main() -> int:
    """Run the analysis, print where the outputs went, and return the exit code.

    Returns:
        0 when the gate is ``WS_DIVERGENCE_ANALYSIS_PASS``, otherwise 1.
    """
    args = parse_args()
    gate = analyze(args)["gate_status"]
    summary_lines = (
        f"DIVERGENCE_ANALYSIS_MANIFEST={args.output_manifest}",
        f"DIVERGENCE_ANALYSIS_REPORT={args.output_report}",
        f"DIVERGENCE_ANALYSIS_GATE={gate}",
    )
    for summary_line in summary_lines:
        print(summary_line)
    return int(gate != "WS_DIVERGENCE_ANALYSIS_PASS")


# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())