orderbooks/scripts/discover_polymarket_btc_markets.py
philipp 284e465588
Some checks failed
deploy / deploy (push) Has been cancelled
Prepare Kubernetes orderbooks deployment
2026-04-18 11:23:28 +02:00

752 lines
27 KiB
Python
Executable file

#!/usr/bin/env python3
"""Discover active Polymarket BTC up/down markets.
Checkpoint 3 scope: fetch bounded public Gamma metadata, preserve raw responses,
and write normalized market records with outcome-token mappings. This is not an
order-book collector.
"""
from __future__ import annotations
import argparse
import datetime as dt
import hashlib
import json
import sys
import time
import urllib.error
import urllib.parse
import urllib.request
from pathlib import Path
from typing import Any
# Public Gamma endpoint queried for event metadata.
GAMMA_EVENTS_URL = "https://gamma-api.polymarket.com/events"
# Value sent as the ``tag_id`` query parameter on every events request.
BTC_TAG_ID = 235
# Default artifact locations; each can be overridden from the command line.
DEFAULT_OUTPUT_JSON = Path("data/discovery/polymarket_btc_markets_latest.json")
DEFAULT_MANIFEST = Path("data/discovery/polymarket_btc_markets_manifest.json")
DEFAULT_MARKDOWN = Path("data/discovery/polymarket_btc_markets.md")
# Allow-list of response header names (lowercase) that filter_headers() keeps
# in the preserved raw response envelope; every other header is dropped.
SAFE_RESPONSE_HEADERS = {
    "age",
    "cache-control",
    "cf-cache-status",
    "cf-ray",
    "content-encoding",
    "content-length",
    "content-type",
    "date",
    "expires",
    "last-modified",
    "ratelimit-limit",
    "ratelimit-remaining",
    "ratelimit-reset",
    "retry-after",
    "server",
    "strict-transport-security",
    "x-ratelimit-limit",
    "x-ratelimit-remaining",
    "x-ratelimit-reset",
}
# Human-readable description of the filters; copied verbatim into the
# discovery artifact under "filter_rules". The checks themselves live in
# rejection_reasons(), so keep the two in sync when either changes.
FILTER_RULES = [
    "Use public Gamma /events with tag_id=235, related_tags=true, active=true, closed=false.",
    "Require event.active=true and event.closed=false.",
    "Require market.active=true and market.closed=false.",
    "Require market.enableOrderBook=true.",
    "Require market.acceptingOrders=true unless --allow-non-accepting-orders is used.",
    "Require market end time to be after the fetch time unless --allow-expired is used.",
    "Require outcomes to resolve to exactly Up and Down.",
    "Require clobTokenIds to resolve to exactly two token IDs.",
    "Require BTC/up-down evidence from seriesSlug, title/slug text, or tags.",
]
def utc_now() -> dt.datetime:
return dt.datetime.now(dt.UTC)
def iso_z(value: dt.datetime | None = None) -> str:
value = value or utc_now()
return value.astimezone(dt.UTC).replace(microsecond=0).isoformat().replace("+00:00", "Z")
def parse_iso(value: Any) -> dt.datetime | None:
if not isinstance(value, str) or not value.strip():
return None
text = value.strip()
if text.endswith("Z"):
text = text[:-1] + "+00:00"
try:
parsed = dt.datetime.fromisoformat(text)
except ValueError:
return None
if parsed.tzinfo is None:
parsed = parsed.replace(tzinfo=dt.UTC)
return parsed.astimezone(dt.UTC)
def sha256_file(path: Path) -> str:
    """Return the hex SHA-256 digest of the file at *path*.

    The file is read in 1 MiB chunks so it never has to fit in memory at once.
    """
    hasher = hashlib.sha256()
    chunk_size = 1024 * 1024
    with path.open("rb") as stream:
        while chunk := stream.read(chunk_size):
            hasher.update(chunk)
    return hasher.hexdigest()
def filter_headers(headers: Any) -> dict[str, str]:
    """Keep only the allow-listed response headers.

    Args:
        headers: Header container accepted by ``dict()`` (for example a
            mapping or an HTTP message object), or ``None``.

    Returns:
        A dict of the headers whose lowercased name appears in
        ``SAFE_RESPONSE_HEADERS``. Original key casing is preserved. ``None``
        yields an empty dict.
    """
    if headers is None:
        # This function is also called from fetch_json_page's HTTPError
        # handler; letting dict(None) raise TypeError there would abort the
        # fetch instead of recording the failure.
        return {}
    return {
        name: value
        for name, value in dict(headers).items()
        if name.lower() in SAFE_RESPONSE_HEADERS
    }
def normalize_params(params: dict[str, Any]) -> dict[str, Any]:
    """Return a copy of *params* with booleans spelled as ``"true"``/``"false"``.

    Non-boolean values are passed through unchanged and key order is kept.
    """
    return {
        key: (("true" if value else "false") if isinstance(value, bool) else value)
        for key, value in params.items()
    }
def build_url(url: str, params: dict[str, Any]) -> str:
    """Append *params* to *url* as a URL-encoded query string.

    Parameters are first passed through ``normalize_params``; sequence values
    are expanded into repeated keys (``doseq=True``).
    """
    encoded = urllib.parse.urlencode(normalize_params(params), doseq=True)
    return "{}?{}".format(url, encoded)
def fetch_json_page(
    *,
    name: str,
    url: str,
    params: dict[str, Any],
    timeout_seconds: float,
) -> dict[str, Any]:
    """Issue one GET request and return an envelope describing the exchange.

    Failures raised by ``urlopen`` are recorded in the envelope rather than
    propagated, so a failed request is preserved as evidence.

    Args:
        name: Label stored in the envelope to identify this request.
        url: Endpoint URL without a query string.
        params: Query parameters; booleans are normalized before encoding.
        timeout_seconds: Timeout handed to ``urllib.request.urlopen``.

    Returns:
        A dict with the request details, the response (status code, filtered
        headers, decoded JSON or a JSON error plus a 1000-character text
        preview), timing information, an ``ok`` flag (no error and a 2xx
        status) and an ``error`` string or ``None``.
    """
    clock_start = time.monotonic()
    began_utc = iso_z()
    target = build_url(url, params)
    outgoing = urllib.request.Request(
        target,
        headers={
            "Accept": "application/json",
            "User-Agent": "orderbooks-checkpoint-3-discovery/1.0",
        },
        method="GET",
    )

    http_status: int | None = None
    kept_headers: dict[str, str] = {}
    body = ""
    failure: str | None = None
    try:
        with urllib.request.urlopen(outgoing, timeout=timeout_seconds) as reply:
            http_status = reply.status
            kept_headers = filter_headers(reply.headers)
            body = reply.read().decode("utf-8", errors="replace")
    except urllib.error.HTTPError as exc:
        # Non-2xx responses still carry a status, headers and a body worth keeping.
        http_status = exc.code
        kept_headers = filter_headers(exc.headers)
        body = exc.read().decode("utf-8", errors="replace")
        failure = f"HTTPError: {exc}"
    except Exception as exc:  # noqa: BLE001 - preserve probe failure evidence
        failure = f"{type(exc).__name__}: {exc}"

    decoded: Any | None = None
    decode_problem: str | None = None
    if body:
        try:
            decoded = json.loads(body)
        except json.JSONDecodeError as exc:
            decode_problem = str(exc)

    finished_utc = iso_z()
    elapsed_ms = round((time.monotonic() - clock_start) * 1000, 3)
    request_record = {
        "method": "GET",
        "url": url,
        "full_url": target,
        "params": normalize_params(params),
    }
    response_record = {
        "status_code": http_status,
        "headers": kept_headers,
        "json": decoded,
        "json_error": decode_problem,
        # Only keep a text preview when no JSON value was obtained.
        "text_preview": body[:1000] if decoded is None else None,
    }
    succeeded = failure is None and http_status is not None and 200 <= http_status < 300
    return {
        "name": name,
        "started_at_utc": began_utc,
        "ended_at_utc": finished_utc,
        "duration_ms": elapsed_ms,
        "request": request_record,
        "response": response_record,
        "ok": succeeded,
        "error": failure,
    }
def coerce_json_array(value: Any) -> list[Any]:
    """Return *value* as a list, decoding it from a JSON string if needed.

    A list is returned as-is. A string is decoded as JSON and returned only if
    it decodes to a list. Every other input, including undecodable strings,
    yields an empty list.
    """
    candidate = value
    if isinstance(candidate, str):
        try:
            candidate = json.loads(candidate)
        except json.JSONDecodeError:
            return []
    return candidate if isinstance(candidate, list) else []
def lower_text(value: Any) -> str:
    """Return *value* as lowercase text; any falsy value becomes ``""``."""
    if not value:
        return ""
    return str(value).lower()
def event_tag_text(event: dict[str, Any]) -> str:
    """Join the slug and label of every tag on *event* into one lowercase string.

    Tags that are not dicts are skipped. A missing slug or label contributes
    an empty string, so separators between fields are always present.
    """
    fragments = [
        str(tag.get(field) or "")
        for tag in (event.get("tags") or [])
        if isinstance(tag, dict)
        for field in ("slug", "label")
    ]
    return " ".join(fragments).lower()
def has_btc_up_down_evidence(event: dict[str, Any], market: dict[str, Any]) -> bool:
    """Decide whether an event/market pair looks like a BTC up/down market.

    Any one of three signals is sufficient:
      * the event's ``seriesSlug`` starts with ``btc-up-or-down``;
      * the combined event and market text mentions bitcoin/btc together with
        both "up" and "down";
      * the event tags mention bitcoin/btc together with ``up-or-down``.

    All checks are case-insensitive substring matches.
    """
    event_fields = [
        lower_text(event.get(field))
        for field in ("title", "slug", "ticker", "description")
    ]
    market_fields = [
        lower_text(market.get(field))
        for field in ("question", "slug", "description")
    ]
    haystack = " ".join(event_fields) + " " + " ".join(market_fields)
    tag_haystack = event_tag_text(event)
    series = lower_text(event.get("seriesSlug"))

    coin_words = ("bitcoin", "btc")
    series_hit = series.startswith("btc-up-or-down")
    text_hit = (
        any(word in haystack for word in coin_words)
        and "up" in haystack
        and "down" in haystack
    )
    tag_hit = any(word in tag_haystack for word in coin_words) and "up-or-down" in tag_haystack
    return series_hit or text_hit or tag_hit
def is_up_down_outcomes(outcomes: list[str]) -> bool:
    """Return True when *outcomes* is exactly one "up" and one "down" (any case, any order)."""
    if len(outcomes) != 2:
        return False
    return sorted(label.lower() for label in outcomes) == ["down", "up"]
def normalize_market(
    *,
    event: dict[str, Any],
    market: dict[str, Any],
    page_index: int,
    event_index: int,
    market_index: int,
    fetched_at_utc: str,
    output_json_path: Path,
) -> dict[str, Any]:
    """Build the normalized record for one market inside one event.

    Args:
        event: Raw event object from the events response.
        market: Raw market object taken from ``event["markets"]``.
        page_index: Index of the fetched page that contained the event.
        event_index: Index of the event within that page's JSON list.
        market_index: Index of the market within the event's ``markets`` list.
        fetched_at_utc: Fetch timestamp string stored on the record.
        output_json_path: Path of the JSON artifact that holds the raw pages;
            recorded in ``raw_ref`` so the record can be traced to its source.

    Returns:
        A dict with identifiers, outcome/token pairs, start/end times, status
        flags, the endpoint the data came from and a ``raw_ref`` locator.
        Times that parse as ISO-8601 are rendered in the ``iso_z`` format;
        unparseable values are passed through unchanged.
    """

    def as_utc_text(raw: Any) -> Any:
        # Parse once; fall back to the raw value when it is not a usable timestamp.
        parsed = parse_iso(raw)
        return iso_z(parsed) if parsed else raw

    outcomes = [str(item) for item in coerce_json_array(market.get("outcomes"))]
    token_ids = [str(item) for item in coerce_json_array(market.get("clobTokenIds"))]
    # Pair outcomes with token IDs positionally; zip stops at the shorter list.
    tokens = [
        {
            "outcome": outcome,
            "token_id": token_id,
            "outcome_index": index,
        }
        for index, (outcome, token_id) in enumerate(zip(outcomes, token_ids))
    ]
    start_time = (
        market.get("startDate")
        or market.get("startDateIso")
        or event.get("startDate")
        or event.get("creationDate")
    )
    # Use the same precedence as the expiry check in rejection_reasons()
    # (market endDate, then event endDate) so the recorded end time is the one
    # that was validated. endDateIso is kept only as a last resort.
    # NOTE(review): endDateIso appears to be date-only in Gamma payloads - confirm.
    end_time = market.get("endDate") or event.get("endDate") or market.get("endDateIso")
    event_slug = event.get("slug")
    market_slug = market.get("slug") or event_slug
    return {
        "market_name": "polymarket",
        "market_slug": market_slug,
        "event_slug": event_slug,
        "title": event.get("title") or market.get("question"),
        "question": market.get("question") or event.get("title"),
        "condition_id": market.get("conditionId"),
        "tokens": tokens,
        "outcomes": outcomes,
        "start_time_utc": as_utc_text(start_time),
        "end_time_utc": as_utc_text(end_time),
        "active": market.get("active"),
        "closed": market.get("closed"),
        "event_active": event.get("active"),
        "event_closed": event.get("closed"),
        "accepting_orders": market.get("acceptingOrders"),
        "enable_order_book": market.get("enableOrderBook"),
        "endpoint_source": {
            "name": "gamma_events_bitcoin_tag",
            "method": "GET",
            "url": GAMMA_EVENTS_URL,
            "params_basis": {
                "tag_id": BTC_TAG_ID,
                "related_tags": "true",
                "active": "true",
                "closed": "false",
                "order": "endDate",
                "ascending": "true",
            },
        },
        "fetched_at_utc": fetched_at_utc,
        "raw_ref": {
            "artifact_path": output_json_path.as_posix(),
            "section": "raw.gamma_events_pages",
            "page_index": page_index,
            "event_index": event_index,
            "market_index": market_index,
            "json_path": f"raw.gamma_events_pages[{page_index}].response.json[{event_index}].markets[{market_index}]",
        },
    }
def rejection_reasons(
    *,
    event: dict[str, Any],
    market: dict[str, Any],
    fetched_at: dt.datetime,
    require_accepting_orders: bool,
    require_future_end: bool,
) -> list[str]:
    """List every filter that an event/market pair fails.

    Args:
        event: Raw event object.
        market: Raw market object belonging to *event*.
        fetched_at: Reference time for the "ends in the future" check.
        require_accepting_orders: When true, the market must have
            ``acceptingOrders`` set to ``True``.
        require_future_end: When true, the market (or, failing that, the
            event) must have a parseable end time later than *fetched_at*.

    Returns:
        Reason codes in a fixed order; an empty list means the pair passed
        every check.
    """
    outcomes = [str(item) for item in coerce_json_array(market.get("outcomes"))]
    token_ids = [str(item) for item in coerce_json_array(market.get("clobTokenIds"))]
    end_time = parse_iso(market.get("endDate") or event.get("endDate"))

    # (failed?, reason) pairs, evaluated top to bottom. Flags are compared by
    # identity so that only a real boolean True/False satisfies a check.
    checks = (
        (event.get("active") is not True, "event_not_active"),
        (event.get("closed") is not False, "event_closed"),
        (market.get("active") is not True, "market_not_active"),
        (market.get("closed") is not False, "market_closed"),
        (market.get("enableOrderBook") is not True, "order_book_not_enabled"),
        (
            require_accepting_orders and market.get("acceptingOrders") is not True,
            "not_accepting_orders",
        ),
        (
            require_future_end and (end_time is None or end_time <= fetched_at),
            "not_future_end",
        ),
        (not is_up_down_outcomes(outcomes), "not_up_down_outcomes"),
        (len(token_ids) != 2, "missing_two_clob_token_ids"),
        (not has_btc_up_down_evidence(event, market), "missing_btc_up_down_evidence"),
    )
    return [reason for failed, reason in checks if failed]
def discover(args: argparse.Namespace) -> dict[str, Any]:
    """Fetch bounded pages of events and build the discovery artifact.

    Pages of ``args.limit`` events are requested at increasing offsets, up to
    ``args.max_pages`` pages. Paging stops early when a request fails, when a
    response is not a JSON list, or when a page comes back shorter than the
    limit. Every market is run through ``rejection_reasons``; survivors must
    also carry a non-empty, not-yet-seen ``conditionId`` before they are
    normalized.

    Args:
        args: Parsed command-line options. Reads ``limit``, ``max_pages``,
            ``timeout``, ``min_markets``, ``allow_expired``,
            ``allow_non_accepting_orders`` and ``output_json``.

    Returns:
        The discovery document: normalized markets sorted by end time and
        slug, the raw page envelopes, and a summary holding the PASS/FAIL
        status, rejection counts and warnings.
    """
    started_at_utc = iso_z()
    fetched_at = utc_now()
    fetched_at_utc = iso_z(fetched_at)
    raw_pages: list[dict[str, Any]] = []
    normalized: list[dict[str, Any]] = []
    rejected_counts: dict[str, int] = {}
    warnings: list[str] = []
    seen_conditions: set[str] = set()

    def reject(reason: str) -> None:
        rejected_counts[reason] = rejected_counts.get(reason, 0) + 1

    for page_index in range(args.max_pages):
        params = {
            "tag_id": BTC_TAG_ID,
            "related_tags": True,
            "active": True,
            "closed": False,
            "limit": args.limit,
            "offset": page_index * args.limit,
            "order": "endDate",
            "ascending": True,
        }
        page = fetch_json_page(
            name=f"gamma_events_bitcoin_tag_page_{page_index}",
            url=GAMMA_EVENTS_URL,
            params=params,
            timeout_seconds=args.timeout,
        )
        # Keep the envelope even when the request failed, so the failure is auditable.
        raw_pages.append(page)
        payload = page["response"]["json"]
        if not page["ok"]:
            warnings.append(
                f"Page {page_index} request failed with status {page['response']['status_code']}: {page['error']}"
            )
            break
        if not isinstance(payload, list):
            warnings.append(f"Page {page_index} response was not a JSON list.")
            break
        for event_index, event in enumerate(payload):
            if not isinstance(event, dict):
                reject("event_not_object")
                continue
            markets = event.get("markets") or []
            if not isinstance(markets, list) or not markets:
                reject("missing_markets")
                continue
            for market_index, market in enumerate(markets):
                if not isinstance(market, dict):
                    reject("market_not_object")
                    continue
                reasons = rejection_reasons(
                    event=event,
                    market=market,
                    fetched_at=fetched_at,
                    require_accepting_orders=not args.allow_non_accepting_orders,
                    require_future_end=not args.allow_expired,
                )
                if reasons:
                    for reason in reasons:
                        reject(reason)
                    continue
                condition_id = str(market.get("conditionId") or "")
                if not condition_id:
                    # Without this guard the first ID-less market would be
                    # accepted with condition_id=None and every later one
                    # would be miscounted as a duplicate of "".
                    reject("missing_condition_id")
                    continue
                if condition_id in seen_conditions:
                    reject("duplicate_condition_id")
                    continue
                seen_conditions.add(condition_id)
                normalized.append(
                    normalize_market(
                        event=event,
                        market=market,
                        page_index=page_index,
                        event_index=event_index,
                        market_index=market_index,
                        fetched_at_utc=fetched_at_utc,
                        output_json_path=args.output_json,
                    )
                )
        # A short page means the server has no further events to offer.
        if len(payload) < args.limit:
            break

    normalized.sort(key=lambda item: (item.get("end_time_utc") or "", item.get("market_slug") or ""))

    if raw_pages:
        last_payload = raw_pages[-1]["response"].get("json")
        # A full final page at the page cap suggests more results were left unfetched.
        if isinstance(last_payload, list) and len(last_payload) == args.limit and len(raw_pages) >= args.max_pages:
            warnings.append(
                "Discovery stopped at max_pages before exhausting Gamma pagination; output is bounded to the fetched pages."
            )
    if len(normalized) < args.min_markets:
        warnings.append(
            f"Only {len(normalized)} markets passed filters; min_markets={args.min_markets}."
        )

    status = "PASS" if len(normalized) >= args.min_markets else "FAIL"
    status_reason = (
        f"Discovered {len(normalized)} active BTC up/down markets with condition IDs and two token IDs."
        if status == "PASS"
        else "Did not discover enough active BTC up/down markets with condition IDs and two token IDs."
    )
    return {
        "schema_name": "polymarket_btc_market_discovery",
        "schema_version": 1,
        "artifact_status": "valid" if status == "PASS" else "partial",
        "checkpoint_id": 3,
        "checkpoint_name": "Minimal BTC Market Discovery",
        "started_at_utc": started_at_utc,
        "ended_at_utc": iso_z(),
        "fetched_at_utc": fetched_at_utc,
        "scope": "Bounded public Gamma metadata discovery only; no order-book collector.",
        "endpoint_basis": {
            "source_checkpoint": "Checkpoint 2",
            "source_report": "reports/checkpoints/checkpoint_002_polymarket_public_sources.md",
            "endpoint": GAMMA_EVENTS_URL,
            "method": "GET",
            "base_params": {
                "tag_id": BTC_TAG_ID,
                "related_tags": True,
                "active": True,
                "closed": False,
                "limit": args.limit,
                "order": "endDate",
                "ascending": True,
            },
        },
        "filter_rules": FILTER_RULES,
        "normalized_markets": normalized,
        "raw": {
            "gamma_events_pages": raw_pages,
        },
        "summary": {
            "status": status,
            "status_reason": status_reason,
            "raw_pages_fetched": len(raw_pages),
            "raw_events_fetched": sum(
                len(page["response"].get("json") or [])
                for page in raw_pages
                if isinstance(page["response"].get("json"), list)
            ),
            "normalized_market_count": len(normalized),
            "rejected_counts": dict(sorted(rejected_counts.items())),
            "warnings": warnings,
        },
        "fake_progress_risk": "Discovery can appear successful while silently missing markets if filters rely on stale text assumptions or bounded pagination. Raw pages and rejection counts are preserved so missed-market risk can be audited.",
        "next_step": "Checkpoint 4 should use this discovery output as input for a short, raw-first order-book snapshot sample; do not claim reliability until the later 24h soak test.",
    }
def markdown_table_row(values: list[Any]) -> str:
    """Format *values* as one row of a pipe-delimited Markdown table.

    Each value is converted with ``str()``. Newlines become spaces so the row
    stays on one line, and literal ``|`` characters are backslash-escaped so
    they are not read as column separators.
    """
    cells = [
        str(value).replace("\n", " ").replace("|", "\\|")
        for value in values
    ]
    return "| " + " | ".join(cells) + " |"
def write_markdown(discovery: dict[str, Any], path: Path) -> None:
    """Write a Markdown report for *discovery* to *path*.

    The report contains the gate status, scope, endpoint, a summary table, one
    table row per normalized market, the warnings, the rejection counts as
    JSON, and the risk and next-step notes. Parent directories are created as
    needed and the file is written as UTF-8.
    """
    summary = discovery["summary"]

    intro = [
        "# Polymarket BTC Markets Discovery",
        "",
        f"Artifact status: `{discovery['artifact_status']}`",
        "",
        "## Gate",
        "",
        f"Status: `{summary['status']}`",
        "",
        summary["status_reason"],
        "",
        "## Scope",
        "",
        "Bounded public Gamma metadata discovery only. No order-book collection, no trading, no private endpoints, no secrets.",
        "",
        "## Endpoint",
        "",
        f"- `GET {GAMMA_EVENTS_URL}`",
        "- Params: `tag_id=235`, `related_tags=true`, `active=true`, `closed=false`, `order=endDate`, `ascending=true`, bounded by `limit` and `max_pages`.",
        "",
    ]

    summary_rows = [
        ["Metric", "Value"],
        ["---", "---"],
        ["fetched_at_utc", discovery["fetched_at_utc"]],
        ["raw_pages_fetched", summary["raw_pages_fetched"]],
        ["raw_events_fetched", summary["raw_events_fetched"]],
        ["normalized_market_count", summary["normalized_market_count"]],
    ]
    summary_section = ["## Summary", ""]
    summary_section += [markdown_table_row(cells) for cells in summary_rows]
    summary_section.append("")

    columns = [
        "market_slug",
        "end_time_utc",
        "condition_id",
        "outcomes",
        "token_ids",
        "accepting_orders",
    ]
    markets_section = [
        "## Markets",
        "",
        markdown_table_row(columns),
        markdown_table_row(["---"] * len(columns)),
    ]
    for record in discovery["normalized_markets"]:
        token_ids = [token["token_id"] for token in record["tokens"]]
        markets_section.append(
            markdown_table_row(
                [
                    record.get("market_slug"),
                    record.get("end_time_utc"),
                    record.get("condition_id"),
                    json.dumps(record.get("outcomes")),
                    json.dumps(token_ids),
                    record.get("accepting_orders"),
                ]
            )
        )

    # An empty warnings list still gets an explicit bullet.
    bullets = [f"- {warning}" for warning in summary["warnings"]] or ["- None."]
    warnings_section = ["", "## Warnings", "", *bullets]

    closing = [
        "",
        "## Rejection Counts",
        "",
        "```json",
        json.dumps(summary["rejected_counts"], indent=2, sort_keys=True),
        "```",
        "",
        "## Raw Preservation",
        "",
        "The latest JSON artifact stores raw Gamma response envelopes under `raw.gamma_events_pages`. Each normalized record has a `raw_ref` pointing back to the source event market.",
        "",
        "## Strongest Fake-Progress Risk",
        "",
        discovery["fake_progress_risk"],
        "",
        "## Next Smallest Step",
        "",
        discovery["next_step"],
        "",
    ]

    document = intro + summary_section + markets_section + warnings_section + closing
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text("\n".join(document), encoding="utf-8")
def write_manifest(
    *,
    discovery: dict[str, Any],
    manifest_path: Path,
    output_json: Path,
    markdown_path: Path,
    command: str,
) -> None:
    """Write the manifest that indexes the discovery run and its output files.

    Args:
        discovery: The discovery document produced by ``discover``.
        manifest_path: Destination of the manifest JSON file.
        output_json: Path of the already-written discovery JSON artifact.
        markdown_path: Path of the already-written Markdown report.
        command: Command line to record in the manifest.

    The manifest lists each output file with an existence/size status and a
    SHA-256 digest, request and row counts, the identifiers of every
    normalized market, and the warnings. The script's own digest is included
    when ``scripts/discover_polymarket_btc_markets.py`` exists relative to the
    current working directory.
    """
    summary = discovery["summary"]
    markets = discovery["normalized_markets"]

    def describe(target: Path, kind: str) -> dict[str, Any]:
        # "valid" requires the file to exist and be non-empty; the digest only
        # requires it to exist.
        present = target.exists()
        return {
            "path": target.as_posix(),
            "kind": kind,
            "status": "valid" if present and target.stat().st_size else "missing",
            "sha256": sha256_file(target) if present else None,
        }

    output_files = [
        describe(output_json, "latest_discovery_json"),
        describe(markdown_path, "discovery_markdown"),
    ]
    script_path = Path("scripts/discover_polymarket_btc_markets.py")
    if script_path.exists():
        output_files.append(
            {
                "path": script_path.as_posix(),
                "kind": "discovery_script",
                "status": "valid",
                "sha256": sha256_file(script_path),
            }
        )

    # Tally HTTP status codes across fetched pages (a missing status counts as "None").
    status_codes: dict[str, int] = {}
    for page in discovery["raw"]["gamma_events_pages"]:
        label = str(page["response"].get("status_code"))
        status_codes[label] = status_codes.get(label, 0) + 1

    market_ids = [
        {
            "market_slug": record.get("market_slug"),
            "condition_id": record.get("condition_id"),
            "token_ids": [token.get("token_id") for token in record.get("tokens", [])],
        }
        for record in markets
    ]
    required_record_fields = [
        "market_name",
        "market_slug",
        "question",
        "condition_id",
        "tokens",
        "outcomes",
        "start_time_utc",
        "end_time_utc",
        "active",
        "closed",
        "accepting_orders",
        "enable_order_book",
        "endpoint_source",
        "fetched_at_utc",
        "raw_ref",
    ]
    manifest = {
        "schema_name": "polymarket_btc_markets_manifest",
        "schema_version": 1,
        "checkpoint_id": 3,
        "checkpoint_name": "Minimal BTC Market Discovery",
        "status": summary["status"],
        "started_at_utc": discovery["started_at_utc"],
        "ended_at_utc": discovery["ended_at_utc"],
        "scope": discovery["scope"],
        "command": command,
        "endpoint": discovery["endpoint_basis"],
        "request_counts": {
            "gamma_events_pages": summary["raw_pages_fetched"],
            "status_code_counts": dict(sorted(status_codes.items())),
        },
        "row_counts": {
            "raw_events_fetched": summary["raw_events_fetched"],
            "normalized_markets": summary["normalized_market_count"],
        },
        "market_ids": market_ids,
        "output_files": output_files,
        "warnings": summary["warnings"],
        "validation": {
            "summary": summary["status_reason"],
            "required_record_fields": required_record_fields,
        },
        "fake_progress_risk": discovery["fake_progress_risk"],
        "next_step": discovery["next_step"],
    }
    manifest_path.parent.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(manifest, indent=2, sort_keys=True)
    manifest_path.write_text(serialized + "\n", encoding="utf-8")
def write_outputs(args: argparse.Namespace, discovery: dict[str, Any]) -> None:
    """Write the JSON artifact, the Markdown report and the manifest, in that order.

    The manifest is written last because it records digests of the other two
    files. The recorded command is the script path followed by the raw
    command-line arguments, joined with spaces.
    """
    json_target = args.output_json
    json_target.parent.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(discovery, indent=2, sort_keys=True)
    json_target.write_text(serialized + "\n", encoding="utf-8")

    write_markdown(discovery, args.markdown)

    invocation = [Path(sys.argv[0]).as_posix()] + sys.argv[1:]
    write_manifest(
        discovery=discovery,
        manifest_path=args.manifest,
        output_json=json_target,
        markdown_path=args.markdown,
        command=" ".join(invocation),
    )
def _positive_int(text: str) -> int:
    """Argparse ``type`` callable: parse *text* as an integer that must be >= 1."""
    try:
        number = int(text)
    except ValueError:
        raise argparse.ArgumentTypeError(f"invalid int value: {text!r}") from None
    if number < 1:
        raise argparse.ArgumentTypeError(f"must be a positive integer, got {number}")
    return number


def parse_args() -> argparse.Namespace:
    """Parse the command-line options for the discovery script.

    Returns:
        A namespace with ``output_json``, ``manifest`` and ``markdown`` paths,
        the paging bounds ``limit`` and ``max_pages``, the request ``timeout``
        in seconds, the ``min_markets`` pass threshold, and the
        ``allow_expired`` / ``allow_non_accepting_orders`` filter overrides.
    """
    parser = argparse.ArgumentParser(
        description="Discover active BTC up/down Polymarket markets from public Gamma metadata."
    )
    parser.add_argument("--output-json", type=Path, default=DEFAULT_OUTPUT_JSON)
    parser.add_argument("--manifest", type=Path, default=DEFAULT_MANIFEST)
    parser.add_argument("--markdown", type=Path, default=DEFAULT_MARKDOWN)
    # A zero or negative page size or page cap makes the offset pagination in
    # discover() meaningless (no pages, or the same offset refetched), so
    # reject those values at the command line.
    parser.add_argument("--limit", type=_positive_int, default=100)
    parser.add_argument("--max-pages", type=_positive_int, default=3)
    parser.add_argument("--timeout", type=float, default=15.0)
    parser.add_argument("--min-markets", type=int, default=1)
    parser.add_argument("--allow-expired", action="store_true")
    parser.add_argument("--allow-non-accepting-orders", action="store_true")
    return parser.parse_args()
def main() -> int:
    """Run discovery, write all outputs and print a JSON run summary.

    Returns:
        ``0`` when the discovery status is ``PASS``, otherwise ``1``.
    """
    args = parse_args()
    discovery = discover(args)
    write_outputs(args, discovery)

    summary = discovery["summary"]
    market_overview = [
        {
            "market_slug": record.get("market_slug"),
            "condition_id": record.get("condition_id"),
            "token_ids": [token.get("token_id") for token in record.get("tokens", [])],
            "end_time_utc": record.get("end_time_utc"),
        }
        for record in discovery["normalized_markets"]
    ]
    report = {
        "status": summary["status"],
        "status_reason": summary["status_reason"],
        "output_json": args.output_json.as_posix(),
        "manifest": args.manifest.as_posix(),
        "markdown": args.markdown.as_posix(),
        "normalized_market_count": summary["normalized_market_count"],
        "markets": market_overview,
        "warnings": summary["warnings"],
    }
    print(json.dumps(report, indent=2, sort_keys=True))
    return int(summary["status"] != "PASS")
# Script entry point: run main() and use its return value as the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())