#!/usr/bin/env python3
"""Discover active Polymarket BTC up/down markets.

Checkpoint 3 scope: fetch bounded public Gamma metadata, preserve raw responses,
and write normalized market records with outcome-token mappings. This is not an
order-book collector.
"""

from __future__ import annotations

import argparse
import datetime as dt
import hashlib
import json
import sys
import time
import urllib.error
import urllib.parse
import urllib.request
from pathlib import Path
from typing import Any

GAMMA_EVENTS_URL = "https://gamma-api.polymarket.com/events"
BTC_TAG_ID = 235
DEFAULT_OUTPUT_JSON = Path("data/discovery/polymarket_btc_markets_latest.json")
DEFAULT_MANIFEST = Path("data/discovery/polymarket_btc_markets_manifest.json")
DEFAULT_MARKDOWN = Path("data/discovery/polymarket_btc_markets.md")

# ``datetime.UTC`` only exists on Python 3.11+; ``timezone.utc`` is the same
# object and is available on every supported interpreter.
UTC = dt.timezone.utc

# Response headers that are copied into the raw artifact. Anything not listed
# here is dropped by ``filter_headers``.
SAFE_RESPONSE_HEADERS = {
    "age",
    "cache-control",
    "cf-cache-status",
    "cf-ray",
    "content-encoding",
    "content-length",
    "content-type",
    "date",
    "expires",
    "last-modified",
    "ratelimit-limit",
    "ratelimit-remaining",
    "ratelimit-reset",
    "retry-after",
    "server",
    "strict-transport-security",
    "x-ratelimit-limit",
    "x-ratelimit-remaining",
    "x-ratelimit-reset",
}

# Human-readable description of the filters applied by ``rejection_reasons``
# and the request parameters used by ``discover``; written into the artifact.
FILTER_RULES = [
    "Use public Gamma /events with tag_id=235, related_tags=true, active=true, closed=false.",
    "Require event.active=true and event.closed=false.",
    "Require market.active=true and market.closed=false.",
    "Require market.enableOrderBook=true.",
    "Require market.acceptingOrders=true unless --allow-non-accepting-orders is used.",
    "Require market end time to be after the fetch time unless --allow-expired is used.",
    "Require outcomes to resolve to exactly Up and Down.",
    "Require clobTokenIds to resolve to exactly two token IDs.",
    "Require market.conditionId to be present.",
    "Require BTC/up-down evidence from seriesSlug, title/slug text, or tags.",
]


def utc_now() -> dt.datetime:
    """Return the current time as a timezone-aware UTC datetime."""
    return dt.datetime.now(UTC)


def iso_z(value: dt.datetime | None = None) -> str:
    """Format *value* (default: now) as a second-precision UTC string ending in ``Z``."""
    value = value or utc_now()
    return value.astimezone(UTC).replace(microsecond=0).isoformat().replace("+00:00", "Z")


def parse_iso(value: Any) -> dt.datetime | None:
    """Parse an ISO-8601 string into an aware UTC datetime.

    Returns ``None`` for non-strings, blank strings, and text that
    ``datetime.fromisoformat`` rejects. Naive timestamps are treated as UTC.
    """
    if not isinstance(value, str) or not value.strip():
        return None
    text = value.strip()
    # Older ``fromisoformat`` implementations do not accept a trailing "Z".
    if text.endswith("Z"):
        text = text[:-1] + "+00:00"
    try:
        parsed = dt.datetime.fromisoformat(text)
    except ValueError:
        return None
    if parsed.tzinfo is None:
        parsed = parsed.replace(tzinfo=UTC)
    return parsed.astimezone(UTC)


def sha256_file(path: Path) -> str:
    """Return the hex SHA-256 digest of the file at *path*, read in 1 MiB chunks."""
    digest = hashlib.sha256()
    with path.open("rb") as handle:
        for chunk in iter(lambda: handle.read(1024 * 1024), b""):
            digest.update(chunk)
    return digest.hexdigest()


def filter_headers(headers: Any) -> dict[str, str]:
    """Keep only the headers whose lower-cased name is in ``SAFE_RESPONSE_HEADERS``.

    *headers* may be any object accepted by ``dict()``. ``None`` yields an empty
    dict so a failure response without headers cannot raise from this helper.
    """
    if headers is None:
        return {}
    return {
        key: value
        for key, value in dict(headers).items()
        if key.lower() in SAFE_RESPONSE_HEADERS
    }


def normalize_params(params: dict[str, Any]) -> dict[str, Any]:
    """Return a copy of *params* with booleans rendered as ``"true"``/``"false"``."""
    normalized: dict[str, Any] = {}
    for key, value in params.items():
        if isinstance(value, bool):
            normalized[key] = "true" if value else "false"
        else:
            normalized[key] = value
    return normalized


def build_url(url: str, params: dict[str, Any]) -> str:
    """Append *params* (after ``normalize_params``) to *url* as a query string."""
    query = urllib.parse.urlencode(normalize_params(params), doseq=True)
    return f"{url}?{query}"


def fetch_json_page(
    *,
    name: str,
    url: str,
    params: dict[str, Any],
    timeout_seconds: float,
) -> dict[str, Any]:
    """Perform one GET request and return an envelope describing what happened.

    The envelope records timing, the request, the filtered response headers,
    the decoded JSON (or a 1000-character text preview when the body is not
    JSON) and an ``ok`` flag that is true only for a 2xx status with no error.
    Request failures are captured in the envelope's ``error`` field rather than
    raised.
    """
    started_monotonic = time.monotonic()
    started_at_utc = iso_z()
    full_url = build_url(url, params)
    request = urllib.request.Request(
        full_url,
        headers={
            "Accept": "application/json",
            "User-Agent": "orderbooks-checkpoint-3-discovery/1.0",
        },
        method="GET",
    )

    status_code: int | None = None
    response_headers: dict[str, str] = {}
    response_text = ""
    error: str | None = None

    try:
        with urllib.request.urlopen(request, timeout=timeout_seconds) as response:
            status_code = response.status
            response_headers = filter_headers(response.headers)
            response_text = response.read().decode("utf-8", errors="replace")
    except urllib.error.HTTPError as exc:
        # Non-2xx responses still carry a status, headers and a body worth keeping.
        status_code = exc.code
        response_headers = filter_headers(exc.headers)
        response_text = exc.read().decode("utf-8", errors="replace")
        error = f"HTTPError: {exc}"
    except Exception as exc:  # noqa: BLE001 - preserve probe failure evidence
        error = f"{type(exc).__name__}: {exc}"

    response_json: Any | None = None
    json_error: str | None = None
    if response_text:
        try:
            response_json = json.loads(response_text)
        except json.JSONDecodeError as exc:
            json_error = str(exc)

    return {
        "name": name,
        "started_at_utc": started_at_utc,
        "ended_at_utc": iso_z(),
        "duration_ms": round((time.monotonic() - started_monotonic) * 1000, 3),
        "request": {
            "method": "GET",
            "url": url,
            "full_url": full_url,
            "params": normalize_params(params),
        },
        "response": {
            "status_code": status_code,
            "headers": response_headers,
            "json": response_json,
            "json_error": json_error,
            "text_preview": response_text[:1000] if response_json is None else None,
        },
        "ok": error is None and status_code is not None and 200 <= status_code < 300,
        "error": error,
    }


def coerce_json_array(value: Any) -> list[Any]:
    """Return *value* as a list, decoding it first when it is a JSON-encoded string.

    Anything that is not a list (or does not decode to one) yields ``[]``.
    """
    if isinstance(value, list):
        return value
    if isinstance(value, str):
        try:
            parsed = json.loads(value)
        except json.JSONDecodeError:
            return []
        return parsed if isinstance(parsed, list) else []
    return []


def lower_text(value: Any) -> str:
    """Return ``str(value)`` lower-cased, mapping falsy values to an empty string."""
    return str(value or "").lower()


def event_tag_text(event: dict[str, Any]) -> str:
    """Join the ``slug`` and ``label`` of every dict tag on *event*, lower-cased."""
    parts: list[str] = []
    for tag in event.get("tags") or []:
        if isinstance(tag, dict):
            parts.append(str(tag.get("slug") or ""))
            parts.append(str(tag.get("label") or ""))
    return " ".join(parts).lower()


def has_btc_up_down_evidence(event: dict[str, Any], market: dict[str, Any]) -> bool:
    """Return True when the series slug, free text, or tags indicate a BTC up/down market.

    The text and tag checks are plain substring matches, so they are
    deliberately permissive; the outcome check in ``rejection_reasons`` is the
    strict part of the filter.
    """
    series_slug = lower_text(event.get("seriesSlug"))
    text = " ".join(
        lower_text(event.get(key)) for key in ("title", "slug", "ticker", "description")
    )
    text += " " + " ".join(
        lower_text(market.get(key)) for key in ("question", "slug", "description")
    )
    tags = event_tag_text(event)

    series_match = series_slug.startswith("btc-up-or-down")
    text_match = ("bitcoin" in text or "btc" in text) and "up" in text and "down" in text
    tag_match = ("bitcoin" in tags or "btc" in tags) and "up-or-down" in tags
    return bool(series_match or text_match or tag_match)


def is_up_down_outcomes(outcomes: list[str]) -> bool:
    """Return True when *outcomes* is exactly the pair Up/Down (case-insensitive)."""
    return len(outcomes) == 2 and {item.lower() for item in outcomes} == {"up", "down"}


def normalize_market(
    *,
    event: dict[str, Any],
    market: dict[str, Any],
    page_index: int,
    event_index: int,
    market_index: int,
    fetched_at_utc: str,
    output_json_path: Path,
) -> dict[str, Any]:
    """Build the normalized record for one market.

    Outcomes and token IDs are paired positionally. Start/end times are
    rendered with ``iso_z`` when they parse and passed through unchanged
    otherwise. ``raw_ref`` points back to the market's position inside the raw
    pages stored in the JSON artifact.
    """
    outcomes = [str(item) for item in coerce_json_array(market.get("outcomes"))]
    token_ids = [str(item) for item in coerce_json_array(market.get("clobTokenIds"))]
    # zip() stops at the shorter list, so mismatched lengths never raise here.
    tokens = [
        {
            "outcome": outcome,
            "token_id": token_id,
            "outcome_index": index,
        }
        for index, (outcome, token_id) in enumerate(zip(outcomes, token_ids))
    ]

    start_time = (
        market.get("startDate")
        or market.get("startDateIso")
        or event.get("startDate")
        or event.get("creationDate")
    )
    end_time = market.get("endDate") or market.get("endDateIso") or event.get("endDate")
    parsed_start = parse_iso(start_time)
    parsed_end = parse_iso(end_time)

    event_slug = event.get("slug")
    market_slug = market.get("slug") or event_slug

    return {
        "market_name": "polymarket",
        "market_slug": market_slug,
        "event_slug": event_slug,
        "title": event.get("title") or market.get("question"),
        "question": market.get("question") or event.get("title"),
        "condition_id": market.get("conditionId"),
        "tokens": tokens,
        "outcomes": outcomes,
        "start_time_utc": iso_z(parsed_start) if parsed_start else start_time,
        "end_time_utc": iso_z(parsed_end) if parsed_end else end_time,
        "active": market.get("active"),
        "closed": market.get("closed"),
        "event_active": event.get("active"),
        "event_closed": event.get("closed"),
        "accepting_orders": market.get("acceptingOrders"),
        "enable_order_book": market.get("enableOrderBook"),
        "endpoint_source": {
            "name": "gamma_events_bitcoin_tag",
            "method": "GET",
            "url": GAMMA_EVENTS_URL,
            "params_basis": {
                "tag_id": BTC_TAG_ID,
                "related_tags": "true",
                "active": "true",
                "closed": "false",
                "order": "endDate",
                "ascending": "true",
            },
        },
        "fetched_at_utc": fetched_at_utc,
        "raw_ref": {
            "artifact_path": output_json_path.as_posix(),
            "section": "raw.gamma_events_pages",
            "page_index": page_index,
            "event_index": event_index,
            "market_index": market_index,
            "json_path": f"raw.gamma_events_pages[{page_index}].response.json[{event_index}].markets[{market_index}]",
        },
    }


def rejection_reasons(
    *,
    event: dict[str, Any],
    market: dict[str, Any],
    fetched_at: dt.datetime,
    require_accepting_orders: bool,
    require_future_end: bool,
) -> list[str]:
    """Return every filter rule the event/market pair fails; empty means accepted.

    Boolean flags must be exactly ``True``/``False`` (identity checks), so a
    missing flag counts as a failure.
    """
    reasons: list[str] = []
    outcomes = [str(item) for item in coerce_json_array(market.get("outcomes"))]
    token_ids = [str(item) for item in coerce_json_array(market.get("clobTokenIds"))]
    end_time = parse_iso(market.get("endDate") or event.get("endDate"))

    if event.get("active") is not True:
        reasons.append("event_not_active")
    if event.get("closed") is not False:
        reasons.append("event_closed")
    if market.get("active") is not True:
        reasons.append("market_not_active")
    if market.get("closed") is not False:
        reasons.append("market_closed")
    if market.get("enableOrderBook") is not True:
        reasons.append("order_book_not_enabled")
    if require_accepting_orders and market.get("acceptingOrders") is not True:
        reasons.append("not_accepting_orders")
    if require_future_end and (end_time is None or end_time <= fetched_at):
        reasons.append("not_future_end")
    if not is_up_down_outcomes(outcomes):
        reasons.append("not_up_down_outcomes")
    if len(token_ids) != 2:
        reasons.append("missing_two_clob_token_ids")
    # The run's status text claims every accepted market has a condition ID,
    # so a market without one must not pass the filters.
    if not str(market.get("conditionId") or "").strip():
        reasons.append("missing_condition_id")
    if not has_btc_up_down_evidence(event, market):
        reasons.append("missing_btc_up_down_evidence")
    return reasons


def _count_rejection(counts: dict[str, int], reason: str) -> None:
    """Increment the tally for *reason* in *counts*."""
    counts[reason] = counts.get(reason, 0) + 1


def discover(args: argparse.Namespace) -> dict[str, Any]:
    """Fetch up to ``args.max_pages`` Gamma pages and build the discovery artifact.

    Paging stops early on a failed request, a non-list payload, or a page
    shorter than ``args.limit``. Every fetched page envelope is kept under
    ``raw.gamma_events_pages``; accepted markets are de-duplicated by condition
    ID and sorted by end time, then slug. The run is ``PASS`` when at least
    ``args.min_markets`` markets were accepted.
    """
    started_at_utc = iso_z()
    fetched_at = utc_now()
    fetched_at_utc = iso_z(fetched_at)
    raw_pages: list[dict[str, Any]] = []
    normalized: list[dict[str, Any]] = []
    rejected_counts: dict[str, int] = {}
    warnings: list[str] = []
    seen_conditions: set[str] = set()

    for page_index in range(args.max_pages):
        offset = page_index * args.limit
        params = {
            "tag_id": BTC_TAG_ID,
            "related_tags": True,
            "active": True,
            "closed": False,
            "limit": args.limit,
            "offset": offset,
            "order": "endDate",
            "ascending": True,
        }
        page = fetch_json_page(
            name=f"gamma_events_bitcoin_tag_page_{page_index}",
            url=GAMMA_EVENTS_URL,
            params=params,
            timeout_seconds=args.timeout,
        )
        # Keep the envelope even when the request failed, for later auditing.
        raw_pages.append(page)
        payload = page["response"]["json"]

        if not page["ok"]:
            warnings.append(
                f"Page {page_index} request failed with status {page['response']['status_code']}: {page['error']}"
            )
            break
        if not isinstance(payload, list):
            warnings.append(f"Page {page_index} response was not a JSON list.")
            break

        for event_index, event in enumerate(payload):
            if not isinstance(event, dict):
                _count_rejection(rejected_counts, "event_not_object")
                continue
            markets = event.get("markets") or []
            if not isinstance(markets, list) or not markets:
                _count_rejection(rejected_counts, "missing_markets")
                continue

            for market_index, market in enumerate(markets):
                if not isinstance(market, dict):
                    _count_rejection(rejected_counts, "market_not_object")
                    continue

                reasons = rejection_reasons(
                    event=event,
                    market=market,
                    fetched_at=fetched_at,
                    require_accepting_orders=not args.allow_non_accepting_orders,
                    require_future_end=not args.allow_expired,
                )
                if reasons:
                    for reason in reasons:
                        _count_rejection(rejected_counts, reason)
                    continue

                condition_id = str(market.get("conditionId") or "")
                if condition_id in seen_conditions:
                    _count_rejection(rejected_counts, "duplicate_condition_id")
                    continue
                seen_conditions.add(condition_id)

                normalized.append(
                    normalize_market(
                        event=event,
                        market=market,
                        page_index=page_index,
                        event_index=event_index,
                        market_index=market_index,
                        fetched_at_utc=fetched_at_utc,
                        output_json_path=args.output_json,
                    )
                )

        # A short page means the listing is exhausted.
        if len(payload) < args.limit:
            break

    normalized.sort(key=lambda item: (item.get("end_time_utc") or "", item.get("market_slug") or ""))

    if raw_pages:
        last_payload = raw_pages[-1]["response"].get("json")
        # A full final page at the page cap means more results may exist upstream.
        if (
            isinstance(last_payload, list)
            and len(last_payload) == args.limit
            and len(raw_pages) >= args.max_pages
        ):
            warnings.append(
                "Discovery stopped at max_pages before exhausting Gamma pagination; output is bounded to the fetched pages."
            )

    if len(normalized) < args.min_markets:
        warnings.append(
            f"Only {len(normalized)} markets passed filters; min_markets={args.min_markets}."
        )

    status = "PASS" if len(normalized) >= args.min_markets else "FAIL"
    status_reason = (
        f"Discovered {len(normalized)} active BTC up/down markets with condition IDs and two token IDs."
        if status == "PASS"
        else "Did not discover enough active BTC up/down markets with condition IDs and two token IDs."
    )

    return {
        "schema_name": "polymarket_btc_market_discovery",
        "schema_version": 1,
        "artifact_status": "valid" if status == "PASS" else "partial",
        "checkpoint_id": 3,
        "checkpoint_name": "Minimal BTC Market Discovery",
        "started_at_utc": started_at_utc,
        "ended_at_utc": iso_z(),
        "fetched_at_utc": fetched_at_utc,
        "scope": "Bounded public Gamma metadata discovery only; no order-book collector.",
        "endpoint_basis": {
            "source_checkpoint": "Checkpoint 2",
            "source_report": "reports/checkpoints/checkpoint_002_polymarket_public_sources.md",
            "endpoint": GAMMA_EVENTS_URL,
            "method": "GET",
            "base_params": {
                "tag_id": BTC_TAG_ID,
                "related_tags": True,
                "active": True,
                "closed": False,
                "limit": args.limit,
                "order": "endDate",
                "ascending": True,
            },
        },
        "filter_rules": FILTER_RULES,
        "normalized_markets": normalized,
        "raw": {
            "gamma_events_pages": raw_pages,
        },
        "summary": {
            "status": status,
            "status_reason": status_reason,
            "raw_pages_fetched": len(raw_pages),
            "raw_events_fetched": sum(
                len(page["response"].get("json") or [])
                for page in raw_pages
                if isinstance(page["response"].get("json"), list)
            ),
            "normalized_market_count": len(normalized),
            "rejected_counts": dict(sorted(rejected_counts.items())),
            "warnings": warnings,
        },
        "fake_progress_risk": "Discovery can appear successful while silently missing markets if filters rely on stale text assumptions or bounded pagination. Raw pages and rejection counts are preserved so missed-market risk can be audited.",
        "next_step": "Checkpoint 4 should use this discovery output as input for a short, raw-first order-book snapshot sample; do not claim reliability until the later 24h soak test.",
    }


def markdown_table_row(values: list[Any]) -> str:
    """Render *values* as one markdown table row, flattening embedded newlines."""
    return "| " + " | ".join(str(value).replace("\n", " ") for value in values) + " |"


def write_markdown(discovery: dict[str, Any], path: Path) -> None:
    """Write the human-readable markdown report for *discovery* to *path*.

    Parent directories are created as needed.
    """
    summary = discovery["summary"]
    rows = discovery["normalized_markets"]
    lines = [
        "# Polymarket BTC Markets Discovery",
        "",
        f"Artifact status: `{discovery['artifact_status']}`",
        "",
        "## Gate",
        "",
        f"Status: `{summary['status']}`",
        "",
        summary["status_reason"],
        "",
        "## Scope",
        "",
        "Bounded public Gamma metadata discovery only. No order-book collection, no trading, no private endpoints, no secrets.",
        "",
        "## Endpoint",
        "",
        f"- `GET {GAMMA_EVENTS_URL}`",
        f"- Params: `tag_id={BTC_TAG_ID}`, `related_tags=true`, `active=true`, `closed=false`, `order=endDate`, `ascending=true`, bounded by `limit` and `max_pages`.",
        "",
        "## Summary",
        "",
        markdown_table_row(["Metric", "Value"]),
        markdown_table_row(["---", "---"]),
        markdown_table_row(["fetched_at_utc", discovery["fetched_at_utc"]]),
        markdown_table_row(["raw_pages_fetched", summary["raw_pages_fetched"]]),
        markdown_table_row(["raw_events_fetched", summary["raw_events_fetched"]]),
        markdown_table_row(["normalized_market_count", summary["normalized_market_count"]]),
        "",
        "## Markets",
        "",
        markdown_table_row(
            [
                "market_slug",
                "end_time_utc",
                "condition_id",
                "outcomes",
                "token_ids",
                "accepting_orders",
            ]
        ),
        markdown_table_row(["---", "---", "---", "---", "---", "---"]),
    ]

    for row in rows:
        token_ids = [token["token_id"] for token in row["tokens"]]
        lines.append(
            markdown_table_row(
                [
                    row.get("market_slug"),
                    row.get("end_time_utc"),
                    row.get("condition_id"),
                    json.dumps(row.get("outcomes")),
                    json.dumps(token_ids),
                    row.get("accepting_orders"),
                ]
            )
        )

    lines.extend(
        [
            "",
            "## Warnings",
            "",
        ]
    )
    if summary["warnings"]:
        lines.extend(f"- {warning}" for warning in summary["warnings"])
    else:
        lines.append("- None.")

    lines.extend(
        [
            "",
            "## Rejection Counts",
            "",
            "```json",
            json.dumps(summary["rejected_counts"], indent=2, sort_keys=True),
            "```",
            "",
            "## Raw Preservation",
            "",
            "The latest JSON artifact stores raw Gamma response envelopes under `raw.gamma_events_pages`. Each normalized record has a `raw_ref` pointing back to the source event market.",
            "",
            "## Strongest Fake-Progress Risk",
            "",
            discovery["fake_progress_risk"],
            "",
            "## Next Smallest Step",
            "",
            discovery["next_step"],
            "",
        ]
    )
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text("\n".join(lines), encoding="utf-8")


def write_manifest(
    *,
    discovery: dict[str, Any],
    manifest_path: Path,
    output_json: Path,
    markdown_path: Path,
    command: str,
) -> None:
    """Write the run manifest: outputs with SHA-256 digests, counts, and market IDs.

    Must run after the JSON and markdown outputs exist, because their digests
    and sizes are read from disk here. The script's own digest is included only
    when ``scripts/discover_polymarket_btc_markets.py`` exists relative to the
    current working directory.
    """
    status = discovery["summary"]["status"]
    output_files = [
        {
            "path": output_json.as_posix(),
            "kind": "latest_discovery_json",
            "status": "valid" if output_json.exists() and output_json.stat().st_size else "missing",
            "sha256": sha256_file(output_json) if output_json.exists() else None,
        },
        {
            "path": markdown_path.as_posix(),
            "kind": "discovery_markdown",
            "status": "valid" if markdown_path.exists() and markdown_path.stat().st_size else "missing",
            "sha256": sha256_file(markdown_path) if markdown_path.exists() else None,
        },
    ]
    script_path = Path("scripts/discover_polymarket_btc_markets.py")
    if script_path.exists():
        output_files.append(
            {
                "path": script_path.as_posix(),
                "kind": "discovery_script",
                "status": "valid",
                "sha256": sha256_file(script_path),
            }
        )

    # Tally HTTP status codes; a request that never got a response counts as "None".
    status_codes: dict[str, int] = {}
    for page in discovery["raw"]["gamma_events_pages"]:
        code = str(page["response"].get("status_code"))
        status_codes[code] = status_codes.get(code, 0) + 1

    manifest = {
        "schema_name": "polymarket_btc_markets_manifest",
        "schema_version": 1,
        "checkpoint_id": 3,
        "checkpoint_name": "Minimal BTC Market Discovery",
        "status": status,
        "started_at_utc": discovery["started_at_utc"],
        "ended_at_utc": discovery["ended_at_utc"],
        "scope": discovery["scope"],
        "command": command,
        "endpoint": discovery["endpoint_basis"],
        "request_counts": {
            "gamma_events_pages": discovery["summary"]["raw_pages_fetched"],
            "status_code_counts": dict(sorted(status_codes.items())),
        },
        "row_counts": {
            "raw_events_fetched": discovery["summary"]["raw_events_fetched"],
            "normalized_markets": discovery["summary"]["normalized_market_count"],
        },
        "market_ids": [
            {
                "market_slug": row.get("market_slug"),
                "condition_id": row.get("condition_id"),
                "token_ids": [token.get("token_id") for token in row.get("tokens", [])],
            }
            for row in discovery["normalized_markets"]
        ],
        "output_files": output_files,
        "warnings": discovery["summary"]["warnings"],
        "validation": {
            "summary": discovery["summary"]["status_reason"],
            "required_record_fields": [
                "market_name",
                "market_slug",
                "question",
                "condition_id",
                "tokens",
                "outcomes",
                "start_time_utc",
                "end_time_utc",
                "active",
                "closed",
                "accepting_orders",
                "enable_order_book",
                "endpoint_source",
                "fetched_at_utc",
                "raw_ref",
            ],
        },
        "fake_progress_risk": discovery["fake_progress_risk"],
        "next_step": discovery["next_step"],
    }
    manifest_path.parent.mkdir(parents=True, exist_ok=True)
    manifest_path.write_text(json.dumps(manifest, indent=2, sort_keys=True) + "\n", encoding="utf-8")


def write_outputs(args: argparse.Namespace, discovery: dict[str, Any]) -> None:
    """Write the JSON artifact, then the markdown report, then the manifest.

    The manifest is written last because it hashes the other two files.
    """
    args.output_json.parent.mkdir(parents=True, exist_ok=True)
    args.output_json.write_text(
        json.dumps(discovery, indent=2, sort_keys=True) + "\n",
        encoding="utf-8",
    )
    write_markdown(discovery, args.markdown)
    command = " ".join([Path(sys.argv[0]).as_posix(), *sys.argv[1:]])
    write_manifest(
        discovery=discovery,
        manifest_path=args.manifest,
        output_json=args.output_json,
        markdown_path=args.markdown,
        command=command,
    )


def parse_args() -> argparse.Namespace:
    """Parse the command-line options for the discovery run."""
    parser = argparse.ArgumentParser(
        description="Discover active BTC up/down Polymarket markets from public Gamma metadata."
    )
    parser.add_argument("--output-json", type=Path, default=DEFAULT_OUTPUT_JSON)
    parser.add_argument("--manifest", type=Path, default=DEFAULT_MANIFEST)
    parser.add_argument("--markdown", type=Path, default=DEFAULT_MARKDOWN)
    parser.add_argument("--limit", type=int, default=100)
    parser.add_argument("--max-pages", type=int, default=3)
    parser.add_argument("--timeout", type=float, default=15.0)
    parser.add_argument("--min-markets", type=int, default=1)
    parser.add_argument("--allow-expired", action="store_true")
    parser.add_argument("--allow-non-accepting-orders", action="store_true")
    return parser.parse_args()


def main() -> int:
    """Run discovery, write all outputs, print a JSON summary; return 0 on PASS, 1 otherwise."""
    args = parse_args()
    discovery = discover(args)
    write_outputs(args, discovery)
    print(
        json.dumps(
            {
                "status": discovery["summary"]["status"],
                "status_reason": discovery["summary"]["status_reason"],
                "output_json": args.output_json.as_posix(),
                "manifest": args.manifest.as_posix(),
                "markdown": args.markdown.as_posix(),
                "normalized_market_count": discovery["summary"]["normalized_market_count"],
                "markets": [
                    {
                        "market_slug": row.get("market_slug"),
                        "condition_id": row.get("condition_id"),
                        "token_ids": [token.get("token_id") for token in row.get("tokens", [])],
                        "end_time_utc": row.get("end_time_utc"),
                    }
                    for row in discovery["normalized_markets"]
                ],
                "warnings": discovery["summary"]["warnings"],
            },
            indent=2,
            sort_keys=True,
        )
    )
    return 0 if discovery["summary"]["status"] == "PASS" else 1


if __name__ == "__main__":
    raise SystemExit(main())