orderbooks/scripts/run_polymarket_24h_soak.sh
philipp 284e465588
Some checks failed
deploy / deploy (push) Has been cancelled
Prepare Kubernetes orderbooks deployment
2026-04-18 11:23:28 +02:00

362 lines
12 KiB
Bash
Executable file

#!/usr/bin/env bash
#
# Controller for the Polymarket order-book soak run.
#
# Every setting below has a default and can be overridden through the
# matching ORDERBOOKS_* environment variable. This section resolves the
# settings, derives the working paths, initialises the controller state and
# computes the expected completion time of the soak window.
#
# Note: -e is not enabled; step exit codes are captured explicitly instead.
set -uo pipefail

# --- Tunables (environment overrides) ---------------------------------------
APP_DIR="${ORDERBOOKS_APP_DIR:-$(pwd)}"
PYTHON_BIN="${ORDERBOOKS_PYTHON:-python3}"
RCLONE_BIN="${ORDERBOOKS_RCLONE_BIN:-/usr/bin/rclone}"
RCLONE_DEST_BASE="${ORDERBOOKS_RCLONE_DEST:-gdrive:orderbooks/polymarket/soak-test}"
SOAK_DATE="${ORDERBOOKS_SOAK_DATE:-$(date -u +%F)}"
SOAK_ID="${ORDERBOOKS_SOAK_ID:-soak_test_${SOAK_DATE}}"
SOAK_SECONDS="${ORDERBOOKS_SOAK_SECONDS:-86400}"
CYCLE_SECONDS="${ORDERBOOKS_SOAK_CYCLE_SECONDS:-300}"
INTERVAL_SECONDS="${ORDERBOOKS_SOAK_INTERVAL_SECONDS:-30}"
MARKET_LIMIT="${ORDERBOOKS_SOAK_MARKET_LIMIT:-2}"
MARKET_END_SAFETY_SECONDS="${ORDERBOOKS_SOAK_MARKET_END_SAFETY_SECONDS:-420}"
REQUEST_TIMEOUT_SECONDS="${ORDERBOOKS_SOAK_REQUEST_TIMEOUT_SECONDS:-15}"
MAX_RETRIES="${ORDERBOOKS_SOAK_MAX_RETRIES:-2}"
BACKOFF_SECONDS="${ORDERBOOKS_SOAK_BACKOFF_SECONDS:-2}"
DISCOVERY_LIMIT="${ORDERBOOKS_SOAK_DISCOVERY_LIMIT:-100}"
DISCOVERY_MAX_PAGES="${ORDERBOOKS_SOAK_DISCOVERY_MAX_PAGES:-3}"
DISCOVERY_TIMEOUT="${ORDERBOOKS_SOAK_DISCOVERY_TIMEOUT:-15}"
LOCAL_ROOT="${ORDERBOOKS_SOAK_LOCAL_ROOT:-data/soak_test/${SOAK_DATE}}"
MANIFEST_ROOT="${ORDERBOOKS_SOAK_MANIFEST_ROOT:-data/manifests/${SOAK_ID}}"
START_MANIFEST="${ORDERBOOKS_SOAK_START_MANIFEST:-data/manifests/${SOAK_ID}_start.json}"
FINAL_MANIFEST="${ORDERBOOKS_SOAK_FINAL_MANIFEST:-data/manifests/${SOAK_ID}_final.json}"

# --- Derived paths ----------------------------------------------------------
DISCOVERY_DIR="${LOCAL_ROOT}/discovery"
LIVE_DIR="${LOCAL_ROOT}/live_sample"
LOG_DIR="${LOCAL_ROOT}/logs"
PID_FILE="${LOCAL_ROOT}/soak.pid"
CYCLES_JSONL="${MANIFEST_ROOT}/cycles.jsonl"
LOG_FILE="${LOG_DIR}/soak.log"
REMOTE_DEST="${RCLONE_DEST_BASE%/}/${SOAK_DATE}"

# --- Controller state (mutated by the signal handler and the main loop) -----
STOP_REQUESTED=0
STOP_SIGNAL=""
CURRENT_CHILD_PID=""
CURRENT_PHASE="initializing"
CURRENT_CYCLE_ID=""
START_WRITTEN=0
FINAL_WRITTEN=0

# These two values are fed to shell arithmetic and to Python int(); reject
# anything that is not a plain non-negative integer before it gets there.
for numeric_setting in SOAK_SECONDS CYCLE_SECONDS; do
  if [[ ! "${!numeric_setting}" =~ ^[0-9]+$ ]]; then
    printf '%s must be a non-negative integer, got: %s\n' \
      "${numeric_setting}" "${!numeric_setting}" >&2
    exit 2
  fi
  # Force base 10 so a value such as 0300 is not read as octal by bash.
  printf -v "${numeric_setting}" '%d' "$((10#${!numeric_setting}))"
done
unset numeric_setting

cd "${APP_DIR}" || exit 2
if ! mkdir -p "${DISCOVERY_DIR}" "${LIVE_DIR}" "${LOG_DIR}" "${MANIFEST_ROOT}" \
  "$(dirname "${START_MANIFEST}")" "$(dirname "${FINAL_MANIFEST}")"; then
  printf 'cannot create soak working directories under %s\n' "${APP_DIR}" >&2
  exit 2
fi

STARTED_AT="$(date -u +%Y-%m-%dT%H:%M:%SZ)"
START_EPOCH="$(date -u +%s)"
END_EPOCH="$((START_EPOCH + SOAK_SECONDS))"
# `date -d @epoch` is a GNU extension; fall back to the configured Python
# interpreter elsewhere. The epoch is passed as an argument (not spliced into
# the program text) and timezone.utc is used so older Python 3 releases work.
EXPECTED_COMPLETION_AT="$(
  date -u -d "@${END_EPOCH}" +%Y-%m-%dT%H:%M:%SZ 2>/dev/null \
    || "${PYTHON_BIN}" - "${END_EPOCH}" <<'PY'
import datetime as dt
import sys

moment = dt.datetime.fromtimestamp(int(sys.argv[1]), dt.timezone.utc)
print(moment.strftime("%Y-%m-%dT%H:%M:%SZ"))
PY
)"
#######################################
# Append one UTC-timestamped line to the soak log without ever failing.
# Used from the signal handler, where a logging error must not matter.
# Globals:   LOG_FILE (read)
# Arguments: message words, joined into a single line
# Returns:   always 0
#######################################
safe_log() {
  local stamp
  stamp="$(date -u +%Y-%m-%dT%H:%M:%SZ)"
  printf '%s %s\n' "${stamp}" "$*" >> "${LOG_FILE}" 2>/dev/null
  # The write is best effort; its status is deliberately discarded.
  return 0
}
#######################################
# Emit one UTC-timestamped line on stdout and append it to the soak log.
# Globals:   LOG_FILE (read)
# Arguments: message words, joined into a single line
# Outputs:   the formatted line on stdout
# Returns:   status of the printf | tee pipeline
#######################################
log() {
  local stamp
  local entry
  stamp="$(date -u +%Y-%m-%dT%H:%M:%SZ)"
  entry="${stamp} $*"
  printf '%s\n' "${entry}" | tee -a "${LOG_FILE}"
}
#######################################
# Trap handler: record the stop request and relay the signal to the child
# that is currently running, if any.
# Globals:   STOP_REQUESTED, STOP_SIGNAL (written)
#            CURRENT_PHASE, CURRENT_CYCLE_ID, CURRENT_CHILD_PID (read)
# Arguments: $1 - signal name as registered in the trap
#                 (SIGINT, SIGTERM or SIGHUP are relayed; others are not)
# Returns:   always 0
#######################################
handle_signal() {
  local received="$1"
  local relay=""

  STOP_SIGNAL="${received}"
  STOP_REQUESTED=1
  safe_log "SIGNAL received=${received} phase=${CURRENT_PHASE} cycle_id=${CURRENT_CYCLE_ID:-none}"

  # Nothing to relay when no child is tracked or it has already gone away.
  if [[ -z "${CURRENT_CHILD_PID}" ]]; then
    return 0
  fi
  if ! kill -0 "${CURRENT_CHILD_PID}" 2>/dev/null; then
    return 0
  fi

  case "${received}" in
    SIGINT) relay="INT" ;;
    SIGTERM) relay="TERM" ;;
    SIGHUP) relay="HUP" ;;
    *) return 0 ;;
  esac
  kill "-${relay}" "${CURRENT_CHILD_PID}" 2>/dev/null || true
}
#######################################
# Write the soak start marker (status STARTED) as JSON.
#
# The document is written to "<START_MANIFEST>.tmp" and then renamed over
# START_MANIFEST. All values reach Python through the environment and argv;
# the program text is a quoted heredoc, so quotes or backslashes in a path
# cannot break or alter the Python source.
#
# Globals:   START_MANIFEST, PYTHON_BIN, STARTED_AT, EXPECTED_COMPLETION_AT,
#            SOAK_SECONDS, CYCLE_SECONDS, PID_FILE, LOG_FILE, LOCAL_ROOT,
#            MANIFEST_ROOT, REMOTE_DEST, LIVE_DIR, DISCOVERY_DIR,
#            CYCLES_JSONL (read); START_WRITTEN (set to 1 on success only)
# Arguments: none
# Returns:   0 when the manifest was written, otherwise the interpreter's
#            non-zero status
#######################################
write_start_manifest() {
  local tmp_path="${START_MANIFEST}.tmp"
  local rc=0

  SOAK_MANIFEST_STARTED_AT="${STARTED_AT}" \
  SOAK_MANIFEST_EXPECTED_COMPLETION_AT="${EXPECTED_COMPLETION_AT}" \
  SOAK_MANIFEST_SOAK_SECONDS="${SOAK_SECONDS}" \
  SOAK_MANIFEST_CYCLE_SECONDS="${CYCLE_SECONDS}" \
  SOAK_MANIFEST_PID="$$" \
  SOAK_MANIFEST_PID_FILE="${PID_FILE}" \
  SOAK_MANIFEST_LOG_FILE="${LOG_FILE}" \
  SOAK_MANIFEST_LOCAL_ROOT="${LOCAL_ROOT}" \
  SOAK_MANIFEST_MANIFEST_ROOT="${MANIFEST_ROOT}" \
  SOAK_MANIFEST_REMOTE_DEST="${REMOTE_DEST}" \
  SOAK_MANIFEST_LIVE_DIR="${LIVE_DIR}" \
  SOAK_MANIFEST_DISCOVERY_DIR="${DISCOVERY_DIR}" \
  SOAK_MANIFEST_CYCLES_JSONL="${CYCLES_JSONL}" \
    "${PYTHON_BIN}" - "${tmp_path}" "${START_MANIFEST}" <<'PY' || rc=$?
import json
import os
import pathlib
import sys

env = os.environ
tmp_path = pathlib.Path(sys.argv[1])
final_path = pathlib.Path(sys.argv[2])
manifest = {
    "schema_name": "soak_test_start_manifest",
    "schema_version": 1,
    "checkpoint_id": 8,
    "checkpoint_name": "24h Soak Test Plan",
    "status": "STARTED",
    "started_at_utc": env["SOAK_MANIFEST_STARTED_AT"],
    "expected_completion_at_utc": env["SOAK_MANIFEST_EXPECTED_COMPLETION_AT"],
    "soak_seconds": int(env["SOAK_MANIFEST_SOAK_SECONDS"]),
    "cycle_seconds": int(env["SOAK_MANIFEST_CYCLE_SECONDS"]),
    "pid": int(env["SOAK_MANIFEST_PID"]),
    "pid_file": env["SOAK_MANIFEST_PID_FILE"],
    "log_file": env["SOAK_MANIFEST_LOG_FILE"],
    "local_root": env["SOAK_MANIFEST_LOCAL_ROOT"],
    "manifest_root": env["SOAK_MANIFEST_MANIFEST_ROOT"],
    "remote_dest": env["SOAK_MANIFEST_REMOTE_DEST"],
    "raw_output_dir": env["SOAK_MANIFEST_LIVE_DIR"],
    "discovery_dir": env["SOAK_MANIFEST_DISCOVERY_DIR"],
    "cycles_jsonl": env["SOAK_MANIFEST_CYCLES_JSONL"],
    "gate_status": "IN_PROGRESS",
    "production_ready": False,
    "notes": [
        "This is a real 24h soak start marker, not a completion report.",
        "Checkpoint 8 cannot pass until 24 real hours elapse and final metrics are validated.",
    ],
}
tmp_path.parent.mkdir(parents=True, exist_ok=True)
tmp_path.write_text(json.dumps(manifest, indent=2, sort_keys=True) + "\n", encoding="utf-8")
os.replace(tmp_path, final_path)
PY

  if [[ "${rc}" -ne 0 ]]; then
    # Do not leave a half-written temp file behind, and do not claim success.
    rm -f -- "${tmp_path}"
    return "${rc}"
  fi
  START_WRITTEN=1
}
#######################################
# Append one cycle record as a single line to the cycles JSONL file.
# Globals:   CYCLES_JSONL (read)
# Arguments: $1 - the record text, written verbatim followed by a newline
# Returns:   status of the append
#######################################
write_cycle_record() {
  local json_line="$1"
  local target="${CYCLES_JSONL}"
  printf '%s\n' "${json_line}" >> "${target}"
}
#######################################
# Write the soak final marker as JSON, embedding every record found in
# CYCLES_JSONL together with per-status counts.
#
# The document is written to "<FINAL_MANIFEST>.tmp" and then renamed over
# FINAL_MANIFEST. All values reach Python through the environment and argv;
# the program text is a quoted heredoc, so quotes or backslashes in a value
# cannot break or alter the Python source.
#
# Globals:   FINAL_MANIFEST, PYTHON_BIN, CYCLES_JSONL, STARTED_AT,
#            EXPECTED_COMPLETION_AT, SOAK_SECONDS, CYCLE_SECONDS, PID_FILE,
#            LOG_FILE, LOCAL_ROOT, MANIFEST_ROOT, REMOTE_DEST, STOP_REQUESTED,
#            STOP_SIGNAL, CURRENT_PHASE, CURRENT_CYCLE_ID (read);
#            FINAL_WRITTEN (set to 1 on success only)
# Arguments: $1 - value for "status"
#            $2 - value for "gate_status"
#            $3 - value for "exit_reason"
# Returns:   0 when the manifest was written, otherwise the interpreter's
#            non-zero status
#######################################
write_final_manifest() {
  local final_status="$1"
  local gate_status="$2"
  local exit_reason="$3"
  local ended_at
  local tmp_path="${FINAL_MANIFEST}.tmp"
  local rc=0

  ended_at="$(date -u +%Y-%m-%dT%H:%M:%SZ)"

  SOAK_MANIFEST_STATUS="${final_status}" \
  SOAK_MANIFEST_GATE_STATUS="${gate_status}" \
  SOAK_MANIFEST_EXIT_REASON="${exit_reason}" \
  SOAK_MANIFEST_STARTED_AT="${STARTED_AT}" \
  SOAK_MANIFEST_ENDED_AT="${ended_at}" \
  SOAK_MANIFEST_EXPECTED_COMPLETION_AT="${EXPECTED_COMPLETION_AT}" \
  SOAK_MANIFEST_SOAK_SECONDS="${SOAK_SECONDS}" \
  SOAK_MANIFEST_CYCLE_SECONDS="${CYCLE_SECONDS}" \
  SOAK_MANIFEST_CYCLES_JSONL="${CYCLES_JSONL}" \
  SOAK_MANIFEST_PID="$$" \
  SOAK_MANIFEST_PID_FILE="${PID_FILE}" \
  SOAK_MANIFEST_LOG_FILE="${LOG_FILE}" \
  SOAK_MANIFEST_LOCAL_ROOT="${LOCAL_ROOT}" \
  SOAK_MANIFEST_MANIFEST_ROOT="${MANIFEST_ROOT}" \
  SOAK_MANIFEST_REMOTE_DEST="${REMOTE_DEST}" \
  SOAK_MANIFEST_STOP_REQUESTED="${STOP_REQUESTED}" \
  SOAK_MANIFEST_STOP_SIGNAL="${STOP_SIGNAL}" \
  SOAK_MANIFEST_CURRENT_PHASE="${CURRENT_PHASE}" \
  SOAK_MANIFEST_CURRENT_CYCLE_ID="${CURRENT_CYCLE_ID}" \
    "${PYTHON_BIN}" - "${tmp_path}" "${FINAL_MANIFEST}" <<'PY' || rc=$?
import json
import os
import pathlib
import sys

env = os.environ
tmp_path = pathlib.Path(sys.argv[1])
final_path = pathlib.Path(sys.argv[2])
cycles_path = pathlib.Path(env["SOAK_MANIFEST_CYCLES_JSONL"])
cycles = []
if cycles_path.exists():
    # Blank lines are skipped; every other line must be a JSON document.
    cycles = [
        json.loads(line)
        for line in cycles_path.read_text(encoding="utf-8").splitlines()
        if line.strip()
    ]
manifest = {
    "schema_name": "soak_test_final_manifest",
    "schema_version": 1,
    "checkpoint_id": 8,
    "checkpoint_name": "24h Soak Test Plan",
    "status": env["SOAK_MANIFEST_STATUS"],
    "gate_status": env["SOAK_MANIFEST_GATE_STATUS"],
    "exit_reason": env["SOAK_MANIFEST_EXIT_REASON"],
    "started_at_utc": env["SOAK_MANIFEST_STARTED_AT"],
    "ended_at_utc": env["SOAK_MANIFEST_ENDED_AT"],
    "expected_completion_at_utc": env["SOAK_MANIFEST_EXPECTED_COMPLETION_AT"],
    "soak_seconds": int(env["SOAK_MANIFEST_SOAK_SECONDS"]),
    "cycle_seconds": int(env["SOAK_MANIFEST_CYCLE_SECONDS"]),
    "cycles": cycles,
    "cycle_count": len(cycles),
    "ok_cycle_count": sum(1 for cycle in cycles if cycle.get("status") == "OK"),
    "error_cycle_count": sum(1 for cycle in cycles if cycle.get("status") == "ERROR"),
    "interrupted_cycle_count": sum(1 for cycle in cycles if cycle.get("status") == "INTERRUPTED"),
    "pid": int(env["SOAK_MANIFEST_PID"]),
    "pid_file": env["SOAK_MANIFEST_PID_FILE"],
    "log_file": env["SOAK_MANIFEST_LOG_FILE"],
    "local_root": env["SOAK_MANIFEST_LOCAL_ROOT"],
    "manifest_root": env["SOAK_MANIFEST_MANIFEST_ROOT"],
    "remote_dest": env["SOAK_MANIFEST_REMOTE_DEST"],
    "stop_requested": bool(int(env["SOAK_MANIFEST_STOP_REQUESTED"])),
    "stop_signal": env["SOAK_MANIFEST_STOP_SIGNAL"],
    "current_phase_at_exit": env["SOAK_MANIFEST_CURRENT_PHASE"],
    "current_cycle_id_at_exit": env["SOAK_MANIFEST_CURRENT_CYCLE_ID"],
    "production_ready": False,
    "notes": [
        "This marker is written by the soak controller on completion, interruption, or error.",
        "Checkpoint 8 cannot be PASS until 24 real hours elapse and final metrics are validated.",
    ],
}
tmp_path.parent.mkdir(parents=True, exist_ok=True)
tmp_path.write_text(json.dumps(manifest, indent=2, sort_keys=True) + "\n", encoding="utf-8")
os.replace(tmp_path, final_path)
PY

  if [[ "${rc}" -ne 0 ]]; then
    # Leave FINAL_WRITTEN untouched so the failure stays visible to callers,
    # and remove any half-written temp file.
    rm -f -- "${tmp_path}"
    return "${rc}"
  fi
  FINAL_WRITTEN=1
}
#######################################
# EXIT trap: make sure a started run leaves a final marker, drop the PID
# file if it still belongs to this process, and exit with the status the
# script was already exiting with.
# Globals:   START_WRITTEN, FINAL_WRITTEN, STOP_REQUESTED, STOP_SIGNAL,
#            PID_FILE (read)
# Arguments: none ($? at entry is taken as the script's exit status)
# Returns:   does not return; exits with the captured status
#######################################
cleanup_on_exit() {
  local exit_status=$?
  local verdict="ERROR"
  local reason="exited_without_final_marker"
  local recorded_pid=""

  # A final marker is owed only if the start marker went out and nothing
  # has written the final one yet.
  if [[ "${START_WRITTEN}" -eq 1 && "${FINAL_WRITTEN}" -eq 0 ]]; then
    if [[ "${STOP_REQUESTED}" -eq 1 ]]; then
      verdict="INTERRUPTED"
      reason="${STOP_SIGNAL:-signal}"
    elif [[ "${exit_status}" -ne 0 ]]; then
      reason="exit_code_${exit_status}"
    fi
    write_final_manifest "${verdict}" "${verdict}" "${reason}"
  fi

  # Only remove the PID file when it records this very process.
  if [[ -f "${PID_FILE}" ]]; then
    recorded_pid="$(cat "${PID_FILE}" 2>/dev/null)"
    if [[ "${recorded_pid}" == "$$" ]]; then
      rm -f "${PID_FILE}"
    fi
  fi

  exit "${exit_status}"
}
#######################################
# Run a command in the background with stdout and stderr appended to the
# soak log, and wait for it to finish.
#
# The PID is published in CURRENT_CHILD_PID while the command runs so the
# signal handler can relay signals to it. A trapped signal makes `wait`
# return early with a status above 128 while the child may still be alive;
# in that case waiting is resumed until the child has really exited, however
# many signals arrive, so the caller never proceeds with the child running.
#
# Globals:   LOG_FILE, STOP_REQUESTED (read); CURRENT_CHILD_PID (written)
# Arguments: the command and its arguments
# Returns:   the command's exit status
#######################################
run_logged() {
  local rc=0

  "$@" >> "${LOG_FILE}" 2>&1 &
  CURRENT_CHILD_PID="$!"

  wait "${CURRENT_CHILD_PID}" || rc=$?
  # rc > 128 with the child still alive means the wait itself was cut short
  # by a trapped signal, not that the child died.
  while [[ "${STOP_REQUESTED}" -eq 1 ]] && (( rc > 128 )) \
    && kill -0 "${CURRENT_CHILD_PID}" 2>/dev/null; do
    rc=0
    wait "${CURRENT_CHILD_PID}" || rc=$?
  done

  CURRENT_CHILD_PID=""
  return "${rc}"
}
# Startup: install the signal and exit handlers first, then publish the PID
# file and the start marker, and refuse to continue (exit 3) if the start
# marker is missing or empty.
for trapped_signal in INT TERM HUP; do
  # shellcheck disable=SC2064 — the signal name must be expanded right now.
  trap "handle_signal SIG${trapped_signal}" "${trapped_signal}"
done
unset trapped_signal
trap cleanup_on_exit EXIT

printf '%s\n' "$$" > "${PID_FILE}"
write_start_manifest
if [[ ! -s "${START_MANIFEST}" ]]; then
  exit 3
fi
log "START soak_id=${SOAK_ID} pid=$$ expected_completion=${EXPECTED_COMPLETION_AT}"

# Loop bookkeeping: number of cycles started, and whether any cycle failed.
error_seen=0
cycle_index=0
# Soak loop. Each pass runs discovery, then the collector, then the rclone
# upload, and appends one JSON record describing the pass to CYCLES_JSONL.
# Step exit codes recorded for skipped steps: 98 = a stop had been requested,
# 99 = the preceding step did not succeed.
while true; do
  now_epoch="$(date -u +%s)"
  remaining="$((END_EPOCH - now_epoch))"
  if [[ "${remaining}" -le 0 ]]; then
    break
  fi
  if [[ "${STOP_REQUESTED}" -eq 1 ]]; then
    break
  fi
  # A remaining window shorter than 30 s is skipped rather than started.
  if [[ "${remaining}" -lt 30 ]]; then
    log "SKIP final tiny remaining window seconds=${remaining}"
    break
  fi

  cycle_index="$((cycle_index + 1))"
  cycle_id="$(date -u +%Y%m%dT%H%M%SZ)"
  CURRENT_CYCLE_ID="${cycle_id}"
  # The last cycle is shortened so the collector does not outlive the window.
  run_seconds="${CYCLE_SECONDS}"
  if [[ "${remaining}" -lt "${run_seconds}" ]]; then
    run_seconds="${remaining}"
  fi
  discovery_json="${DISCOVERY_DIR}/polymarket_btc_markets_${cycle_id}.json"
  discovery_manifest="${DISCOVERY_DIR}/polymarket_btc_markets_manifest_${cycle_id}.json"
  discovery_markdown="${DISCOVERY_DIR}/polymarket_btc_markets_${cycle_id}.md"
  collector_manifest="${MANIFEST_ROOT}/collector_${cycle_id}.json"
  upload_manifest="${MANIFEST_ROOT}/upload_${cycle_id}.json"
  cycle_started_at="$(date -u +%Y-%m-%dT%H:%M:%SZ)"
  log "CYCLE ${cycle_index} start id=${cycle_id} run_seconds=${run_seconds}"

  # --- Step 1: market discovery ---------------------------------------------
  discovery_exit=0
  CURRENT_PHASE="discovery"
  run_logged "${PYTHON_BIN}" scripts/discover_polymarket_btc_markets.py \
    --output-json "${discovery_json}" \
    --manifest "${discovery_manifest}" \
    --markdown "${discovery_markdown}" \
    --limit "${DISCOVERY_LIMIT}" \
    --max-pages "${DISCOVERY_MAX_PAGES}" \
    --timeout "${DISCOVERY_TIMEOUT}" || discovery_exit=$?

  # --- Step 2: order-book collection ----------------------------------------
  collector_exit=0
  if [[ "${STOP_REQUESTED}" -eq 1 ]]; then
    collector_exit=98
  elif [[ "${discovery_exit}" -eq 0 ]]; then
    CURRENT_PHASE="collector"
    run_logged "${PYTHON_BIN}" scripts/collect_polymarket_orderbooks.py \
      --config config/polymarket_collector.vps.example.yaml \
      --discovery-path "${discovery_json}" \
      --output-dir "${LIVE_DIR}" \
      --manifest-path "${collector_manifest}" \
      --market-limit "${MARKET_LIMIT}" \
      --interval-seconds "${INTERVAL_SECONDS}" \
      --duration-seconds "${run_seconds}" \
      --request-timeout-seconds "${REQUEST_TIMEOUT_SECONDS}" \
      --max-retries "${MAX_RETRIES}" \
      --backoff-seconds "${BACKOFF_SECONDS}" \
      --market-end-safety-seconds "${MARKET_END_SAFETY_SECONDS}" || collector_exit=$?
  else
    collector_exit=99
  fi

  # --- Step 3: upload -------------------------------------------------------
  upload_exit=0
  if [[ "${STOP_REQUESTED}" -eq 1 ]]; then
    upload_exit=98
  elif [[ "${collector_exit}" -eq 0 ]]; then
    CURRENT_PHASE="upload"
    run_logged scripts/upload_archive_rclone.sh \
      --execute \
      --data-dir "${LOCAL_ROOT}" \
      --raw-dir "${LIVE_DIR}" \
      --source-manifest-dir "${MANIFEST_ROOT}" \
      --manifest-dir "${MANIFEST_ROOT}" \
      --manifest-path "${upload_manifest}" \
      --dest "${REMOTE_DEST}" \
      --min-age-seconds 0 \
      --rclone-bin "${RCLONE_BIN}" || upload_exit=$?
  else
    upload_exit=99
  fi

  # --- Cycle verdict and record ---------------------------------------------
  cycle_ended_at="$(date -u +%Y-%m-%dT%H:%M:%SZ)"
  if [[ "${STOP_REQUESTED}" -eq 1 ]]; then
    cycle_status="INTERRUPTED"
  elif [[ "${discovery_exit}" -eq 0 && "${collector_exit}" -eq 0 && "${upload_exit}" -eq 0 ]]; then
    cycle_status="OK"
  else
    cycle_status="ERROR"
    error_seen=1
  fi

  # Values are handed to Python as arguments and the program is a quoted
  # heredoc, so a quote or backslash in a path cannot corrupt the record.
  record_exit=0
  record="$("${PYTHON_BIN}" - \
    "${cycle_index}" "${cycle_id}" "${cycle_started_at}" "${cycle_ended_at}" \
    "${run_seconds}" "${discovery_manifest}" "${collector_manifest}" \
    "${upload_manifest}" "${discovery_exit}" "${collector_exit}" \
    "${upload_exit}" "${cycle_status}" "${STOP_SIGNAL}" <<'PY'
import json
import sys

(
    cycle_index, cycle_id, started_at, ended_at, run_seconds,
    discovery_manifest, collector_manifest, upload_manifest,
    discovery_exit, collector_exit, upload_exit, status, stop_signal,
) = sys.argv[1:]
print(json.dumps({
    "cycle_index": int(cycle_index),
    "cycle_id": cycle_id,
    "started_at_utc": started_at,
    "ended_at_utc": ended_at,
    "run_seconds": int(run_seconds),
    "discovery_manifest": discovery_manifest,
    "collector_manifest": collector_manifest,
    "upload_manifest": upload_manifest,
    "discovery_exit": int(discovery_exit),
    "collector_exit": int(collector_exit),
    "upload_exit": int(upload_exit),
    "status": status,
    "stop_signal": stop_signal,
}, sort_keys=True))
PY
  )" || record_exit=$?

  if [[ "${record_exit}" -eq 0 && -n "${record}" ]]; then
    write_cycle_record "${record}"
  else
    # Never append a blank line in place of a record: flag the run instead so
    # the lost cycle cannot pass unnoticed in the final marker.
    error_seen=1
    log "ERROR cycle record generation failed id=${cycle_id} exit=${record_exit}"
  fi
  log "CYCLE ${cycle_index} end id=${cycle_id} status=${cycle_status} discovery_exit=${discovery_exit} collector_exit=${collector_exit} upload_exit=${upload_exit}"

  CURRENT_PHASE="sleep"
  CURRENT_CYCLE_ID=""
  if [[ "${STOP_REQUESTED}" -eq 1 ]]; then
    break
  fi
  # Pause between cycles as a tracked background child so a signal ends the
  # pause right away instead of after the full five seconds.
  sleep 5 &
  CURRENT_CHILD_PID="$!"
  wait "${CURRENT_CHILD_PID}" || true
  CURRENT_CHILD_PID=""
done
# Finalization: choose the final marker's status, gate status and exit
# reason. A stop request wins over a cycle error; a clean run still needs
# review.
CURRENT_CYCLE_ID=""
CURRENT_PHASE="finalizing"

final_marker_args=("COMPLETED_NEEDS_REVIEW" "NEEDS_REVIEW" "elapsed")
if [[ "${STOP_REQUESTED}" -eq 1 ]]; then
  final_marker_args=("INTERRUPTED" "INTERRUPTED" "${STOP_SIGNAL:-signal}")
elif [[ "${error_seen}" -eq 1 ]]; then
  final_marker_args=("ERROR" "ERROR" "cycle_error")
fi
write_final_manifest "${final_marker_args[@]}"

log "END soak_id=${SOAK_ID} final_manifest=${FINAL_MANIFEST} status_written=1"