#!/usr/bin/env bash
#
# Purges local files that have verified-upload evidence in the verified-upload
# index and whose mtime is older than the retention window.
#
# The shell part only resolves configuration (environment variables, then
# command-line options) and exports it; the embedded Python program does the
# planning/deletion and writes a JSON purge manifest plus a JSON summary on
# stdout.
#
# Environment:
#   ORDERBOOKS_UPLOAD_DATA_DIR / ORDERBOOKS_DATA_DIR  base data directory
#   ORDERBOOKS_UPLOAD_MANIFEST_DIR                    manifest output directory
#   ORDERBOOKS_PURGE_MANIFEST_PATH                    exact manifest path
#   ORDERBOOKS_UPLOAD_VERIFIED_INDEX_PATH             verified-upload index path
#   ORDERBOOKS_UPLOAD_RETENTION_DAYS                  retention window in days
#
# Exit status: 0 on success, 1 when the gate fails or setup fails, 2 on a
# usage error.

# NOTE: -e is deliberately not enabled; failures that matter are checked
# explicitly below.
set -uo pipefail

SCRIPT_NAME="orderbooks_verified_file_purger"
SCRIPT_VERSION="0.1.0"

MODE="dry-run"
DATA_DIR="${ORDERBOOKS_UPLOAD_DATA_DIR:-${ORDERBOOKS_DATA_DIR:-/var/lib/orderbooks}}"
MANIFEST_DIR="${ORDERBOOKS_UPLOAD_MANIFEST_DIR:-}"
MANIFEST_PATH="${ORDERBOOKS_PURGE_MANIFEST_PATH:-}"
VERIFIED_INDEX_PATH="${ORDERBOOKS_UPLOAD_VERIFIED_INDEX_PATH:-}"
RETENTION_DAYS="${ORDERBOOKS_UPLOAD_RETENTION_DAYS:-7}"

# Prints the command-line help text to stdout.
usage() {
  cat <<'EOF'
Usage:
  scripts/purge_uploaded_local_files.sh [options]

Deletes local files only when they have prior verified-upload evidence in the
verified-upload index and are older than the retention window.

Options:
  --dry-run                    Plan purge only (default).
  --execute                    Delete eligible local files.
  --data-dir DIR               Base data directory. Default: /var/lib/orderbooks.
  --manifest-dir DIR           Purge manifest output directory. Default: DATA_DIR/manifests.
  --manifest-path PATH         Exact purge manifest path.
  --verified-index-path PATH   Verified-upload index path. Default: MANIFEST_DIR/upload_verified_index.json.
  --retention-days N           Keep at least N days locally. Default: 7.
  --help                       Show this help.
EOF
}

# Prints a message and the usage text to stderr, then exits with status 2.
# Arguments: $1 - message
usage_error() {
  printf '%s\n' "$1" >&2
  usage >&2
  exit 2
}

# Aborts with a usage error when a value-taking option has no value.
# Arguments: $1 - number of remaining arguments, $2 - option name
require_value() {
  if [[ "$1" -lt 2 ]]; then
    usage_error "Missing value for $2"
  fi
}

while [[ $# -gt 0 ]]; do
  case "$1" in
    --dry-run)
      MODE="dry-run"
      shift
      ;;
    --execute)
      MODE="execute"
      shift
      ;;
    --data-dir)
      require_value "$#" "$1"
      DATA_DIR="$2"
      shift 2
      ;;
    --manifest-dir)
      require_value "$#" "$1"
      MANIFEST_DIR="$2"
      shift 2
      ;;
    --manifest-path)
      require_value "$#" "$1"
      MANIFEST_PATH="$2"
      shift 2
      ;;
    --verified-index-path)
      require_value "$#" "$1"
      VERIFIED_INDEX_PATH="$2"
      shift 2
      ;;
    --retention-days)
      require_value "$#" "$1"
      RETENTION_DAYS="$2"
      shift 2
      ;;
    --help)
      usage
      exit 0
      ;;
    *)
      echo "Unknown argument: $1" >&2
      usage >&2
      exit 2
      ;;
  esac
done

# An empty data dir would make the Python side resolve to the current
# working directory, which is never what a purge should operate on.
if [[ -z "${DATA_DIR}" ]]; then
  usage_error "Data directory must not be empty"
fi

# A negative window would move the cutoff into the future and make every
# verified file eligible; a non-numeric one would crash the Python side.
if [[ ! "${RETENTION_DAYS}" =~ ^[0-9]+$ ]]; then
  usage_error "Invalid retention days (expected a non-negative integer): ${RETENTION_DAYS}"
fi

if [[ -z "${MANIFEST_DIR}" ]]; then
  MANIFEST_DIR="${DATA_DIR%/}/manifests"
fi
if [[ -z "${VERIFIED_INDEX_PATH}" ]]; then
  VERIFIED_INDEX_PATH="${MANIFEST_DIR%/}/upload_verified_index.json"
fi

STARTED_AT="$(date -u +%Y-%m-%dT%H:%M:%SZ)"
RUN_ID="$(date -u +%Y%m%dT%H%M%SZ)"
if [[ -z "${MANIFEST_PATH}" ]]; then
  MANIFEST_PATH="${MANIFEST_DIR%/}/purge_uploaded_local_${RUN_ID}.json"
fi

if ! mkdir -p -- "$(dirname -- "${MANIFEST_PATH}")"; then
  printf 'Failed to create manifest directory for: %s\n' "${MANIFEST_PATH}" >&2
  exit 1
fi

export SCRIPT_NAME SCRIPT_VERSION MODE DATA_DIR MANIFEST_DIR MANIFEST_PATH \
  VERIFIED_INDEX_PATH RETENTION_DAYS STARTED_AT

# The Python program is the last command, so its exit status is the script's.
python3 - <<'PY'
import datetime as dt
import hashlib
import json
import os
import sys
import tempfile
from pathlib import Path

# dt.timezone.utc is equivalent to dt.UTC but also exists before Python 3.11.
UTC = dt.timezone.utc

script_name = os.environ["SCRIPT_NAME"]
script_version = os.environ["SCRIPT_VERSION"]
mode = os.environ["MODE"]
data_dir = Path(os.environ["DATA_DIR"]).resolve()
manifest_dir = Path(os.environ["MANIFEST_DIR"]).resolve()
manifest_path = Path(os.environ["MANIFEST_PATH"]).resolve()
verified_index_path = Path(os.environ["VERIFIED_INDEX_PATH"]).resolve()
retention_days = int(os.environ["RETENTION_DAYS"])
started_at = os.environ["STARTED_AT"]


def iso_z(moment: dt.datetime) -> str:
    """Format an aware UTC datetime as second-precision ISO-8601 with a Z suffix."""
    return moment.replace(microsecond=0).isoformat().replace("+00:00", "Z")


def iso_z_from_ts(ts: float) -> str:
    """Format a POSIX timestamp as second-precision ISO-8601 UTC with a Z suffix."""
    return iso_z(dt.datetime.fromtimestamp(ts, UTC))


def sha256_file(path: Path) -> str:
    """Return the hex SHA-256 of a file, read in 1 MiB chunks."""
    digest = hashlib.sha256()
    with path.open("rb") as handle:
        for chunk in iter(lambda: handle.read(1024 * 1024), b""):
            digest.update(chunk)
    return digest.hexdigest()


def write_atomic_json(path: Path, payload: dict) -> None:
    """Write payload as JSON via a temp file in the target directory plus os.replace.

    The temp file is removed again if serialisation or the rename fails, so a
    failed write does not leave stray files next to the target.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    tmp_path = None
    try:
        with tempfile.NamedTemporaryFile(
            "w", encoding="utf-8", dir=str(path.parent), delete=False
        ) as tmp:
            tmp_path = Path(tmp.name)
            json.dump(payload, tmp, indent=2, sort_keys=True)
            tmp.write("\n")
        os.replace(tmp_path, path)
    except BaseException:
        if tmp_path is not None:
            try:
                tmp_path.unlink()
            except OSError:
                pass
        raise


def is_within(path: Path, root: Path) -> bool:
    """Return True when path equals root or lies below it (pure path comparison)."""
    try:
        path.relative_to(root)
    except ValueError:
        return False
    return True


now = dt.datetime.now(UTC)
# One timestamp per run, used for deletion markers and the index update stamp.
run_stamp = iso_z(now)
try:
    cutoff = now - dt.timedelta(days=retention_days)
except OverflowError:
    # Window larger than the representable date range: nothing is old enough.
    cutoff = dt.datetime.min.replace(tzinfo=UTC)

manifest = {
    "schema_name": "purge_uploaded_local_manifest",
    "schema_version": 1,
    "purger": {"name": script_name, "version": script_version},
    "started_at_utc": started_at,
    # Refreshed in finish() so it reflects when the run actually ended.
    "ended_at_utc": run_stamp,
    "command_mode": mode,
    "operation_status": None,
    "gate_status": "PASS",
    "config": {
        "data_dir": str(data_dir),
        "manifest_dir": str(manifest_dir),
        "manifest_path": str(manifest_path),
        "verified_index_path": str(verified_index_path),
        "retention_days": retention_days,
    },
    "verified_index": {
        "path": str(verified_index_path),
        "exists": verified_index_path.exists(),
        "record_count_before": 0,
        "record_count_after": 0,
    },
    "candidate_files": [],
    "deleted_local_files": [],
    "skipped_files": [],
    "counts": {
        "eligible": 0,
        "deleted": 0,
        "within_retention": 0,
        "already_absent": 0,
        "protected": 0,
        "sha256_mismatch": 0,
        "invalid_records": 0,
        "io_errors": 0,
    },
    "warnings": [],
    "known_gaps": [
        "Purge trusts prior verified-upload evidence in the local verified index and does not re-run rclone copy/check during deletion.",
        "Protected local state files, including the verified-upload index itself, are not deleted by this script.",
    ],
}
counts = manifest["counts"]


def finish() -> None:
    """Write the manifest, print the JSON summary, and exit (1 unless the gate passed)."""
    manifest["ended_at_utc"] = iso_z(dt.datetime.now(UTC))
    write_atomic_json(manifest_path, manifest)
    print(json.dumps({
        "gate_status": manifest["gate_status"],
        "operation_status": manifest["operation_status"],
        "manifest_path": str(manifest_path),
        "eligible_files": counts["eligible"],
        "deleted_files": counts["deleted"],
    }, indent=2, sort_keys=True))
    sys.exit(0 if manifest["gate_status"] == "PASS" else 1)


def note_invalid(relative_path) -> None:
    """Record an index record that cannot be mapped to a local file."""
    counts["invalid_records"] += 1
    manifest["skipped_files"].append({
        "relative_path": relative_path,
        "reason": "invalid_index_record",
    })


def note_io_failure(relative_path: str, local_path: Path, reason: str, exc: OSError) -> None:
    """Record a per-file I/O failure and fail the gate without aborting the run."""
    counts["io_errors"] += 1
    manifest["gate_status"] = "FAIL"
    manifest["skipped_files"].append({
        "relative_path": relative_path,
        "local_path": str(local_path),
        "reason": reason,
        "error": str(exc),
    })
    manifest["warnings"].append(f"{reason} for {local_path}: {exc}")


def fail_index(message: str) -> None:
    """Fail the run because the verified-upload index is unusable."""
    manifest["operation_status"] = "INDEX_READ_FAILED"
    manifest["gate_status"] = "FAIL"
    manifest["warnings"].append(message)
    finish()


if not verified_index_path.exists():
    manifest["operation_status"] = "NO_VERIFIED_INDEX"
    finish()

try:
    index_doc = json.loads(verified_index_path.read_text(encoding="utf-8"))
except Exception as exc:
    fail_index(f"failed to read verified-upload index: {exc}")

if not isinstance(index_doc, dict):
    fail_index("failed to read verified-upload index: top-level value is not an object")
records = index_doc.get("records", [])
if not isinstance(records, list):
    fail_index("failed to read verified-upload index: 'records' is not a list")

manifest["verified_index"]["record_count_before"] = len(records)
# State files written by this run must never be purge targets.
protected_paths = {verified_index_path, manifest_path}
index_changed = False

for record in records:
    if not isinstance(record, dict):
        note_invalid(None)
        continue
    relative_path = record.get("relative_path")
    sha256_expected = record.get("sha256")
    if not isinstance(relative_path, str) or not relative_path or not sha256_expected:
        note_invalid(relative_path)
        continue

    try:
        joined_path = data_dir / relative_path
        local_path = joined_path.resolve()
    except (OSError, RuntimeError, ValueError):
        # e.g. embedded NUL byte or a symlink loop: not a usable path.
        note_invalid(relative_path)
        continue

    if local_path in protected_paths:
        counts["protected"] += 1
        continue

    # Refuse targets that escape the data directory (absolute paths or ".."
    # components in the index). The lexical form is accepted as well so that
    # data directories containing symlinked subdirectories keep working.
    lexical_path = Path(os.path.normpath(str(joined_path)))
    if not (is_within(local_path, data_dir) or is_within(lexical_path, data_dir)):
        counts["protected"] += 1
        manifest["skipped_files"].append({
            "relative_path": relative_path,
            "local_path": str(local_path),
            "reason": "outside_data_dir",
        })
        continue

    if not local_path.exists():
        counts["already_absent"] += 1
        if mode == "execute" and record.get("local_deleted_at_utc") is None:
            record["local_deleted_at_utc"] = run_stamp
            index_changed = True
        continue

    try:
        stat = local_path.stat()
    except OSError as exc:
        note_io_failure(relative_path, local_path, "stat_failed", exc)
        continue

    mtime = dt.datetime.fromtimestamp(stat.st_mtime, UTC)
    if mtime >= cutoff:
        counts["within_retention"] += 1
        continue

    try:
        sha256_actual = sha256_file(local_path)
    except OSError as exc:
        # Covers unreadable files and non-file entries such as directories.
        note_io_failure(relative_path, local_path, "hash_failed", exc)
        continue

    if sha256_actual != sha256_expected:
        counts["sha256_mismatch"] += 1
        manifest["skipped_files"].append({
            "relative_path": relative_path,
            "local_path": str(local_path),
            "kind": record.get("kind"),
            "reason": "sha256_mismatch",
            "expected_sha256": sha256_expected,
            "actual_sha256": sha256_actual,
        })
        continue

    candidate = {
        "relative_path": relative_path,
        "local_path": str(local_path),
        "kind": record.get("kind"),
        "bytes": stat.st_size,
        "mtime_utc": iso_z_from_ts(stat.st_mtime),
        "sha256": sha256_actual,
        "first_verified_at_utc": record.get("first_verified_at_utc"),
        "last_verified_at_utc": record.get("last_verified_at_utc"),
        "last_verified_by_manifest": record.get("last_verified_by_manifest"),
    }
    manifest["candidate_files"].append(candidate)
    counts["eligible"] += 1

    if mode == "execute":
        try:
            local_path.unlink()
        except OSError as exc:
            # Keep going so earlier deletions still reach the index and manifest.
            note_io_failure(relative_path, local_path, "delete_failed", exc)
            continue
        record["local_deleted_at_utc"] = run_stamp
        index_changed = True
        manifest["deleted_local_files"].append({**candidate, "deleted_at_utc": run_stamp})

counts["deleted"] = len(manifest["deleted_local_files"])
manifest["verified_index"]["record_count_after"] = len(records)

if mode == "execute" and index_changed:
    index_doc["updated_at_utc"] = run_stamp
    try:
        write_atomic_json(verified_index_path, index_doc)
    except OSError as exc:
        # Still write the manifest: it is the only record of what was deleted.
        manifest["operation_status"] = "INDEX_WRITE_FAILED"
        manifest["gate_status"] = "FAIL"
        manifest["warnings"].append(f"failed to update verified-upload index: {exc}")

if manifest["operation_status"] is None:
    if counts["io_errors"] > 0:
        manifest["operation_status"] = "COMPLETED_WITH_ERRORS"
    elif counts["eligible"] == 0:
        manifest["operation_status"] = "NO_ELIGIBLE_FILES"
    elif mode == "dry-run":
        manifest["operation_status"] = "DRY_RUN_PASS"
    else:
        manifest["operation_status"] = "PURGE_PASS"

finish()
PY