orderbooks/scripts/purge_uploaded_local_files.sh
2026-05-02 17:44:33 +02:00

299 lines
9.4 KiB
Bash
Executable file

#!/usr/bin/env bash
#
# Purge local files that already have verified-upload evidence.
#
# The shell part only resolves configuration (environment variables first,
# then command-line options); planning and deletion are done by the embedded
# Python program further down.
#
# Environment overrides (each one can also be set by a command-line option):
#   ORDERBOOKS_UPLOAD_DATA_DIR / ORDERBOOKS_DATA_DIR   base data directory
#   ORDERBOOKS_UPLOAD_MANIFEST_DIR                     purge manifest directory
#   ORDERBOOKS_PURGE_MANIFEST_PATH                     exact purge manifest path
#   ORDERBOOKS_UPLOAD_VERIFIED_INDEX_PATH              verified-upload index path
#   ORDERBOOKS_UPLOAD_RETENTION_DAYS                   local retention in days

# -e: abort on an unexpected failure in the setup commands instead of carrying
# on into the purge program; -u: unset variables are errors; pipefail: a
# pipeline fails when any stage fails.
set -euo pipefail

readonly SCRIPT_NAME="orderbooks_verified_file_purger"
readonly SCRIPT_VERSION="0.1.0"

# Deleting is opt-in: without --execute the run only plans.
MODE="dry-run"

# Empty values below mean "derive a default later, once options are parsed".
DATA_DIR="${ORDERBOOKS_UPLOAD_DATA_DIR:-${ORDERBOOKS_DATA_DIR:-/var/lib/orderbooks}}"
MANIFEST_DIR="${ORDERBOOKS_UPLOAD_MANIFEST_DIR:-}"
MANIFEST_PATH="${ORDERBOOKS_PURGE_MANIFEST_PATH:-}"
VERIFIED_INDEX_PATH="${ORDERBOOKS_UPLOAD_VERIFIED_INDEX_PATH:-}"
RETENTION_DAYS="${ORDERBOOKS_UPLOAD_RETENTION_DAYS:-7}"
#######################################
# Print the command-line help text.
# Arguments: none
# Outputs:   the help text on stdout (callers redirect it to stderr on errors)
#######################################
usage() {
  # One argument per output line; the format is reused for every argument.
  printf '%s\n' \
    'Usage: scripts/purge_uploaded_local_files.sh [options]' \
    'Deletes local files only when they have prior verified-upload evidence in the' \
    'verified-upload index and are older than the retention window.' \
    'Options:' \
    '--dry-run Plan purge only (default).' \
    '--execute Delete eligible local files.' \
    '--data-dir DIR Base data directory. Default: /var/lib/orderbooks.' \
    '--manifest-dir DIR Purge manifest output directory. Default: DATA_DIR/manifests.' \
    '--manifest-path PATH Exact purge manifest path.' \
    '--verified-index-path PATH Verified-upload index path. Default: MANIFEST_DIR/upload_verified_index.json.' \
    '--retention-days N Keep at least N days locally. Default: 7.' \
    '--help Show this help.'
}
#######################################
# Parse command-line options into the configuration globals.
# Globals:   MODE, DATA_DIR, MANIFEST_DIR, MANIFEST_PATH, VERIFIED_INDEX_PATH,
#            RETENTION_DAYS (written)
# Arguments: the script's command-line arguments
# Outputs:   help text on stdout for --help; diagnostics and help on stderr
#            for a bad invocation
# Returns:   exits 0 after --help, exits 2 on an unknown argument or on an
#            option that is missing its value
#######################################
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --dry-run)
        MODE="dry-run"
        shift
        ;;
      --execute)
        MODE="execute"
        shift
        ;;
      --data-dir | --manifest-dir | --manifest-path | --verified-index-path | --retention-days)
        # Check explicitly: under `set -u` a bare "$2" would otherwise abort
        # with an "unbound variable" message that does not name the option.
        if [[ $# -lt 2 ]]; then
          echo "Missing value for option: $1" >&2
          usage >&2
          exit 2
        fi
        case "$1" in
          --data-dir) DATA_DIR="$2" ;;
          --manifest-dir) MANIFEST_DIR="$2" ;;
          --manifest-path) MANIFEST_PATH="$2" ;;
          --verified-index-path) VERIFIED_INDEX_PATH="$2" ;;
          --retention-days) RETENTION_DAYS="$2" ;;
        esac
        shift 2
        ;;
      --help)
        usage
        exit 0
        ;;
      *)
        echo "Unknown argument: $1" >&2
        usage >&2
        exit 2
        ;;
    esac
  done
}

parse_args "$@"
# Derive every path that was not given explicitly. Order matters: the index
# default hangs off MANIFEST_DIR, which itself may hang off DATA_DIR.
if [[ -z "${MANIFEST_DIR}" ]]; then
  MANIFEST_DIR="${DATA_DIR%/}/manifests"
fi
if [[ -z "${VERIFIED_INDEX_PATH}" ]]; then
  VERIFIED_INDEX_PATH="${MANIFEST_DIR%/}/upload_verified_index.json"
fi

# Read the clock once so the start time recorded in the manifest and the run
# id used in its file name can never straddle a second boundary.
STARTED_AT="$(date -u +%Y-%m-%dT%H:%M:%SZ)"
# 2026-05-02T17:44:33Z -> 20260502T174433Z
RUN_ID="${STARTED_AT//[-:]/}"
if [[ -z "${MANIFEST_PATH}" ]]; then
  MANIFEST_PATH="${MANIFEST_DIR%/}/purge_uploaded_local_${RUN_ID}.json"
fi

# Fail here with a clear message rather than later inside the purge program.
manifest_parent="$(dirname -- "${MANIFEST_PATH}")"
if ! mkdir -p -- "${manifest_parent}"; then
  echo "Cannot create manifest directory: ${manifest_parent}" >&2
  exit 1
fi

# The embedded Python program reads its whole configuration from these.
export SCRIPT_NAME SCRIPT_VERSION MODE DATA_DIR MANIFEST_DIR MANIFEST_PATH VERIFIED_INDEX_PATH RETENTION_DAYS STARTED_AT
python3 - <<'PY'
import datetime as dt
import hashlib
import json
import os
import sys
import tempfile
from pathlib import Path
# Configuration is handed over by the shell wrapper through exported
# environment variables; a missing variable is a wrapper bug and raises KeyError.
script_name = os.environ["SCRIPT_NAME"]
script_version = os.environ["SCRIPT_VERSION"]
mode = os.environ["MODE"]
data_dir = Path(os.environ["DATA_DIR"]).resolve()
manifest_dir = Path(os.environ["MANIFEST_DIR"]).resolve()
manifest_path = Path(os.environ["MANIFEST_PATH"]).resolve()
verified_index_path = Path(os.environ["VERIFIED_INDEX_PATH"]).resolve()
try:
    retention_days = int(os.environ["RETENTION_DAYS"])
except ValueError:
    print(
        f"retention days must be an integer, got {os.environ['RETENTION_DAYS']!r}",
        file=sys.stderr,
    )
    sys.exit(2)
if retention_days < 0:
    # A negative window would move the cutoff into the future and make every
    # verified file eligible for deletion regardless of its age.
    print(f"retention days must not be negative, got {retention_days}", file=sys.stderr)
    sys.exit(2)
started_at = os.environ["STARTED_AT"]
# dt.timezone.utc rather than dt.UTC: the alias only exists from Python 3.11.
now = dt.datetime.now(dt.timezone.utc)
ended_at = now.replace(microsecond=0).isoformat().replace("+00:00", "Z")
try:
    # Files whose mtime is at or after this instant are kept.
    cutoff = now - dt.timedelta(days=retention_days)
except OverflowError:
    print(f"retention days is too large, got {retention_days}", file=sys.stderr)
    sys.exit(2)
def iso_z_from_ts(ts: float) -> str:
    """Format a POSIX timestamp as a UTC ISO-8601 string with a ``Z`` suffix.

    Sub-second precision is dropped (truncated, not rounded).

    Args:
        ts: Seconds since the epoch.

    Returns:
        A string such as ``1970-01-01T00:00:00Z``.
    """
    # dt.timezone.utc rather than dt.UTC: the alias only exists from Python 3.11.
    moment = dt.datetime.fromtimestamp(ts, dt.timezone.utc)
    return moment.replace(microsecond=0).isoformat().replace("+00:00", "Z")
def sha256_file(path: Path) -> str:
    """Return the hexadecimal SHA-256 digest of the file at ``path``.

    The file is consumed in 1 MiB pieces, so it is never held in memory whole.
    Errors from opening or reading the file propagate to the caller.
    """
    hasher = hashlib.sha256()
    piece_size = 1024 * 1024
    with path.open("rb") as stream:
        while True:
            piece = stream.read(piece_size)
            if not piece:
                # An empty read marks end of file.
                break
            hasher.update(piece)
    return hasher.hexdigest()
def write_atomic_json(path: Path, payload: dict) -> None:
    """Write ``payload`` as pretty-printed JSON to ``path`` without exposing a partial file.

    The document is written to a temporary file in the destination directory
    (same filesystem, so the final rename is atomic), flushed to disk, and then
    moved over ``path``. If anything fails, the temporary file is removed and an
    existing ``path`` is left untouched.

    Args:
        path: Destination file; parent directories are created when missing.
        payload: JSON-serializable mapping; keys are written sorted.

    Raises:
        OSError: If the directory or file cannot be created, written or renamed.
        TypeError: If ``payload`` holds values that JSON cannot represent.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    # Temporary files are created owner-only; remember the mode of a file that
    # is being replaced so rewriting it does not silently narrow its permissions.
    try:
        previous_mode = os.stat(path).st_mode & 0o7777
    except OSError:
        previous_mode = None
    tmp_path = None
    try:
        with tempfile.NamedTemporaryFile("w", encoding="utf-8", dir=str(path.parent), delete=False) as tmp:
            tmp_path = Path(tmp.name)
            json.dump(payload, tmp, indent=2, sort_keys=True)
            tmp.write("\n")
            # Get the bytes onto disk before the rename makes them visible.
            tmp.flush()
            os.fsync(tmp.fileno())
        if previous_mode is not None:
            os.chmod(tmp_path, previous_mode)
        os.replace(tmp_path, path)
    except BaseException:
        if tmp_path is not None:
            try:
                tmp_path.unlink()
            except OSError:
                # Best-effort cleanup; the original error is the one to report.
                pass
        raise
# Skeleton of the purge manifest. It is filled in while the index is processed
# and written out once, whichever way the run ends.

# Effective configuration, with every path already resolved.
manifest_config_section = {
    "data_dir": str(data_dir),
    "manifest_dir": str(manifest_dir),
    "manifest_path": str(manifest_path),
    "verified_index_path": str(verified_index_path),
    "retention_days": retention_days,
}

# State of the verified-upload index; the record counts are updated later.
manifest_index_section = {
    "path": str(verified_index_path),
    "exists": verified_index_path.exists(),
    "record_count_before": 0,
    "record_count_after": 0,
}

# Every per-outcome counter starts at zero.
manifest_counter_section = dict.fromkeys(
    (
        "eligible",
        "deleted",
        "within_retention",
        "already_absent",
        "protected",
        "sha256_mismatch",
        "invalid_records",
    ),
    0,
)

manifest = {
    "schema_name": "purge_uploaded_local_manifest",
    "schema_version": 1,
    "purger": {"name": script_name, "version": script_version},
    "started_at_utc": started_at,
    "ended_at_utc": ended_at,
    "command_mode": mode,
    # None means "not decided yet"; a final status is assigned before writing.
    "operation_status": None,
    "gate_status": "PASS",
    "config": manifest_config_section,
    "verified_index": manifest_index_section,
    "candidate_files": [],
    "deleted_local_files": [],
    "skipped_files": [],
    "counts": manifest_counter_section,
    "warnings": [],
    "known_gaps": [
        "Purge trusts prior verified-upload evidence in the local verified index and does not re-run rclone copy/check during deletion.",
        "Protected local state files, including the verified-upload index itself, are not deleted by this script.",
    ],
}
# No index file means there is no upload evidence to act on: record that in
# the manifest, print the run summary and finish successfully without
# touching any data file.
index_is_missing = not verified_index_path.exists()
if index_is_missing:
    manifest["operation_status"] = "NO_VERIFIED_INDEX"
    write_atomic_json(manifest_path, manifest)
    missing_index_summary = {
        "gate_status": manifest["gate_status"],
        "operation_status": manifest["operation_status"],
        "manifest_path": str(manifest_path),
        "eligible_files": 0,
        "deleted_files": 0,
    }
    print(json.dumps(missing_index_summary, indent=2, sort_keys=True))
    sys.exit(0)
# Load the verified-upload index. Anything that prevents using it (unreadable
# file, invalid JSON, unexpected document shape) fails the gate: the manifest
# records the reason, the summary is printed and the run exits with status 1.
try:
    index_doc = json.loads(verified_index_path.read_text(encoding="utf-8"))
    # Check the shape here so that a well-formed but wrong document is
    # reported as an index failure instead of crashing further down.
    if not isinstance(index_doc, dict):
        raise ValueError("top-level JSON value is not an object")
    if not isinstance(index_doc.get("records", []), list):
        raise ValueError('"records" is not a list')
except Exception as exc:
    manifest["operation_status"] = "INDEX_READ_FAILED"
    manifest["gate_status"] = "FAIL"
    manifest["warnings"].append(f"failed to read verified-upload index: {exc}")
    write_atomic_json(manifest_path, manifest)
    print(json.dumps({
        "gate_status": manifest["gate_status"],
        "operation_status": manifest["operation_status"],
        "manifest_path": str(manifest_path),
        "eligible_files": 0,
        "deleted_files": 0,
    }, indent=2, sort_keys=True))
    sys.exit(1)
# Walk the verified-upload index and decide, record by record, whether the
# local copy may be deleted. A file is eligible only when it lies under the
# data directory, is a regular file, is older than the retention cutoff and
# still has the SHA-256 recorded at verification time. In execute mode
# eligible files are deleted and the index record is stamped.
#
# A problem with one file never aborts the run: it is recorded in the manifest
# and the loop moves on, so deletions that already happened still reach the
# index and the manifest.
records = index_doc.get("records", [])
manifest["verified_index"]["record_count_before"] = len(records)
protected_path = verified_index_path.resolve()
index_changed = False
# One timestamp is shared by every record touched in this run.
deleted_at = ended_at

for record in records:
    record_is_mapping = isinstance(record, dict)
    relative_path = record.get("relative_path") if record_is_mapping else None
    sha256_expected = record.get("sha256") if record_is_mapping else None
    if not isinstance(relative_path, str) or not relative_path or not sha256_expected:
        manifest["counts"]["invalid_records"] += 1
        manifest["skipped_files"].append({
            "relative_path": relative_path,
            "reason": "invalid_index_record",
        })
        continue

    try:
        local_path = (data_dir / relative_path).resolve()
        if local_path == protected_path:
            manifest["counts"]["protected"] += 1
            continue

        # Refuse records that point outside the data directory (absolute path
        # or ".." components). The check is lexical on purpose, so symlinked
        # subdirectories inside the data directory keep working.
        try:
            Path(os.path.normpath(data_dir / relative_path)).relative_to(data_dir)
        except ValueError:
            manifest["counts"]["invalid_records"] += 1
            manifest["skipped_files"].append({
                "relative_path": relative_path,
                "local_path": str(local_path),
                "reason": "path_outside_data_dir",
            })
            continue

        if not local_path.exists():
            manifest["counts"]["already_absent"] += 1
            if mode == "execute" and record.get("local_deleted_at_utc") is None:
                record["local_deleted_at_utc"] = deleted_at
                index_changed = True
            continue

        if not local_path.is_file():
            # Directories and other non-regular entries are never purge targets.
            manifest["counts"]["invalid_records"] += 1
            manifest["skipped_files"].append({
                "relative_path": relative_path,
                "local_path": str(local_path),
                "reason": "not_a_regular_file",
            })
            continue

        file_stat = local_path.stat()
        mtime = dt.datetime.fromtimestamp(file_stat.st_mtime, dt.timezone.utc)
        if mtime >= cutoff:
            manifest["counts"]["within_retention"] += 1
            continue

        # Hash only files that are old enough; hashing is the expensive step.
        sha256_actual = sha256_file(local_path)
        if sha256_actual != sha256_expected:
            # The local content is no longer what was verified as uploaded.
            manifest["counts"]["sha256_mismatch"] += 1
            manifest["skipped_files"].append({
                "relative_path": relative_path,
                "local_path": str(local_path),
                "kind": record.get("kind"),
                "reason": "sha256_mismatch",
                "expected_sha256": sha256_expected,
                "actual_sha256": sha256_actual,
            })
            continue

        candidate = {
            "relative_path": relative_path,
            "local_path": str(local_path),
            "kind": record.get("kind"),
            "bytes": file_stat.st_size,
            "mtime_utc": iso_z_from_ts(file_stat.st_mtime),
            "sha256": sha256_actual,
            "first_verified_at_utc": record.get("first_verified_at_utc"),
            "last_verified_at_utc": record.get("last_verified_at_utc"),
            "last_verified_by_manifest": record.get("last_verified_by_manifest"),
        }
        manifest["candidate_files"].append(candidate)
        manifest["counts"]["eligible"] += 1
        if mode == "execute":
            local_path.unlink()
            record["local_deleted_at_utc"] = deleted_at
            index_changed = True
            manifest["deleted_local_files"].append({**candidate, "deleted_at_utc": deleted_at})
    except OSError as exc:
        # Resolving, stat, hashing or unlink failed for this file. Fail the
        # gate but keep going.
        manifest["counts"]["file_errors"] = manifest["counts"].get("file_errors", 0) + 1
        manifest["skipped_files"].append({
            "relative_path": relative_path,
            "local_path": str(data_dir / relative_path),
            "kind": record.get("kind"),
            "reason": "file_error",
            "error": str(exc),
        })
        manifest["warnings"].append(f"failed to process {relative_path}: {exc}")
        manifest["gate_status"] = "FAIL"
        manifest["operation_status"] = "FILE_ERRORS"
# Wrap up: persist index changes, settle the final status, write the manifest
# and print the run summary. The process exits with status 1 when the gate
# did not pass.
manifest["counts"]["deleted"] = len(manifest["deleted_local_files"])
manifest["verified_index"]["record_count_after"] = len(records)

# Stamp completion now; the value placed in the manifest skeleton was taken
# before any file was examined.
finished_at = (
    dt.datetime.now(dt.timezone.utc).replace(microsecond=0).isoformat().replace("+00:00", "Z")
)
manifest["ended_at_utc"] = finished_at

if mode == "execute" and index_changed:
    index_doc["updated_at_utc"] = finished_at
    try:
        write_atomic_json(verified_index_path, index_doc)
    except OSError as exc:
        # Files may already be gone at this point, so the manifest must still
        # be written: it is then the only record of what was deleted.
        manifest["warnings"].append(f"failed to update verified-upload index: {exc}")
        manifest["gate_status"] = "FAIL"
        manifest["operation_status"] = "INDEX_WRITE_FAILED"

if manifest["operation_status"] is None:
    if manifest["counts"]["eligible"] == 0:
        manifest["operation_status"] = "NO_ELIGIBLE_FILES"
    elif mode == "dry-run":
        manifest["operation_status"] = "DRY_RUN_PASS"
    else:
        manifest["operation_status"] = "PURGE_PASS"

write_atomic_json(manifest_path, manifest)
print(json.dumps({
    "gate_status": manifest["gate_status"],
    "operation_status": manifest["operation_status"],
    "manifest_path": str(manifest_path),
    "eligible_files": manifest["counts"]["eligible"],
    "deleted_files": manifest["counts"]["deleted"],
}, indent=2, sort_keys=True))
if manifest["gate_status"] != "PASS":
    sys.exit(1)
PY