#!/usr/bin/env bash
#
# upload_archive_rclone.sh
#
# Uploads closed raw collector archive files and collector manifests to an
# rclone destination and writes a JSON upload manifest describing the run.
#
# Pipeline:
#   1. Plan    - scan the raw and source-manifest directories, skip open,
#                temporary and recently modified files, stage the rest in a
#                private temporary directory and checksum the staged copies.
#   2. Copy    - `rclone copy` the staging directory to the destination
#                (with --dry-run unless --execute was given).
#   3. Verify  - in execute mode, `rclone check --one-way` staging vs remote.
#   4. Cleanup - optionally delete verified local files older than the
#                retention window.
#   5. Report  - write the upload manifest and print a JSON summary on stdout.
#
# Environment (all optional, command-line options take precedence):
#   ORDERBOOKS_UPLOAD_DATA_DIR / ORDERBOOKS_DATA_DIR  base data directory
#   ORDERBOOKS_UPLOAD_RAW_DIR                 raw collector output directory
#   ORDERBOOKS_UPLOAD_SOURCE_MANIFEST_DIR     collector manifest directory
#   ORDERBOOKS_UPLOAD_MANIFEST_DIR            upload manifest output directory
#   ORDERBOOKS_UPLOAD_MANIFEST_PATH           exact upload manifest path
#   ORDERBOOKS_RCLONE_DEST                    rclone destination (REMOTE:PATH)
#   ORDERBOOKS_RCLONE_BIN                     rclone binary (default: rclone)
#   ORDERBOOKS_UPLOAD_MIN_AGE_SECONDS         default 600
#   ORDERBOOKS_UPLOAD_RETENTION_DAYS          default 7
#   ORDERBOOKS_RCLONE_TRANSFERS               default 4
#   ORDERBOOKS_RCLONE_CHECKERS                default 8
#
# Exit codes:
#   0  upload verified, or dry-run passed
#   1  copy/verify failed, or an internal step (plan/cleanup/manifest) failed
#   2  usage error, or no rclone destination configured
#   3  rclone binary not available

# No `-e` on purpose: the exit codes of rclone and of the Python helper steps
# are captured and handled explicitly below.
set -uo pipefail

SCRIPT_NAME="orderbooks_rclone_uploader"
SCRIPT_VERSION="0.1.0"

MODE="dry-run"
CLEANUP_AFTER_VERIFY=0
DATA_DIR="${ORDERBOOKS_UPLOAD_DATA_DIR:-${ORDERBOOKS_DATA_DIR:-/var/lib/orderbooks}}"
RAW_DIR="${ORDERBOOKS_UPLOAD_RAW_DIR:-}"
SOURCE_MANIFEST_DIR="${ORDERBOOKS_UPLOAD_SOURCE_MANIFEST_DIR:-}"
MANIFEST_DIR="${ORDERBOOKS_UPLOAD_MANIFEST_DIR:-}"
MANIFEST_PATH="${ORDERBOOKS_UPLOAD_MANIFEST_PATH:-}"
DEST="${ORDERBOOKS_RCLONE_DEST:-}"
RCLONE_BIN="${ORDERBOOKS_RCLONE_BIN:-rclone}"
MIN_AGE_SECONDS="${ORDERBOOKS_UPLOAD_MIN_AGE_SECONDS:-600}"
RETENTION_DAYS="${ORDERBOOKS_UPLOAD_RETENTION_DAYS:-7}"
TRANSFERS="${ORDERBOOKS_RCLONE_TRANSFERS:-4}"
CHECKERS="${ORDERBOOKS_RCLONE_CHECKERS:-8}"

# Print the command-line help text on stdout.
usage() {
  cat <<'EOF'
Usage:
  scripts/upload_archive_rclone.sh [options]

Uploads closed raw collector archive files and manifests with rclone.
Default mode is dry-run. Real upload requires --execute and a destination.

Options:
  --dry-run                   Plan and run rclone copy with --dry-run (default).
  --execute                   Run real rclone copy and rclone check.
  --cleanup-after-verify      Delete uploaded local files older than retention only after verification.
  --data-dir DIR              Base data directory. Default: /var/lib/orderbooks.
  --raw-dir DIR               Raw collector output directory. Default: DATA_DIR/raw_orderbooks.
  --source-manifest-dir DIR   Source collector manifest directory. Default: DATA_DIR/manifests.
  --manifest-dir DIR          Upload manifest output directory. Default: DATA_DIR/manifests.
  --manifest-path PATH        Exact upload manifest path.
  --dest REMOTE:PATH          rclone destination. Or set ORDERBOOKS_RCLONE_DEST.
  --min-age-seconds N         Skip files modified within N seconds. Default: 600.
  --retention-days N          Keep at least N days locally. Default: 7.
  --rclone-bin PATH           rclone binary path. Default: rclone.
  --help                      Show this help.
EOF
}

# Print a message on stderr and abort with the generic failure exit code.
die() {
  echo "$*" >&2
  exit 1
}

# Abort with a usage error when the option in $1 has no value after it.
# Called as `need_value "$@"` from the option loop. Without this check a
# missing value would trip `set -u` with an unhelpful "unbound variable".
need_value() {
  if [[ $# -lt 2 ]]; then
    echo "Missing value for option: $1" >&2
    usage >&2
    exit 2
  fi
}

# Abort with a usage error unless $2 (the value of option $1) is a
# non-negative integer. Validating up front keeps a bad value from being
# discovered only after files were already staged or uploaded.
require_non_negative_int() {
  if [[ ! "$2" =~ ^[0-9]+$ ]]; then
    echo "$1 must be a non-negative integer, got: '$2'" >&2
    exit 2
  fi
}

while [[ $# -gt 0 ]]; do
  case "$1" in
    --dry-run)
      MODE="dry-run"
      shift
      ;;
    --execute)
      MODE="execute"
      shift
      ;;
    --cleanup-after-verify)
      CLEANUP_AFTER_VERIFY=1
      shift
      ;;
    --data-dir)
      need_value "$@"
      DATA_DIR="$2"
      shift 2
      ;;
    --raw-dir)
      need_value "$@"
      RAW_DIR="$2"
      shift 2
      ;;
    --source-manifest-dir)
      need_value "$@"
      SOURCE_MANIFEST_DIR="$2"
      shift 2
      ;;
    --manifest-dir)
      need_value "$@"
      MANIFEST_DIR="$2"
      shift 2
      ;;
    --manifest-path)
      need_value "$@"
      MANIFEST_PATH="$2"
      shift 2
      ;;
    --dest)
      need_value "$@"
      DEST="$2"
      shift 2
      ;;
    --min-age-seconds)
      need_value "$@"
      MIN_AGE_SECONDS="$2"
      shift 2
      ;;
    --retention-days)
      need_value "$@"
      RETENTION_DAYS="$2"
      shift 2
      ;;
    --rclone-bin)
      need_value "$@"
      RCLONE_BIN="$2"
      shift 2
      ;;
    --help)
      usage
      exit 0
      ;;
    *)
      echo "Unknown argument: $1" >&2
      usage >&2
      exit 2
      ;;
  esac
done

require_non_negative_int "--min-age-seconds" "${MIN_AGE_SECONDS}"
require_non_negative_int "--retention-days" "${RETENTION_DAYS}"

command -v python3 >/dev/null 2>&1 \
  || die "python3 is required but was not found on PATH."

# Directories that were not given explicitly are derived from DATA_DIR.
if [[ -z "${RAW_DIR}" ]]; then
  RAW_DIR="${DATA_DIR%/}/raw_orderbooks"
fi
if [[ -z "${SOURCE_MANIFEST_DIR}" ]]; then
  SOURCE_MANIFEST_DIR="${DATA_DIR%/}/manifests"
fi
if [[ -z "${MANIFEST_DIR}" ]]; then
  MANIFEST_DIR="${DATA_DIR%/}/manifests"
fi

STARTED_AT="$(date -u +%Y-%m-%dT%H:%M:%SZ)"
RUN_ID="$(date -u +%Y%m%dT%H%M%SZ)"
if [[ -z "${MANIFEST_PATH}" ]]; then
  MANIFEST_PATH="${MANIFEST_DIR%/}/upload_archive_${RUN_ID}.json"
fi

# Private scratch directory. Deliberately NOT named TMPDIR: that variable is
# the conventional temp-dir setting honoured by child processes, and
# reassigning it would change where python3 and rclone put their own
# temporary files whenever it is exported in the caller's environment.
WORK_DIR="$(mktemp -d)" || die "Failed to create a temporary working directory."
trap 'rm -rf -- "${WORK_DIR}"' EXIT

PLAN_PATH="${WORK_DIR}/plan.json"
RCLONE_COPY_LOG="${WORK_DIR}/rclone_copy.log"
RCLONE_CHECK_LOG="${WORK_DIR}/rclone_check.log"
CLEANUP_PATH="${WORK_DIR}/cleanup.json"
STAGING_DIR="${WORK_DIR}/stage"

mkdir -p -- "$(dirname -- "${MANIFEST_PATH}")" "${STAGING_DIR}" \
  || die "Failed to create manifest or staging directory."

# ---------------------------------------------------------------------------
# Step 1: plan. Select files, stage copies, write plan.json.
# ---------------------------------------------------------------------------
python3 - "$DATA_DIR" "$RAW_DIR" "$SOURCE_MANIFEST_DIR" "$MANIFEST_PATH" "$MIN_AGE_SECONDS" "$STAGING_DIR" "$PLAN_PATH" <<'PY'
import datetime as dt
import hashlib
import json
import shutil
import sys
from pathlib import Path

# dt.timezone.utc instead of dt.UTC: the latter only exists on Python 3.11+.
UTC = dt.timezone.utc

data_dir = Path(sys.argv[1])
raw_dir = Path(sys.argv[2])
source_manifest_dir = Path(sys.argv[3])
manifest_path = Path(sys.argv[4]).resolve()
min_age_seconds = int(sys.argv[5])
staging_dir = Path(sys.argv[6])
plan_path = Path(sys.argv[7])
now = dt.datetime.now(UTC)

TEMP_SUFFIXES = (".open", ".tmp", ".partial")


def iso_z_from_ts(ts: float) -> str:
    """Format a POSIX timestamp as a second-resolution UTC string ending in Z."""
    return dt.datetime.fromtimestamp(ts, UTC).replace(microsecond=0).isoformat().replace("+00:00", "Z")


def sha256_file(path: Path) -> str:
    """Return the hex SHA-256 of a file, read in 1 MiB chunks."""
    digest = hashlib.sha256()
    with path.open("rb") as handle:
        for chunk in iter(lambda: handle.read(1024 * 1024), b""):
            digest.update(chunk)
    return digest.hexdigest()


def rel_for(path: Path) -> str:
    """Path relative to data_dir, or the bare file name when outside it."""
    resolved = path.resolve()
    try:
        return resolved.relative_to(data_dir.resolve()).as_posix()
    except ValueError:
        return resolved.name


def iter_files(root: Path):
    """Yield every regular file below root in sorted order."""
    if not root.exists():
        return
    for path in sorted(root.rglob("*")):
        if path.is_file():
            yield path


selected = []
skipped = []
warnings = []
seen = set()          # resolved source paths already handled
staged_rels = set()   # relative paths already occupied in the staging tree

for root, kind in [(raw_dir, "raw"), (source_manifest_dir, "manifest")]:
    if not root.exists():
        warnings.append(f"{kind} source directory does not exist: {root}")
        continue
    for path in iter_files(root):
        resolved = path.resolve()
        if resolved in seen:
            continue
        seen.add(resolved)

        # Stat once; a file may vanish between the directory scan and here.
        try:
            stat = path.stat()
        except OSError as exc:
            warnings.append(f"could not stat {path}: {exc}")
            continue

        rel = rel_for(path)
        base = {
            "local_path": str(path),
            "relative_path": rel,
            "kind": kind,
            "bytes": stat.st_size,
            "mtime_utc": iso_z_from_ts(stat.st_mtime),
            "age_seconds": max(0, int(now.timestamp() - stat.st_mtime)),
        }

        if path.name.startswith(".") or path.name.endswith(TEMP_SUFFIXES):
            skipped.append({**base, "reason": "open_or_temporary_file"})
            continue
        if resolved == manifest_path:
            skipped.append({**base, "reason": "current_upload_manifest"})
            continue
        if base["age_seconds"] < min_age_seconds:
            skipped.append({**base, "reason": "modified_within_min_age_seconds"})
            continue

        # Files outside data_dir are staged under their bare name, so two
        # different sources can map to the same staged path. Staging both
        # would overwrite one while still reporting it as uploaded (and
        # later eligible for deletion), so the later one is skipped.
        if rel in staged_rels:
            warnings.append(f"relative path collision, not uploaded: {path} -> {rel}")
            skipped.append({**base, "reason": "relative_path_collision"})
            continue
        staged_rels.add(rel)

        staged_path = staging_dir / rel
        staged_path.parent.mkdir(parents=True, exist_ok=True)
        shutil.copy2(path, staged_path)
        # Hash the staged copy: those are the bytes rclone actually uploads.
        checksum = sha256_file(staged_path)
        selected.append({**base, "sha256": checksum, "staged_path": str(staged_path)})

plan = {
    "selected_files": selected,
    "skipped_files": skipped,
    "warnings": warnings,
}
plan_path.write_text(json.dumps(plan, indent=2, sort_keys=True) + "\n", encoding="utf-8")
PY
PLAN_EXIT_CODE=$?
if [[ "${PLAN_EXIT_CODE}" -ne 0 ]]; then
  # Never hand a partial or empty staging tree to rclone.
  die "Upload planning failed (python3 exit code ${PLAN_EXIT_CODE}); nothing was uploaded."
fi

# ---------------------------------------------------------------------------
# Step 2/3: rclone copy and (execute mode only) rclone check.
# ---------------------------------------------------------------------------
RCLONE_AVAILABLE=0
RCLONE_VERSION=""
if command -v "${RCLONE_BIN}" >/dev/null 2>&1; then
  RCLONE_AVAILABLE=1
  # Best effort: the version string is informational only.
  RCLONE_VERSION="$("${RCLONE_BIN}" version 2>/dev/null | head -n 1 || true)"
fi

DEST_CONFIGURED=0
if [[ -n "${DEST}" ]]; then
  DEST_CONFIGURED=1
fi

COPY_EXIT_CODE=""
CHECK_EXIT_CODE=""
COPY_ATTEMPTED=0
CHECK_ATTEMPTED=0
OPERATION_STATUS="PLANNED"
GATE_STATUS="BLOCKED_REAL_UPLOAD"

if [[ "${DEST_CONFIGURED}" -eq 0 ]]; then
  OPERATION_STATUS="BLOCKED_DEST_MISSING"
elif [[ "${RCLONE_AVAILABLE}" -eq 0 ]]; then
  OPERATION_STATUS="BLOCKED_RCLONE_UNAVAILABLE"
else
  COPY_ATTEMPTED=1
  copy_args=(
    copy "${STAGING_DIR}/" "${DEST%/}/"
    --checksum
    --transfers "${TRANSFERS}"
    --checkers "${CHECKERS}"
  )
  if [[ "${MODE}" == "dry-run" ]]; then
    copy_args+=(--dry-run)
  fi
  "${RCLONE_BIN}" "${copy_args[@]}" >"${RCLONE_COPY_LOG}" 2>&1
  COPY_EXIT_CODE=$?

  if [[ "${COPY_EXIT_CODE}" -eq 0 && "${MODE}" == "dry-run" ]]; then
    # The gate stays blocked: a dry-run proves nothing about remote writes.
    OPERATION_STATUS="DRY_RUN_PASS"
  elif [[ "${COPY_EXIT_CODE}" -eq 0 ]]; then
    CHECK_ATTEMPTED=1
    "${RCLONE_BIN}" check "${STAGING_DIR}/" "${DEST%/}/" --one-way --checksum >"${RCLONE_CHECK_LOG}" 2>&1
    CHECK_EXIT_CODE=$?
    if [[ "${CHECK_EXIT_CODE}" -eq 0 ]]; then
      OPERATION_STATUS="UPLOAD_VERIFIED"
      GATE_STATUS="PASS"
    else
      OPERATION_STATUS="VERIFY_FAILED"
      GATE_STATUS="FAIL"
    fi
  else
    OPERATION_STATUS="COPY_FAILED"
    GATE_STATUS="FAIL"
  fi
fi

# ---------------------------------------------------------------------------
# Step 4: optional local cleanup. Writes cleanup.json.
# ---------------------------------------------------------------------------
python3 - "$PLAN_PATH" "$CLEANUP_PATH" "$MODE" "$CLEANUP_AFTER_VERIFY" "$RETENTION_DAYS" "$OPERATION_STATUS" <<'PY'
import datetime as dt
import json
import sys
from pathlib import Path

UTC = dt.timezone.utc

plan_path = Path(sys.argv[1])
cleanup_path = Path(sys.argv[2])
mode = sys.argv[3]
cleanup_after_verify = sys.argv[4] == "1"
retention_days = int(sys.argv[5])
operation_status = sys.argv[6]

plan = json.loads(plan_path.read_text(encoding="utf-8"))
now = dt.datetime.now(UTC)
cutoff = now - dt.timedelta(days=retention_days)
deleted_at = now.replace(microsecond=0).isoformat().replace("+00:00", "Z")

retained = []
deleted = []


def iso_z_from_ts(ts: float) -> str:
    """Same formatting as the plan step, so mtime strings are comparable."""
    return dt.datetime.fromtimestamp(ts, UTC).replace(microsecond=0).isoformat().replace("+00:00", "Z")


def retain(item, reason):
    retained.append({**item, "reason": reason})


if mode == "execute" and cleanup_after_verify and operation_status == "UPLOAD_VERIFIED":
    for item in plan["selected_files"]:
        path = Path(item["local_path"])
        try:
            stat = path.stat()
        except FileNotFoundError:
            retain(item, "missing_before_cleanup")
            continue
        except OSError:
            retain(item, "stat_failed")
            continue

        if dt.datetime.fromtimestamp(stat.st_mtime, UTC) >= cutoff:
            retain(item, "within_retention_window")
            continue

        # Only the staged snapshot was verified against the remote. If the
        # local file changed since planning, its current content is not
        # known to be uploaded, so it must not be deleted.
        if stat.st_size != item["bytes"] or iso_z_from_ts(stat.st_mtime) != item["mtime_utc"]:
            retain(item, "changed_since_staging")
            continue

        # A single failing unlink must not abort the run and lose the
        # record of what was already deleted.
        try:
            path.unlink()
        except FileNotFoundError:
            retain(item, "missing_before_cleanup")
        except OSError:
            retain(item, "delete_failed")
        else:
            deleted.append({**item, "deleted_at_utc": deleted_at})
else:
    if mode != "execute":
        reason = "dry_run"
    elif operation_status != "UPLOAD_VERIFIED":
        reason = "not_verified"
    else:
        reason = "cleanup_not_requested"
    for item in plan["selected_files"]:
        retain(item, reason)

cleanup_path.write_text(
    json.dumps({"retained_local_files": retained, "deleted_local_files": deleted}, indent=2, sort_keys=True) + "\n",
    encoding="utf-8",
)
PY
CLEANUP_STEP_EXIT_CODE=$?
if [[ "${CLEANUP_STEP_EXIT_CODE}" -ne 0 ]]; then
  die "Cleanup step failed (python3 exit code ${CLEANUP_STEP_EXIT_CODE}); upload status was ${OPERATION_STATUS}, no manifest written."
fi

# ---------------------------------------------------------------------------
# Step 5: write the upload manifest and print the JSON summary.
# ---------------------------------------------------------------------------
ENDED_AT="$(date -u +%Y-%m-%dT%H:%M:%SZ)"

# The manifest step reads its scalar inputs from the environment.
export SCRIPT_NAME SCRIPT_VERSION STARTED_AT ENDED_AT
export MODE OPERATION_STATUS GATE_STATUS
export RCLONE_BIN RCLONE_AVAILABLE RCLONE_VERSION DEST
export COPY_ATTEMPTED CHECK_ATTEMPTED COPY_EXIT_CODE CHECK_EXIT_CODE
export DATA_DIR RAW_DIR SOURCE_MANIFEST_DIR MIN_AGE_SECONDS RETENTION_DAYS CLEANUP_AFTER_VERIFY

python3 - "$PLAN_PATH" "$CLEANUP_PATH" "$MANIFEST_PATH" <<'PY'
import json
import os
import sys
from pathlib import Path

plan = json.loads(Path(sys.argv[1]).read_text(encoding="utf-8"))
cleanup = json.loads(Path(sys.argv[2]).read_text(encoding="utf-8"))
manifest_path = Path(sys.argv[3])

mode = os.environ["MODE"]
operation_status = os.environ["OPERATION_STATUS"]
gate_status = os.environ["GATE_STATUS"]
copy_attempted = os.environ["COPY_ATTEMPTED"] == "1"
check_attempted = os.environ["CHECK_ATTEMPTED"] == "1"
copy_exit_code = os.environ["COPY_EXIT_CODE"]
check_exit_code = os.environ["CHECK_EXIT_CODE"]
dest = os.environ["DEST"]


def public_item(item):
    """Copy of a plan entry without the run-private staging path."""
    public = dict(item)
    public.pop("staged_path", None)
    return public


selected = [public_item(item) for item in plan["selected_files"]]
skipped = [public_item(item) for item in plan["skipped_files"]]
retained_local = [public_item(item) for item in cleanup["retained_local_files"]]
deleted_local = [public_item(item) for item in cleanup["deleted_local_files"]]

executed = mode == "execute"
attempted_files = selected if copy_attempted else []
uploaded_files = selected if executed and operation_status in {"UPLOAD_VERIFIED", "VERIFY_FAILED"} else []
verified_files = selected if executed and operation_status == "UPLOAD_VERIFIED" else []
dry_run_files = selected if mode == "dry-run" and operation_status == "DRY_RUN_PASS" else []

# Both the "open/temporary" and the "recently modified" skips belong in
# the list whose name promises open OR recent files.
OPEN_OR_RECENT_REASONS = {"open_or_temporary_file", "modified_within_min_age_seconds"}

manifest = {
    "schema_name": "upload_archive_manifest",
    "schema_version": 1,
    "checkpoint_id": 7,
    "checkpoint_name": "Google Drive Offload",
    "uploader": {
        "name": os.environ["SCRIPT_NAME"],
        "version": os.environ["SCRIPT_VERSION"],
    },
    "started_at_utc": os.environ["STARTED_AT"],
    "ended_at_utc": os.environ["ENDED_AT"],
    "command_mode": mode,
    "operation_status": operation_status,
    "gate_status": gate_status,
    "rclone": {
        "binary": os.environ["RCLONE_BIN"],
        "available": os.environ["RCLONE_AVAILABLE"] == "1",
        "version": os.environ["RCLONE_VERSION"],
        "destination_configured": bool(dest),
        "destination": dest if dest else None,
        "copy_attempted": copy_attempted,
        "copy_exit_code": int(copy_exit_code) if copy_exit_code else None,
        "check_attempted": check_attempted,
        "check_exit_code": int(check_exit_code) if check_exit_code else None,
    },
    "config": {
        "data_dir": os.environ["DATA_DIR"],
        "raw_dir": os.environ["RAW_DIR"],
        "source_manifest_dir": os.environ["SOURCE_MANIFEST_DIR"],
        "manifest_path": str(manifest_path),
        "min_age_seconds": int(os.environ["MIN_AGE_SECONDS"]),
        "retention_days": int(os.environ["RETENTION_DAYS"]),
        "cleanup_after_verify": os.environ["CLEANUP_AFTER_VERIFY"] == "1",
    },
    "planned_files": selected,
    "attempted_files": attempted_files,
    "dry_run_files": dry_run_files,
    "uploaded_files": uploaded_files,
    "verified_files": verified_files,
    "skipped_open_or_recent_files": [
        item for item in skipped if item.get("reason") in OPEN_OR_RECENT_REASONS
    ],
    "skipped_files": skipped,
    "retained_local_files": retained_local,
    "deleted_local_files": deleted_local,
    "counts": {
        "planned": len(selected),
        "attempted": len(attempted_files),
        "dry_run": len(dry_run_files),
        "uploaded": len(uploaded_files),
        "verified": len(verified_files),
        "skipped": len(skipped),
        "retained_local": len(retained_local),
        "deleted_local": len(deleted_local),
    },
    "warnings": plan["warnings"],
    "known_gaps": [
        "A dry-run does not prove remote write access.",
        "Real upload requires a configured rclone remote outside the repository.",
        "Local files are retained unless --cleanup-after-verify is used after successful verification.",
    ],
}

if operation_status == "BLOCKED_RCLONE_UNAVAILABLE":
    manifest["warnings"].append("rclone binary was not available; copy and verification were not attempted.")
if operation_status == "BLOCKED_DEST_MISSING":
    manifest["warnings"].append("No rclone destination was configured; set --dest or ORDERBOOKS_RCLONE_DEST.")
if mode == "dry-run":
    manifest["warnings"].append("Dry-run mode does not perform a real upload; checkpoint real-upload gate remains blocked.")

manifest_path.parent.mkdir(parents=True, exist_ok=True)
manifest_path.write_text(json.dumps(manifest, indent=2, sort_keys=True) + "\n", encoding="utf-8")

print(
    json.dumps(
        {
            "gate_status": gate_status,
            "operation_status": operation_status,
            "manifest_path": str(manifest_path),
            "planned_files": len(selected),
            "attempted_files": len(attempted_files),
            "uploaded_files": len(uploaded_files),
            "verified_files": len(verified_files),
            "skipped_files": len(plan["skipped_files"]),
        },
        indent=2,
        sort_keys=True,
    )
)
PY
MANIFEST_STEP_EXIT_CODE=$?
if [[ "${MANIFEST_STEP_EXIT_CODE}" -ne 0 ]]; then
  # A run without a manifest must never be reported as a success.
  die "Writing the upload manifest failed (python3 exit code ${MANIFEST_STEP_EXIT_CODE}); upload status was ${OPERATION_STATUS}."
fi

case "${OPERATION_STATUS}" in
  UPLOAD_VERIFIED | DRY_RUN_PASS)
    exit 0
    ;;
  BLOCKED_DEST_MISSING)
    echo "No rclone destination configured. Set --dest or ORDERBOOKS_RCLONE_DEST." >&2
    exit 2
    ;;
  BLOCKED_RCLONE_UNAVAILABLE)
    echo "rclone is not available. Install rclone before running dry-run or execute mode." >&2
    exit 3
    ;;
  *)
    echo "Upload operation failed with status: ${OPERATION_STATUS}" >&2
    exit 1
    ;;
esac