orderbooks/scripts/upload_archive_rclone.sh
2026-04-19 19:17:56 +02:00

473 lines
15 KiB
Bash
Executable file

#!/usr/bin/env bash
# Shell options: referencing an unset variable is an error, and a pipeline
# reports the status of its last failing stage. errexit is not enabled.
set -u
set -o pipefail

# Identity of this uploader.
SCRIPT_NAME="orderbooks_rclone_uploader"
SCRIPT_VERSION="0.1.0"

# Behaviour switches with their defaults.
MODE="dry-run"
CLEANUP_AFTER_VERIFY=0

# Base data directory: the upload-specific variable wins, then the general
# one, then the built-in default.
DATA_DIR="${ORDERBOOKS_DATA_DIR:-/var/lib/orderbooks}"
DATA_DIR="${ORDERBOOKS_UPLOAD_DATA_DIR:-${DATA_DIR}}"

# Optional path overrides; each stays empty when its variable is unset or
# empty.
RAW_DIR="${ORDERBOOKS_UPLOAD_RAW_DIR:-}"
SOURCE_MANIFEST_DIR="${ORDERBOOKS_UPLOAD_SOURCE_MANIFEST_DIR:-}"
MANIFEST_DIR="${ORDERBOOKS_UPLOAD_MANIFEST_DIR:-}"
MANIFEST_PATH="${ORDERBOOKS_UPLOAD_MANIFEST_PATH:-}"

# rclone settings.
DEST="${ORDERBOOKS_RCLONE_DEST:-}"
RCLONE_BIN="${ORDERBOOKS_RCLONE_BIN:-rclone}"
TRANSFERS="${ORDERBOOKS_RCLONE_TRANSFERS:-4}"
CHECKERS="${ORDERBOOKS_RCLONE_CHECKERS:-8}"

# File selection and local retention thresholds.
MIN_AGE_SECONDS="${ORDERBOOKS_UPLOAD_MIN_AGE_SECONDS:-600}"
RETENTION_DAYS="${ORDERBOOKS_UPLOAD_RETENTION_DAYS:-7}"
#######################################
# Print the command-line help text.
# Arguments: none
# Outputs:   help text on stdout (callers redirect it to stderr on errors)
# Returns:   exit status of cat
#######################################
usage() {
  cat <<'EOF'
Usage: scripts/upload_archive_rclone.sh [options]

Uploads closed raw collector archive files and manifests with rclone.
Default mode is dry-run. Real upload requires --execute and a destination.

Options:
  --dry-run                  Plan and run rclone copy with --dry-run (default).
  --execute                  Run real rclone copy and rclone check.
  --cleanup-after-verify     Delete uploaded local files older than retention
                             only after verification.
  --data-dir DIR             Base data directory. Default: /var/lib/orderbooks.
  --raw-dir DIR              Raw collector output directory.
                             Default: DATA_DIR/raw_orderbooks.
  --source-manifest-dir DIR  Source collector manifest directory.
                             Default: DATA_DIR/manifests.
  --manifest-dir DIR         Upload manifest output directory.
                             Default: DATA_DIR/manifests.
  --manifest-path PATH       Exact upload manifest path.
  --dest REMOTE:PATH         rclone destination. Or set ORDERBOOKS_RCLONE_DEST.
  --min-age-seconds N        Skip files modified within N seconds. Default: 600.
  --retention-days N         Keep at least N days locally. Default: 7.
  --rclone-bin PATH          rclone binary path. Default: rclone.
  --help                     Show this help.

Environment (command-line options take precedence):
  ORDERBOOKS_UPLOAD_DATA_DIR             Default for --data-dir.
  ORDERBOOKS_DATA_DIR                    Fallback default for --data-dir.
  ORDERBOOKS_UPLOAD_RAW_DIR              Default for --raw-dir.
  ORDERBOOKS_UPLOAD_SOURCE_MANIFEST_DIR  Default for --source-manifest-dir.
  ORDERBOOKS_UPLOAD_MANIFEST_DIR         Default for --manifest-dir.
  ORDERBOOKS_UPLOAD_MANIFEST_PATH        Default for --manifest-path.
  ORDERBOOKS_RCLONE_DEST                 Default for --dest.
  ORDERBOOKS_RCLONE_BIN                  Default for --rclone-bin.
  ORDERBOOKS_UPLOAD_MIN_AGE_SECONDS      Default for --min-age-seconds.
  ORDERBOOKS_UPLOAD_RETENTION_DAYS       Default for --retention-days.
  ORDERBOOKS_RCLONE_TRANSFERS            rclone --transfers value. Default: 4.
  ORDERBOOKS_RCLONE_CHECKERS             rclone --checkers value. Default: 8.

Exit status:
  0  Dry-run passed, or the upload was copied and verified.
  1  The upload operation failed.
  2  Usage error (e.g. unknown argument), or no destination configured.
  3  rclone is not available.
EOF
}
#######################################
# Abort with a usage error.
# Arguments: $1 - message printed before the help text
# Outputs:   message and help text on stderr
# Returns:   never; exits the script with status 2
#######################################
usage_error() {
  echo "$1" >&2
  usage >&2
  exit 2
}

#######################################
# Ensure a value-taking option is followed by a value.
# Arguments: $1 - option name, $2 - number of arguments left (option included)
# Returns:   0 when a value is present; otherwise exits via usage_error
#######################################
require_option_value() {
  if (( $2 < 2 )); then
    usage_error "Option $1 requires a value."
  fi
}

#######################################
# Ensure an option value is a non-negative decimal integer.
# Arguments: $1 - option name, $2 - value to check
# Returns:   0 when valid; otherwise exits via usage_error
#######################################
require_non_negative_int() {
  if [[ ! "$2" =~ ^[0-9]+$ ]]; then
    usage_error "Option $1 expects a non-negative integer, got: $2"
  fi
}

#######################################
# Parse command-line options into the global configuration variables.
# Globals:   MODE, CLEANUP_AFTER_VERIFY, DATA_DIR, RAW_DIR,
#            SOURCE_MANIFEST_DIR, MANIFEST_DIR, MANIFEST_PATH, DEST,
#            MIN_AGE_SECONDS, RETENTION_DAYS, RCLONE_BIN (written)
# Arguments: the script's command-line arguments
# Outputs:   help text on stdout for --help; errors and help on stderr
# Returns:   0 on success; exits 0 for --help and 2 for usage errors
#######################################
parse_args() {
  local opt
  while [[ $# -gt 0 ]]; do
    opt="$1"
    case "${opt}" in
      --dry-run)
        MODE="dry-run"
        shift
        ;;
      --execute)
        MODE="execute"
        shift
        ;;
      --cleanup-after-verify)
        CLEANUP_AFTER_VERIFY=1
        shift
        ;;
      --data-dir)
        require_option_value "${opt}" "$#"
        DATA_DIR="$2"
        shift 2
        ;;
      --raw-dir)
        require_option_value "${opt}" "$#"
        RAW_DIR="$2"
        shift 2
        ;;
      --source-manifest-dir)
        require_option_value "${opt}" "$#"
        SOURCE_MANIFEST_DIR="$2"
        shift 2
        ;;
      --manifest-dir)
        require_option_value "${opt}" "$#"
        MANIFEST_DIR="$2"
        shift 2
        ;;
      --manifest-path)
        require_option_value "${opt}" "$#"
        MANIFEST_PATH="$2"
        shift 2
        ;;
      --dest)
        require_option_value "${opt}" "$#"
        DEST="$2"
        shift 2
        ;;
      --min-age-seconds)
        require_option_value "${opt}" "$#"
        require_non_negative_int "${opt}" "$2"
        MIN_AGE_SECONDS="$2"
        shift 2
        ;;
      --retention-days)
        require_option_value "${opt}" "$#"
        # A negative retention would place the cleanup cutoff in the future.
        require_non_negative_int "${opt}" "$2"
        RETENTION_DAYS="$2"
        shift 2
        ;;
      --rclone-bin)
        require_option_value "${opt}" "$#"
        RCLONE_BIN="$2"
        shift 2
        ;;
      --help)
        usage
        exit 0
        ;;
      *)
        usage_error "Unknown argument: ${opt}"
        ;;
    esac
  done
}

parse_args "$@"
# Fill in the directories that were not given explicitly. A trailing slash on
# DATA_DIR is dropped before the sub-directory name is appended.
if [[ -z "${RAW_DIR}" ]]; then
  RAW_DIR="${DATA_DIR%/}/raw_orderbooks"
fi
if [[ -z "${SOURCE_MANIFEST_DIR}" ]]; then
  SOURCE_MANIFEST_DIR="${DATA_DIR%/}/manifests"
fi
if [[ -z "${MANIFEST_DIR}" ]]; then
  MANIFEST_DIR="${DATA_DIR%/}/manifests"
fi

# Take one timestamp and derive the run id from it (dropping "-" and ":"),
# so the two values can never straddle a second boundary.
STARTED_AT="$(date -u +%Y-%m-%dT%H:%M:%SZ)"
RUN_ID="${STARTED_AT//[-:]/}"
if [[ -z "${MANIFEST_PATH}" ]]; then
  MANIFEST_PATH="${MANIFEST_DIR%/}/upload_archive_${RUN_ID}.json"
fi

# Private scratch directory, removed on every exit path. It is deliberately
# not stored in TMPDIR, the conventional variable for the system temp location.
# Without this check a failed mktemp would leave the
# plan and staging paths rooted at "/".
if ! WORK_DIR="$(mktemp -d)" || [[ -z "${WORK_DIR}" || ! -d "${WORK_DIR}" ]]; then
  echo "Failed to create a temporary working directory." >&2
  exit 1
fi
trap 'rm -rf -- "${WORK_DIR}"' EXIT

PLAN_PATH="${WORK_DIR}/plan.json"
RCLONE_COPY_LOG="${WORK_DIR}/rclone_copy.log"
RCLONE_CHECK_LOG="${WORK_DIR}/rclone_check.log"
CLEANUP_PATH="${WORK_DIR}/cleanup.json"
STAGING_DIR="${WORK_DIR}/stage"

# Creating the manifest directory here is best-effort: the manifest writer
# creates it again right before writing the manifest.
mkdir -p "$(dirname "${MANIFEST_PATH}")"
if ! mkdir -p -- "${STAGING_DIR}"; then
  echo "Failed to create staging directory: ${STAGING_DIR}" >&2
  exit 1
fi
# Build the upload plan. The embedded Python program scans the raw and
# source-manifest directories, copies every eligible file into the staging
# directory and writes the selected/skipped lists plus warnings to PLAN_PATH.
# Its exit status is checked: without a plan, later steps would operate on an
# empty staging directory and could report a verified upload of nothing.
if ! python3 - "$DATA_DIR" "$RAW_DIR" "$SOURCE_MANIFEST_DIR" "$MANIFEST_PATH" "$MIN_AGE_SECONDS" "$STAGING_DIR" "$PLAN_PATH" <<'PY'
import datetime as dt
import hashlib
import json
import shutil
import sys
from pathlib import Path

# datetime.UTC only exists on Python 3.11+; timezone.utc is the same object
# and is available on older interpreters as well.
UTC = dt.timezone.utc

# File-name endings that mark a file as still open or temporary.
TEMP_ENDINGS = (".open", ".tmp", ".partial")

data_dir = Path(sys.argv[1])
raw_dir = Path(sys.argv[2])
source_manifest_dir = Path(sys.argv[3])
manifest_path = Path(sys.argv[4]).resolve()
min_age_seconds = int(sys.argv[5])
staging_dir = Path(sys.argv[6])
plan_path = Path(sys.argv[7])

now_ts = dt.datetime.now(UTC).timestamp()
data_root = data_dir.resolve()


def iso_z_from_ts(ts: float) -> str:
    """Format a POSIX timestamp as a second-resolution UTC string ending in Z."""
    moment = dt.datetime.fromtimestamp(ts, UTC).replace(microsecond=0)
    return moment.isoformat().replace("+00:00", "Z")


def sha256_file(path: Path) -> str:
    """Return the hex SHA-256 of a file, read in 1 MiB chunks."""
    digest = hashlib.sha256()
    with path.open("rb") as handle:
        for chunk in iter(lambda: handle.read(1024 * 1024), b""):
            digest.update(chunk)
    return digest.hexdigest()


def rel_for(resolved: Path) -> str:
    """Path relative to the data directory, or the bare file name outside it."""
    try:
        return resolved.relative_to(data_root).as_posix()
    except ValueError:
        return resolved.name


def iter_files(root: Path):
    """Yield every regular file below root in sorted order."""
    if not root.exists():
        return
    for path in sorted(root.rglob("*")):
        if path.is_file():
            yield path


selected = []
skipped = []
warnings = []
seen = set()
# relative path -> local path whose copy occupies that slot in staging
claimed = {}

for root, kind in [(raw_dir, "raw"), (source_manifest_dir, "manifest")]:
    if not root.exists():
        warnings.append(f"{kind} source directory does not exist: {root}")
        continue
    for path in iter_files(root):
        resolved = path.resolve()
        if resolved in seen:
            continue
        seen.add(resolved)

        try:
            stat = path.stat()
        except FileNotFoundError:
            # The file disappeared between listing and inspection.
            warnings.append(f"file vanished during planning: {path}")
            continue

        rel = rel_for(resolved)
        base = {
            "local_path": str(path),
            "relative_path": rel,
            "kind": kind,
            "bytes": stat.st_size,
            "mtime_utc": iso_z_from_ts(stat.st_mtime),
            "age_seconds": max(0, int(now_ts - stat.st_mtime)),
        }

        if path.name.startswith(".") or path.name.endswith(TEMP_ENDINGS):
            skipped.append({**base, "reason": "open_or_temporary_file"})
            continue
        if resolved == manifest_path:
            skipped.append({**base, "reason": "current_upload_manifest"})
            continue
        if base["age_seconds"] < min_age_seconds:
            skipped.append({**base, "reason": "modified_within_min_age_seconds"})
            continue

        # Files outside the data directory are keyed by bare name, so two of
        # them can map to the same staged path. Staging the second one would
        # overwrite the first while both stay listed as selected, so it is
        # skipped and reported instead.
        owner = claimed.get(rel)
        if owner is not None:
            warnings.append(
                f"relative path {rel!r} of {path} is already used by {owner}; file skipped"
            )
            skipped.append({**base, "reason": "relative_path_collision"})
            continue

        staged_path = staging_dir / rel
        staged_path.parent.mkdir(parents=True, exist_ok=True)
        try:
            shutil.copy2(path, staged_path)
        except FileNotFoundError:
            warnings.append(f"file vanished during planning: {path}")
            skipped.append({**base, "reason": "vanished_during_planning"})
            continue
        # Hash the staged copy so the recorded checksum describes exactly the
        # bytes that are handed to rclone.
        checksum = sha256_file(staged_path)
        claimed[rel] = str(path)
        selected.append({**base, "sha256": checksum, "staged_path": str(staged_path)})

plan = {
    "selected_files": selected,
    "skipped_files": skipped,
    "warnings": warnings,
}
plan_path.write_text(json.dumps(plan, indent=2, sort_keys=True) + "\n", encoding="utf-8")
PY
then
  echo "Upload planning failed; nothing was uploaded." >&2
  exit 1
fi
#######################################
# Print the tail of a captured rclone log to stderr.
# The logs live in the temporary working directory, which is removed when the
# script exits, so this is the only place their content can surface.
# Arguments: $1 - label for the header line, $2 - path of the log file
# Outputs:   header plus the last 50 log lines on stderr; nothing when the
#            log is missing or empty
#######################################
show_rclone_log() {
  local label="$1"
  local log_path="$2"
  if [[ -s "${log_path}" ]]; then
    echo "--- last lines of ${label} output ---" >&2
    tail -n 50 -- "${log_path}" >&2
  fi
}

# Detect the rclone binary and record the first line of its version output.
RCLONE_AVAILABLE=0
RCLONE_VERSION=""
if command -v "${RCLONE_BIN}" >/dev/null 2>&1; then
  RCLONE_AVAILABLE=1
  # Best effort: a failing version query leaves the version empty.
  RCLONE_VERSION="$("${RCLONE_BIN}" version 2>/dev/null | head -n 1 || true)"
fi

DEST_CONFIGURED=0
if [[ -n "${DEST}" ]]; then
  DEST_CONFIGURED=1
fi

# Result fields; the exit codes stay empty for steps that were not attempted.
COPY_EXIT_CODE=""
CHECK_EXIT_CODE=""
COPY_ATTEMPTED=0
CHECK_ATTEMPTED=0
OPERATION_STATUS="PLANNED"
GATE_STATUS="BLOCKED_REAL_UPLOAD"

if [[ "${DEST_CONFIGURED}" -eq 0 ]]; then
  OPERATION_STATUS="BLOCKED_DEST_MISSING"
elif [[ "${RCLONE_AVAILABLE}" -eq 0 ]]; then
  OPERATION_STATUS="BLOCKED_RCLONE_UNAVAILABLE"
else
  COPY_ATTEMPTED=1
  copy_args=(copy "${STAGING_DIR}/" "${DEST%/}/" --checksum --transfers "${TRANSFERS}" --checkers "${CHECKERS}")
  if [[ "${MODE}" == "dry-run" ]]; then
    copy_args+=(--dry-run)
  fi
  "${RCLONE_BIN}" "${copy_args[@]}" >"${RCLONE_COPY_LOG}" 2>&1
  COPY_EXIT_CODE=$?
  if [[ "${COPY_EXIT_CODE}" -eq 0 && "${MODE}" == "dry-run" ]]; then
    OPERATION_STATUS="DRY_RUN_PASS"
  elif [[ "${COPY_EXIT_CODE}" -eq 0 ]]; then
    # Real copy succeeded: compare the staged files against the destination.
    CHECK_ATTEMPTED=1
    "${RCLONE_BIN}" check "${STAGING_DIR}/" "${DEST%/}/" --one-way --checksum >"${RCLONE_CHECK_LOG}" 2>&1
    CHECK_EXIT_CODE=$?
    if [[ "${CHECK_EXIT_CODE}" -eq 0 ]]; then
      OPERATION_STATUS="UPLOAD_VERIFIED"
      GATE_STATUS="PASS"
    else
      OPERATION_STATUS="VERIFY_FAILED"
      GATE_STATUS="FAIL"
      show_rclone_log "rclone check" "${RCLONE_CHECK_LOG}"
    fi
  else
    OPERATION_STATUS="COPY_FAILED"
    GATE_STATUS="FAIL"
    show_rclone_log "rclone copy" "${RCLONE_COPY_LOG}"
  fi
fi
# Decide which planned local files are deleted and which are retained, and
# write both lists to CLEANUP_PATH. Files are only deleted in execute mode,
# with --cleanup-after-verify, after a verified upload, and when older than
# the retention window. The exit status is checked so a failed cleanup step
# cannot end in a successful exit code.
if ! python3 - "$PLAN_PATH" "$CLEANUP_PATH" "$MODE" "$CLEANUP_AFTER_VERIFY" "$RETENTION_DAYS" "$OPERATION_STATUS" "$GATE_STATUS" <<'PY'
import datetime as dt
import json
import sys
from pathlib import Path

# datetime.UTC only exists on Python 3.11+; timezone.utc works everywhere.
UTC = dt.timezone.utc

plan_path = Path(sys.argv[1])
cleanup_path = Path(sys.argv[2])
mode = sys.argv[3]
cleanup_after_verify = sys.argv[4] == "1"
retention_days = int(sys.argv[5])
operation_status = sys.argv[6]
# sys.argv[7] (gate status) is passed by the caller but not needed here.

if retention_days < 0:
    # A negative value would move the cutoff into the future and make every
    # uploaded file eligible for deletion.
    raise SystemExit(f"retention days must not be negative, got {retention_days}")


def iso_z(moment: dt.datetime) -> str:
    """Format an aware UTC datetime at second resolution with a Z suffix."""
    return moment.replace(microsecond=0).isoformat().replace("+00:00", "Z")


plan = json.loads(plan_path.read_text())
now = dt.datetime.now(UTC)
cutoff = now - dt.timedelta(days=retention_days)
retained = []
deleted = []

if mode == "execute" and cleanup_after_verify and operation_status == "UPLOAD_VERIFIED":
    for item in plan["selected_files"]:
        path = Path(item["local_path"])
        try:
            stat = path.stat()
        except FileNotFoundError:
            # Checked first so a missing file is reported as missing rather
            # than as being inside the retention window.
            retained.append({**item, "reason": "missing_before_cleanup"})
            continue
        except OSError as exc:
            retained.append({**item, "reason": "stat_failed", "error": str(exc)})
            continue

        mtime = dt.datetime.fromtimestamp(stat.st_mtime, UTC)
        if mtime >= cutoff:
            retained.append({**item, "reason": "within_retention_window"})
            continue

        # Only delete a file that still looks like the one that was planned
        # and uploaded; size and mtime are compared against the plan entry.
        unchanged = (
            stat.st_size == item.get("bytes", stat.st_size)
            and iso_z(mtime) == item.get("mtime_utc", iso_z(mtime))
        )
        if not unchanged:
            retained.append({**item, "reason": "changed_since_planning"})
            continue

        try:
            path.unlink()
        except FileNotFoundError:
            retained.append({**item, "reason": "missing_before_cleanup"})
        except OSError as exc:
            # One undeletable file must not abort cleanup of the others.
            retained.append({**item, "reason": "delete_failed", "error": str(exc)})
        else:
            deleted.append({**item, "deleted_at_utc": iso_z(now)})
else:
    reason = "cleanup_not_requested"
    if mode != "execute":
        reason = "dry_run"
    elif operation_status != "UPLOAD_VERIFIED":
        reason = "not_verified"
    for item in plan["selected_files"]:
        retained.append({**item, "reason": reason})

cleanup_path.write_text(
    json.dumps({"retained_local_files": retained, "deleted_local_files": deleted}, indent=2, sort_keys=True) + "\n",
    encoding="utf-8",
)
PY
then
  echo "Local cleanup step failed (operation status: ${OPERATION_STATUS}); no upload manifest was written." >&2
  exit 1
fi
# Record the end time and hand the run state to the manifest writer through
# the environment; the plan, cleanup and manifest paths are passed as
# arguments.
ENDED_AT="$(date -u +%Y-%m-%dT%H:%M:%SZ)"
export SCRIPT_NAME SCRIPT_VERSION STARTED_AT ENDED_AT
export MODE OPERATION_STATUS GATE_STATUS
export RCLONE_BIN RCLONE_AVAILABLE RCLONE_VERSION DEST
export COPY_ATTEMPTED CHECK_ATTEMPTED COPY_EXIT_CODE CHECK_EXIT_CODE
export DATA_DIR RAW_DIR SOURCE_MANIFEST_DIR MIN_AGE_SECONDS RETENTION_DAYS CLEANUP_AFTER_VERIFY

# Write the upload manifest and print a short JSON summary on stdout. The
# exit status is checked so a run that could not write its manifest does not
# end with a successful exit code.
if ! python3 - "$PLAN_PATH" "$CLEANUP_PATH" "$MANIFEST_PATH" <<'PY'
import json
import os
import sys
from pathlib import Path

plan = json.loads(Path(sys.argv[1]).read_text())
cleanup = json.loads(Path(sys.argv[2]).read_text())
manifest_path = Path(sys.argv[3])

mode = os.environ["MODE"]
operation_status = os.environ["OPERATION_STATUS"]
gate_status = os.environ["GATE_STATUS"]
copy_attempted = os.environ["COPY_ATTEMPTED"] == "1"
check_attempted = os.environ["CHECK_ATTEMPTED"] == "1"
# Exit codes arrive as empty strings when the step was not attempted.
copy_exit_code = os.environ["COPY_EXIT_CODE"]
check_exit_code = os.environ["CHECK_EXIT_CODE"]
dest = os.environ["DEST"]

# Skip reasons reported under "skipped_open_or_recent_files".
OPEN_OR_RECENT_REASONS = {"open_or_temporary_file", "modified_within_min_age_seconds"}


def public_item(item):
    """Copy a plan entry without its temporary staging path."""
    public = dict(item)
    public.pop("staged_path", None)
    return public


selected = [public_item(item) for item in plan["selected_files"]]
skipped = [public_item(item) for item in plan["skipped_files"]]
retained_local = [public_item(item) for item in cleanup["retained_local_files"]]
deleted_local = [public_item(item) for item in cleanup["deleted_local_files"]]

attempted_files = selected if copy_attempted else []
uploaded_files = selected if mode == "execute" and operation_status in {"UPLOAD_VERIFIED", "VERIFY_FAILED"} else []
verified_files = selected if mode == "execute" and operation_status == "UPLOAD_VERIFIED" else []
dry_run_files = selected if mode == "dry-run" and operation_status == "DRY_RUN_PASS" else []

manifest = {
    "schema_name": "upload_archive_manifest",
    "schema_version": 1,
    "checkpoint_id": 7,
    "checkpoint_name": "Google Drive Offload",
    "uploader": {
        "name": os.environ["SCRIPT_NAME"],
        "version": os.environ["SCRIPT_VERSION"],
    },
    "started_at_utc": os.environ["STARTED_AT"],
    "ended_at_utc": os.environ["ENDED_AT"],
    "command_mode": mode,
    "operation_status": operation_status,
    "gate_status": gate_status,
    "rclone": {
        "binary": os.environ["RCLONE_BIN"],
        "available": os.environ["RCLONE_AVAILABLE"] == "1",
        "version": os.environ["RCLONE_VERSION"],
        "destination_configured": bool(dest),
        "destination": dest if dest else None,
        "copy_attempted": copy_attempted,
        "copy_exit_code": int(copy_exit_code) if copy_exit_code else None,
        "check_attempted": check_attempted,
        "check_exit_code": int(check_exit_code) if check_exit_code else None,
    },
    "config": {
        "data_dir": os.environ["DATA_DIR"],
        "raw_dir": os.environ["RAW_DIR"],
        "source_manifest_dir": os.environ["SOURCE_MANIFEST_DIR"],
        "manifest_path": str(manifest_path),
        "min_age_seconds": int(os.environ["MIN_AGE_SECONDS"]),
        "retention_days": int(os.environ["RETENTION_DAYS"]),
        "cleanup_after_verify": os.environ["CLEANUP_AFTER_VERIFY"] == "1",
    },
    "planned_files": selected,
    "attempted_files": attempted_files,
    "dry_run_files": dry_run_files,
    "uploaded_files": uploaded_files,
    "verified_files": verified_files,
    # Both the open/temporary and the recently-modified skips belong here;
    # filtering on the "recent" reason alone dropped the open files.
    "skipped_open_or_recent_files": [
        item for item in skipped if item.get("reason") in OPEN_OR_RECENT_REASONS
    ],
    "skipped_files": skipped,
    "retained_local_files": retained_local,
    "deleted_local_files": deleted_local,
    "counts": {
        "planned": len(selected),
        "attempted": len(attempted_files),
        "dry_run": len(dry_run_files),
        "uploaded": len(uploaded_files),
        "verified": len(verified_files),
        "skipped": len(skipped),
        "retained_local": len(retained_local),
        "deleted_local": len(deleted_local),
    },
    "warnings": plan["warnings"],
    "known_gaps": [
        "A dry-run does not prove remote write access.",
        "Real upload requires a configured rclone remote outside the repository.",
        "Local files are retained unless --cleanup-after-verify is used after successful verification.",
    ],
}

if operation_status == "BLOCKED_RCLONE_UNAVAILABLE":
    manifest["warnings"].append("rclone binary was not available; copy and verification were not attempted.")
if operation_status == "BLOCKED_DEST_MISSING":
    manifest["warnings"].append("No rclone destination was configured; set --dest or ORDERBOOKS_RCLONE_DEST.")
if mode == "dry-run":
    manifest["warnings"].append("Dry-run mode does not perform a real upload; checkpoint real-upload gate remains blocked.")

# Write to a sibling temporary file and rename it into place so a reader never
# sees a half-written manifest.
manifest_path.parent.mkdir(parents=True, exist_ok=True)
partial_path = manifest_path.with_name(manifest_path.name + ".tmp")
partial_path.write_text(json.dumps(manifest, indent=2, sort_keys=True) + "\n", encoding="utf-8")
os.replace(partial_path, manifest_path)

print(
    json.dumps(
        {
            "gate_status": gate_status,
            "operation_status": operation_status,
            "manifest_path": str(manifest_path),
            "planned_files": len(selected),
            "attempted_files": len(attempted_files),
            "uploaded_files": len(uploaded_files),
            "verified_files": len(verified_files),
            "skipped_files": len(skipped),
        },
        indent=2,
        sort_keys=True,
    )
)
PY
then
  echo "Failed to write upload manifest: ${MANIFEST_PATH}" >&2
  exit 1
fi
# Translate the final operation status into the script's exit code:
# 0 for a verified upload or a passing dry-run, 2 when no destination was
# configured, 3 when rclone is unavailable, and 1 for every other status.
if [[ "${OPERATION_STATUS}" == "UPLOAD_VERIFIED" || "${OPERATION_STATUS}" == "DRY_RUN_PASS" ]]; then
  exit 0
elif [[ "${OPERATION_STATUS}" == "BLOCKED_DEST_MISSING" ]]; then
  echo "No rclone destination configured. Set --dest or ORDERBOOKS_RCLONE_DEST." >&2
  exit 2
elif [[ "${OPERATION_STATUS}" == "BLOCKED_RCLONE_UNAVAILABLE" ]]; then
  echo "rclone is not available. Install rclone before running dry-run or execute mode." >&2
  exit 3
else
  echo "Upload operation failed with status: ${OPERATION_STATUS}" >&2
  exit 1
fi