462 lines
15 KiB
Bash
Executable file
462 lines
15 KiB
Bash
Executable file
#!/usr/bin/env bash
#
# Stages closed raw collector archive files and manifests, copies them to an
# rclone destination, verifies the copy and writes a JSON upload manifest.
#
# errexit (-e) is not enabled: the exit codes of the rclone invocations are
# captured via $? further down and recorded instead of aborting the script.
set -uo pipefail

# Identity of this uploader; never reassigned, so both are read-only.
readonly SCRIPT_NAME="orderbooks_rclone_uploader"
readonly SCRIPT_VERSION="0.1.0"

# Run mode: "dry-run" (default) or "execute".
MODE="dry-run"
# 1 when local cleanup after a verified upload was requested, 0 otherwise.
CLEANUP_AFTER_VERIFY=0

# Directory settings. An empty value means "not configured yet"; the
# DATA_DIR-relative defaults are filled in after command-line parsing.
DATA_DIR="${ORDERBOOKS_UPLOAD_DATA_DIR:-${ORDERBOOKS_DATA_DIR:-/var/lib/orderbooks}}"
RAW_DIR="${ORDERBOOKS_UPLOAD_RAW_DIR:-}"
SOURCE_MANIFEST_DIR="${ORDERBOOKS_UPLOAD_SOURCE_MANIFEST_DIR:-}"
MANIFEST_DIR="${ORDERBOOKS_UPLOAD_MANIFEST_DIR:-}"
MANIFEST_PATH="${ORDERBOOKS_UPLOAD_MANIFEST_PATH:-}"

# rclone destination (REMOTE:PATH) and the binary used to reach it.
DEST="${ORDERBOOKS_RCLONE_DEST:-}"
RCLONE_BIN="${ORDERBOOKS_RCLONE_BIN:-rclone}"

# Selection and retention thresholds.
MIN_AGE_SECONDS="${ORDERBOOKS_UPLOAD_MIN_AGE_SECONDS:-600}"
RETENTION_DAYS="${ORDERBOOKS_UPLOAD_RETENTION_DAYS:-7}"

# Values handed to rclone's --transfers / --checkers flags.
TRANSFERS="${ORDERBOOKS_RCLONE_TRANSFERS:-4}"
CHECKERS="${ORDERBOOKS_RCLONE_CHECKERS:-8}"
#######################################
# Print the command-line help text.
# Arguments: none
# Outputs:   help text on stdout
#######################################
usage() {
  cat <<'EOF'
Usage: scripts/upload_archive_rclone.sh [options]

Uploads closed raw collector archive files and manifests with rclone.
Default mode is dry-run. Real upload requires --execute and a destination.

Options:
  --dry-run                  Plan and run rclone copy with --dry-run (default).
  --execute                  Run real rclone copy and rclone check.
  --cleanup-after-verify     Delete uploaded local files older than retention only after verification.
  --data-dir DIR             Base data directory. Default: /var/lib/orderbooks.
  --raw-dir DIR              Raw collector output directory. Default: DATA_DIR/raw_orderbooks.
  --source-manifest-dir DIR  Source collector manifest directory. Default: DATA_DIR/manifests.
  --manifest-dir DIR         Upload manifest output directory. Default: DATA_DIR/manifests.
  --manifest-path PATH       Exact upload manifest path.
  --dest REMOTE:PATH         rclone destination. Or set ORDERBOOKS_RCLONE_DEST.
  --min-age-seconds N        Skip files modified within N seconds. Default: 600.
  --retention-days N         Keep at least N days locally. Default: 7.
  --rclone-bin PATH          rclone binary path. Default: rclone.
  --help                     Show this help.

Environment:
  ORDERBOOKS_RCLONE_TRANSFERS  Value for rclone --transfers. Default: 4.
  ORDERBOOKS_RCLONE_CHECKERS   Value for rclone --checkers. Default: 8.
EOF
}
#######################################
# Abort with a usage error when an option that takes a value is the last
# argument on the command line.
# Arguments: $1 - option name, $2 - number of arguments left (including $1)
# Outputs:   error message and usage text on stderr
# Returns:   0 when a value is present; exits the script with 2 otherwise
#######################################
require_option_value() {
  if (( $2 < 2 )); then
    echo "Missing value for argument: $1" >&2
    usage >&2
    exit 2
  fi
}

#######################################
# Parse the command line into the global settings.
# Globals:   MODE, CLEANUP_AFTER_VERIFY, DATA_DIR, RAW_DIR,
#            SOURCE_MANIFEST_DIR, MANIFEST_DIR, MANIFEST_PATH, DEST,
#            MIN_AGE_SECONDS, RETENTION_DAYS, RCLONE_BIN (written)
# Arguments: the script's command-line arguments
# Returns:   0 on success; exits 0 after --help, 2 on an unknown argument
#            or a missing option value
#######################################
parse_cli_args() {
  local target
  while [[ $# -gt 0 ]]; do
    target=""
    case "$1" in
      --dry-run)
        MODE="dry-run"
        shift
        continue
        ;;
      --execute)
        MODE="execute"
        shift
        continue
        ;;
      --cleanup-after-verify)
        CLEANUP_AFTER_VERIFY=1
        shift
        continue
        ;;
      --help)
        usage
        exit 0
        ;;
      --data-dir) target="DATA_DIR" ;;
      --raw-dir) target="RAW_DIR" ;;
      --source-manifest-dir) target="SOURCE_MANIFEST_DIR" ;;
      --manifest-dir) target="MANIFEST_DIR" ;;
      --manifest-path) target="MANIFEST_PATH" ;;
      --dest) target="DEST" ;;
      --min-age-seconds) target="MIN_AGE_SECONDS" ;;
      --retention-days) target="RETENTION_DAYS" ;;
      --rclone-bin) target="RCLONE_BIN" ;;
      *)
        echo "Unknown argument: $1" >&2
        usage >&2
        exit 2
        ;;
    esac
    # Only value-taking options reach this point. Checking the argument count
    # first avoids an "unbound variable" abort on $2 under `set -u`.
    require_option_value "$1" "$#"
    printf -v "${target}" '%s' "$2"
    shift 2
  done
}

parse_cli_args "$@"
#######################################
# Fill in directory settings that were left empty, relative to DATA_DIR.
# Globals:   DATA_DIR (read); RAW_DIR, SOURCE_MANIFEST_DIR, MANIFEST_DIR
#            (written when empty)
#######################################
apply_directory_defaults() {
  local base="${DATA_DIR:-}"
  base="${base%/}"
  if [[ -z "${RAW_DIR:-}" ]]; then
    RAW_DIR="${base}/raw_orderbooks"
  fi
  if [[ -z "${SOURCE_MANIFEST_DIR:-}" ]]; then
    SOURCE_MANIFEST_DIR="${base}/manifests"
  fi
  if [[ -z "${MANIFEST_DIR:-}" ]]; then
    MANIFEST_DIR="${base}/manifests"
  fi
}

#######################################
# Test whether a string is a non-negative decimal integer.
# Arguments: $1 - candidate value
# Returns:   0 when $1 consists only of digits, 1 otherwise
#######################################
is_non_negative_int() {
  [[ "$1" =~ ^[0-9]+$ ]]
}

#######################################
# Reject non-numeric or negative numeric settings before they are used.
# A negative retention would place the cleanup cutoff in the future, and a
# non-integer value would crash the embedded Python steps.
# Globals:   MIN_AGE_SECONDS, RETENTION_DAYS, TRANSFERS, CHECKERS (read)
# Outputs:   error message on stderr for the first invalid setting
# Returns:   0 when all defined settings are valid; exits 2 otherwise
#######################################
validate_numeric_settings() {
  local setting value
  for setting in MIN_AGE_SECONDS RETENTION_DAYS TRANSFERS CHECKERS; do
    # A setting that is not defined at all is skipped here; with `set -u`
    # its first use fails on its own.
    [[ -n "${!setting+set}" ]] || continue
    value="${!setting}"
    if ! is_non_negative_int "${value}"; then
      echo "Invalid value for ${setting}: '${value}' (expected a non-negative integer)." >&2
      exit 2
    fi
  done
}

apply_directory_defaults
validate_numeric_settings
# Start timestamp of this run. RUN_ID is the same instant in compact form
# (dashes and colons removed), so the two can never disagree.
STARTED_AT="$(date -u +%Y-%m-%dT%H:%M:%SZ)"
RUN_ID="${STARTED_AT//[-:]/}"
if [[ -z "${MANIFEST_PATH:-}" ]]; then
  MANIFEST_PATH="${MANIFEST_DIR%/}/upload_archive_${RUN_ID}.json"
fi

# Private scratch directory, removed on every exit path. It is deliberately
# not stored in TMPDIR so the temp-dir variable consulted by mktemp and by
# child processes is left as the caller set it.
WORK_DIR="$(mktemp -d)" || WORK_DIR=""
if [[ -z "${WORK_DIR}" || ! -d "${WORK_DIR}" ]]; then
  # Without this guard the paths below would resolve to /plan.json, /stage, ...
  echo "Failed to create a temporary working directory." >&2
  exit 1
fi
trap 'rm -rf -- "${WORK_DIR}"' EXIT

# Intermediate artefacts exchanged between the steps of this script.
PLAN_PATH="${WORK_DIR}/plan.json"
RCLONE_COPY_LOG="${WORK_DIR}/rclone_copy.log"
RCLONE_CHECK_LOG="${WORK_DIR}/rclone_check.log"
CLEANUP_PATH="${WORK_DIR}/cleanup.json"
STAGING_DIR="${WORK_DIR}/stage"

if ! mkdir -p -- "$(dirname -- "${MANIFEST_PATH}")" "${STAGING_DIR}"; then
  echo "Failed to create the manifest directory or the staging directory." >&2
  exit 1
fi
# Planning step: scan the raw and source-manifest directories, stage every file
# that is old enough into STAGING_DIR and describe the result in PLAN_PATH
# (JSON with "selected_files", "skipped_files" and "warnings").
python3 - "$DATA_DIR" "$RAW_DIR" "$SOURCE_MANIFEST_DIR" "$MANIFEST_PATH" "$MIN_AGE_SECONDS" "$STAGING_DIR" "$PLAN_PATH" <<'PY'
import datetime as dt
import hashlib
import json
import shutil
import sys
from pathlib import Path

# dt.timezone.utc instead of dt.UTC: the dt.UTC alias only exists on Python 3.11+.
UTC = dt.timezone.utc

data_dir = Path(sys.argv[1])
raw_dir = Path(sys.argv[2])
source_manifest_dir = Path(sys.argv[3])
manifest_path = Path(sys.argv[4]).resolve()
min_age_seconds = int(sys.argv[5])
staging_dir = Path(sys.argv[6])
plan_path = Path(sys.argv[7])
data_root = data_dir.resolve()
now_ts = dt.datetime.now(UTC).timestamp()


def iso_z_from_ts(ts: float) -> str:
    """Format a POSIX timestamp as a second-precision UTC string ending in Z."""
    moment = dt.datetime.fromtimestamp(ts, UTC).replace(microsecond=0)
    return moment.isoformat().replace("+00:00", "Z")


def sha256_file(path: Path) -> str:
    """Return the hex SHA-256 of a file, read in 1 MiB chunks."""
    digest = hashlib.sha256()
    with path.open("rb") as handle:
        for chunk in iter(lambda: handle.read(1024 * 1024), b""):
            digest.update(chunk)
    return digest.hexdigest()


def rel_for(path: Path, root: Path) -> str:
    """Return the upload-relative POSIX path for a file found under root.

    Files inside the data directory keep their path relative to it. Files
    outside it keep their sub-path below the scanned root, prefixed with the
    root's directory name, so directory structure is not flattened to a bare
    file name.
    """
    try:
        return path.resolve().relative_to(data_root).as_posix()
    except ValueError:
        return (Path(root.resolve().name) / path.relative_to(root)).as_posix()


def iter_files(root: Path):
    """Yield the regular files below root in sorted order."""
    if not root.exists():
        return
    for path in sorted(root.rglob("*")):
        if path.is_file():
            yield path


def discard(path: Path) -> None:
    """Remove a partially staged file, ignoring a file that is already gone."""
    try:
        path.unlink()
    except FileNotFoundError:
        pass


selected = []
skipped = []
warnings = []
seen = set()
staged_sources = {}  # relative upload path -> local path that claimed it

for root, kind in [(raw_dir, "raw"), (source_manifest_dir, "manifest")]:
    if not root.exists():
        warnings.append(f"{kind} source directory does not exist: {root}")
        continue
    for path in iter_files(root):
        resolved = path.resolve()
        if resolved in seen:
            continue
        seen.add(resolved)
        try:
            stat = path.stat()
        except FileNotFoundError:
            # The file disappeared between listing and stat.
            warnings.append(f"file vanished before it could be inspected: {path}")
            continue
        rel = rel_for(path, root)
        age_seconds = max(0, int(now_ts - stat.st_mtime))
        base = {
            "local_path": str(path),
            "relative_path": rel,
            "kind": kind,
            "bytes": stat.st_size,
            "mtime_utc": iso_z_from_ts(stat.st_mtime),
            "age_seconds": age_seconds,
        }
        if resolved == manifest_path:
            skipped.append({**base, "reason": "current_upload_manifest"})
            continue
        if age_seconds < min_age_seconds:
            skipped.append({**base, "reason": "modified_within_min_age_seconds"})
            continue
        if rel in staged_sources:
            # Two different files map to one upload path; never overwrite
            # the file that was staged first.
            skipped.append({**base, "reason": "relative_path_collision"})
            warnings.append(
                f"{path} maps to the same upload path as {staged_sources[rel]}: {rel}"
            )
            continue
        staged_path = staging_dir / rel
        staged_path.parent.mkdir(parents=True, exist_ok=True)
        try:
            shutil.copy2(path, staged_path)
        except FileNotFoundError:
            discard(staged_path)
            skipped.append({**base, "reason": "vanished_before_staging"})
            warnings.append(f"file vanished before it could be staged: {path}")
            continue
        # Hash the staged copy so the recorded checksum describes exactly the
        # bytes that are handed to rclone.
        checksum = sha256_file(staged_path)
        staged_sources[rel] = str(path)
        selected.append({**base, "sha256": checksum, "staged_path": str(staged_path)})

plan = {
    "selected_files": selected,
    "skipped_files": skipped,
    "warnings": warnings,
}
plan_path.write_text(json.dumps(plan, indent=2, sort_keys=True) + "\n", encoding="utf-8")
PY
PLAN_EXIT_CODE=$?
if [[ "${PLAN_EXIT_CODE}" -ne 0 ]]; then
  # Without a plan the later steps would operate on an empty staging
  # directory and could report success, so stop here.
  echo "Upload planning failed (python3 exit code ${PLAN_EXIT_CODE}); nothing was uploaded." >&2
  exit 1
fi
#######################################
# Record whether the rclone binary can be found and its version banner.
# Globals:   RCLONE_BIN (read); RCLONE_AVAILABLE, RCLONE_VERSION (written)
#######################################
detect_rclone() {
  RCLONE_AVAILABLE=0
  RCLONE_VERSION=""
  if command -v "${RCLONE_BIN}" >/dev/null 2>&1; then
    RCLONE_AVAILABLE=1
    # Keep only the first line of `rclone version`. A failing version call is
    # tolerated and simply leaves the version empty.
    RCLONE_VERSION="$("${RCLONE_BIN}" version 2>/dev/null | head -n 1 || true)"
  fi
}

#######################################
# Record whether an rclone destination was configured.
# Globals:   DEST (read); DEST_CONFIGURED (written: 1 when DEST is non-empty)
#######################################
detect_destination() {
  DEST_CONFIGURED=0
  if [[ -n "${DEST}" ]]; then
    DEST_CONFIGURED=1
  fi
}

detect_rclone
detect_destination
#######################################
# Print the destination argument handed to rclone.
# A bare remote ("remote:") is passed through unchanged: appending a slash
# would turn it into "remote:/", which some backends treat as a different
# directory. Every other destination gets exactly one trailing slash.
# Arguments: $1 - configured destination
# Outputs:   destination argument on stdout (no trailing newline)
#######################################
rclone_dest_arg() {
  local dest="$1"
  if [[ "${dest}" == *: ]]; then
    printf '%s' "${dest}"
  else
    printf '%s/' "${dest%/}"
  fi
}

#######################################
# Copy the staging directory to the destination and, in execute mode, verify
# the copy with `rclone check`.
# Globals:   DEST_CONFIGURED, RCLONE_AVAILABLE, RCLONE_BIN, STAGING_DIR, DEST,
#            MODE, TRANSFERS, CHECKERS, RCLONE_COPY_LOG, RCLONE_CHECK_LOG (read)
#            COPY_EXIT_CODE, CHECK_EXIT_CODE, COPY_ATTEMPTED, CHECK_ATTEMPTED,
#            OPERATION_STATUS, GATE_STATUS (written)
# Outputs:   rclone output is redirected to the two log files
# Returns:   0 always; the outcome is reported through the globals
#######################################
run_rclone_transfer() {
  local dest_arg
  local -a copy_args

  COPY_EXIT_CODE=""
  CHECK_EXIT_CODE=""
  COPY_ATTEMPTED=0
  CHECK_ATTEMPTED=0
  OPERATION_STATUS="PLANNED"
  # The gate only leaves this state after a real upload has been attempted.
  GATE_STATUS="BLOCKED_REAL_UPLOAD"

  if [[ "${DEST_CONFIGURED}" -eq 0 ]]; then
    OPERATION_STATUS="BLOCKED_DEST_MISSING"
    return 0
  fi
  if [[ "${RCLONE_AVAILABLE}" -eq 0 ]]; then
    OPERATION_STATUS="BLOCKED_RCLONE_UNAVAILABLE"
    return 0
  fi

  dest_arg="$(rclone_dest_arg "${DEST}")"

  COPY_ATTEMPTED=1
  copy_args=(copy "${STAGING_DIR}/" "${dest_arg}" --checksum --transfers "${TRANSFERS}" --checkers "${CHECKERS}")
  if [[ "${MODE}" == "dry-run" ]]; then
    copy_args+=(--dry-run)
  fi
  "${RCLONE_BIN}" "${copy_args[@]}" >"${RCLONE_COPY_LOG}" 2>&1
  COPY_EXIT_CODE=$?

  if [[ "${COPY_EXIT_CODE}" -ne 0 ]]; then
    OPERATION_STATUS="COPY_FAILED"
    GATE_STATUS="FAIL"
    return 0
  fi
  if [[ "${MODE}" == "dry-run" ]]; then
    OPERATION_STATUS="DRY_RUN_PASS"
    return 0
  fi

  # Real upload succeeded: confirm every staged file exists at the destination.
  CHECK_ATTEMPTED=1
  "${RCLONE_BIN}" check "${STAGING_DIR}/" "${dest_arg}" --one-way --checksum >"${RCLONE_CHECK_LOG}" 2>&1
  CHECK_EXIT_CODE=$?
  if [[ "${CHECK_EXIT_CODE}" -eq 0 ]]; then
    OPERATION_STATUS="UPLOAD_VERIFIED"
    GATE_STATUS="PASS"
  else
    OPERATION_STATUS="VERIFY_FAILED"
    GATE_STATUS="FAIL"
  fi
  return 0
}

run_rclone_transfer
# Cleanup step: after a verified real upload with --cleanup-after-verify,
# delete selected local files older than the retention window. In every other
# case all selected files are retained. The outcome is written to CLEANUP_PATH
# as JSON with "retained_local_files" and "deleted_local_files".
python3 - "$PLAN_PATH" "$CLEANUP_PATH" "$MODE" "$CLEANUP_AFTER_VERIFY" "$RETENTION_DAYS" "$OPERATION_STATUS" "$GATE_STATUS" <<'PY'
import datetime as dt
import json
import sys
from pathlib import Path

# dt.timezone.utc instead of dt.UTC: the dt.UTC alias only exists on Python 3.11+.
UTC = dt.timezone.utc

plan_path = Path(sys.argv[1])
cleanup_path = Path(sys.argv[2])
mode = sys.argv[3]
cleanup_after_verify = sys.argv[4] == "1"
retention_days = int(sys.argv[5])
operation_status = sys.argv[6]
gate_status = sys.argv[7]

if retention_days < 0:
    # A negative retention would move the cutoff into the future and delete
    # every verified file regardless of its age.
    sys.exit(f"retention days must not be negative: {retention_days}")

plan = json.loads(plan_path.read_text())
now = dt.datetime.now(UTC)
cutoff = now - dt.timedelta(days=retention_days)
retained = []
deleted = []


def iso_z(moment: dt.datetime) -> str:
    """Format an aware UTC datetime as a second-precision string ending in Z."""
    return moment.replace(microsecond=0).isoformat().replace("+00:00", "Z")


def retention_reason(item, path: Path):
    """Return why a file must be kept, or None when it may be deleted."""
    try:
        stat = path.stat()
    except FileNotFoundError:
        return "missing_before_cleanup"
    mtime = dt.datetime.fromtimestamp(stat.st_mtime, UTC)
    # The file was planned with this size and mtime. If either differs, it
    # changed after staging and the remote copy does not cover its content.
    if stat.st_size != item["bytes"] or iso_z(mtime) != item["mtime_utc"]:
        return "modified_since_upload"
    if mtime >= cutoff:
        return "within_retention_window"
    return None


if mode == "execute" and cleanup_after_verify and operation_status == "UPLOAD_VERIFIED":
    for item in plan["selected_files"]:
        path = Path(item["local_path"])
        reason = retention_reason(item, path)
        if reason is None:
            try:
                path.unlink()
            except FileNotFoundError:
                reason = "missing_before_cleanup"
            except OSError as exc:
                # Keep going so the remaining files and the cleanup record are
                # still processed; the failure is recorded on the item.
                retained.append({**item, "reason": "delete_failed", "error": str(exc)})
                continue
        if reason is None:
            deleted.append({**item, "deleted_at_utc": iso_z(now)})
        else:
            retained.append({**item, "reason": reason})
else:
    reason = "cleanup_not_requested"
    if mode != "execute":
        reason = "dry_run"
    elif operation_status != "UPLOAD_VERIFIED":
        reason = "not_verified"
    for item in plan["selected_files"]:
        retained.append({**item, "reason": reason})

cleanup_path.write_text(
    json.dumps({"retained_local_files": retained, "deleted_local_files": deleted}, indent=2, sort_keys=True) + "\n",
    encoding="utf-8",
)
PY
CLEANUP_STEP_EXIT_CODE=$?
if [[ "${CLEANUP_STEP_EXIT_CODE}" -ne 0 ]]; then
  # The manifest step needs the cleanup record; stop rather than continue
  # towards a success exit code without it.
  echo "Local cleanup step failed (python3 exit code ${CLEANUP_STEP_EXIT_CODE}); no upload manifest was written." >&2
  exit 1
fi
# Manifest step: combine the plan, the cleanup record and the run state into
# the upload manifest at MANIFEST_PATH and print a short JSON summary.
ENDED_AT="$(date -u +%Y-%m-%dT%H:%M:%SZ)"

# The run state is handed to the Python step through the environment.
export SCRIPT_NAME SCRIPT_VERSION STARTED_AT ENDED_AT
export MODE OPERATION_STATUS GATE_STATUS
export RCLONE_BIN RCLONE_AVAILABLE RCLONE_VERSION DEST
export COPY_ATTEMPTED CHECK_ATTEMPTED COPY_EXIT_CODE CHECK_EXIT_CODE
export DATA_DIR RAW_DIR SOURCE_MANIFEST_DIR MIN_AGE_SECONDS RETENTION_DAYS CLEANUP_AFTER_VERIFY

python3 - "$PLAN_PATH" "$CLEANUP_PATH" "$MANIFEST_PATH" <<'PY'
import json
import os
import sys
from pathlib import Path

plan = json.loads(Path(sys.argv[1]).read_text())
cleanup = json.loads(Path(sys.argv[2]).read_text())
manifest_path = Path(sys.argv[3])

mode = os.environ["MODE"]
operation_status = os.environ["OPERATION_STATUS"]
gate_status = os.environ["GATE_STATUS"]
copy_attempted = os.environ["COPY_ATTEMPTED"] == "1"
check_attempted = os.environ["CHECK_ATTEMPTED"] == "1"
# Exit codes are empty strings when the corresponding rclone call never ran.
copy_exit_code = os.environ["COPY_EXIT_CODE"]
check_exit_code = os.environ["CHECK_EXIT_CODE"]
dest = os.environ["DEST"]


def public_item(item):
    """Return a copy of a plan item without the temporary staging path."""
    public = dict(item)
    public.pop("staged_path", None)
    return public


selected = [public_item(item) for item in plan["selected_files"]]
skipped = [public_item(item) for item in plan["skipped_files"]]
retained_local = [public_item(item) for item in cleanup["retained_local_files"]]
deleted_local = [public_item(item) for item in cleanup["deleted_local_files"]]
attempted_files = selected if copy_attempted else []
# A failed verification still means the copy itself completed.
uploaded_files = selected if mode == "execute" and operation_status in {"UPLOAD_VERIFIED", "VERIFY_FAILED"} else []
verified_files = selected if mode == "execute" and operation_status == "UPLOAD_VERIFIED" else []
dry_run_files = selected if mode == "dry-run" and operation_status == "DRY_RUN_PASS" else []

manifest = {
    "schema_name": "upload_archive_manifest",
    "schema_version": 1,
    "checkpoint_id": 7,
    "checkpoint_name": "Google Drive Offload",
    "uploader": {
        "name": os.environ["SCRIPT_NAME"],
        "version": os.environ["SCRIPT_VERSION"],
    },
    "started_at_utc": os.environ["STARTED_AT"],
    "ended_at_utc": os.environ["ENDED_AT"],
    "command_mode": mode,
    "operation_status": operation_status,
    "gate_status": gate_status,
    "rclone": {
        "binary": os.environ["RCLONE_BIN"],
        "available": os.environ["RCLONE_AVAILABLE"] == "1",
        "version": os.environ["RCLONE_VERSION"],
        "destination_configured": bool(dest),
        "destination": dest if dest else None,
        "copy_attempted": copy_attempted,
        "copy_exit_code": int(copy_exit_code) if copy_exit_code else None,
        "check_attempted": check_attempted,
        "check_exit_code": int(check_exit_code) if check_exit_code else None,
    },
    "config": {
        "data_dir": os.environ["DATA_DIR"],
        "raw_dir": os.environ["RAW_DIR"],
        "source_manifest_dir": os.environ["SOURCE_MANIFEST_DIR"],
        "manifest_path": str(manifest_path),
        "min_age_seconds": int(os.environ["MIN_AGE_SECONDS"]),
        "retention_days": int(os.environ["RETENTION_DAYS"]),
        "cleanup_after_verify": os.environ["CLEANUP_AFTER_VERIFY"] == "1",
    },
    "planned_files": selected,
    "attempted_files": attempted_files,
    "dry_run_files": dry_run_files,
    "uploaded_files": uploaded_files,
    "verified_files": verified_files,
    "skipped_open_or_recent_files": [
        item for item in skipped if item.get("reason") == "modified_within_min_age_seconds"
    ],
    "skipped_files": skipped,
    "retained_local_files": retained_local,
    "deleted_local_files": deleted_local,
    "counts": {
        "planned": len(selected),
        "attempted": len(attempted_files),
        "dry_run": len(dry_run_files),
        "uploaded": len(uploaded_files),
        "verified": len(verified_files),
        "skipped": len(skipped),
        "retained_local": len(retained_local),
        "deleted_local": len(deleted_local),
    },
    "warnings": plan["warnings"],
    "known_gaps": [
        "A dry-run does not prove remote write access.",
        "Real upload requires a configured rclone remote outside the repository.",
        "Local files are retained unless --cleanup-after-verify is used after successful verification.",
    ],
}

if operation_status == "BLOCKED_RCLONE_UNAVAILABLE":
    manifest["warnings"].append("rclone binary was not available; copy and verification were not attempted.")
if operation_status == "BLOCKED_DEST_MISSING":
    manifest["warnings"].append("No rclone destination was configured; set --dest or ORDERBOOKS_RCLONE_DEST.")
if mode == "dry-run":
    manifest["warnings"].append("Dry-run mode does not perform a real upload; checkpoint real-upload gate remains blocked.")

# Write to a sibling temporary file and rename it into place so a reader never
# sees a half-written manifest.
manifest_path.parent.mkdir(parents=True, exist_ok=True)
tmp_manifest_path = manifest_path.with_name(manifest_path.name + ".tmp")
tmp_manifest_path.write_text(json.dumps(manifest, indent=2, sort_keys=True) + "\n", encoding="utf-8")
os.replace(tmp_manifest_path, manifest_path)

print(
    json.dumps(
        {
            "gate_status": gate_status,
            "operation_status": operation_status,
            "manifest_path": str(manifest_path),
            "planned_files": len(selected),
            "attempted_files": len(attempted_files),
            "uploaded_files": len(uploaded_files),
            "verified_files": len(verified_files),
            "skipped_files": len(plan["skipped_files"]),
        },
        indent=2,
        sort_keys=True,
    )
)
PY
MANIFEST_STEP_EXIT_CODE=$?
if [[ "${MANIFEST_STEP_EXIT_CODE}" -ne 0 ]]; then
  # Never fall through to a success exit code when no manifest was produced.
  echo "Writing the upload manifest failed (python3 exit code ${MANIFEST_STEP_EXIT_CODE})." >&2
  exit 1
fi
# Map the final operation status to the process exit code:
#   0 - verified upload or passing dry-run
#   2 - no destination configured
#   3 - rclone not available
#   1 - any other status
if [[ "${OPERATION_STATUS}" == "UPLOAD_VERIFIED" || "${OPERATION_STATUS}" == "DRY_RUN_PASS" ]]; then
  exit 0
fi

if [[ "${OPERATION_STATUS}" == "BLOCKED_DEST_MISSING" ]]; then
  echo "No rclone destination configured. Set --dest or ORDERBOOKS_RCLONE_DEST." >&2
  exit 2
fi

if [[ "${OPERATION_STATUS}" == "BLOCKED_RCLONE_UNAVAILABLE" ]]; then
  echo "rclone is not available. Install rclone before running dry-run or execute mode." >&2
  exit 3
fi

echo "Upload operation failed with status: ${OPERATION_STATUS}" >&2
exit 1