diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..081ae2b --- /dev/null +++ b/.dockerignore @@ -0,0 +1,6 @@ +node_modules +npm-debug.log +.git +.gitignore +.env +var diff --git a/.env.example b/.env.example index 8dfd1e9..3dda61f 100644 --- a/.env.example +++ b/.env.example @@ -1,6 +1,40 @@ -NEAR_INTENTS_API_KEY=your_solver_jwt +# Local dev / container runtime values +NEAR_INTENTS_API_KEY=replace_me NEAR_INTENTS_WS_URL=wss://solver-relay-v2.chaindefuser.com/ws -KAFKA_BROKERS=127.0.0.1:9092 -KAFKA_CLIENT_ID=trading-system +KAFKA_BROKERS=redpanda:9092 +KAFKA_CLIENT_ID=unrip +KAFKA_TOPIC_RAW_NEAR_INTENTS_QUOTE=raw.near_intents.quote KAFKA_TOPIC_NORM_SWAP_DEMAND=norm.swap_demand +KAFKA_TOPIC_CMD_EXECUTE_TRADE=cmd.execute_trade +KAFKA_TOPIC_EXEC_TRADE_RESULT=exec.trade_result KAFKA_CONSUMER_GROUP_DUMMY=dummy-reactor-v1 +KAFKA_CONSUMER_GROUP_EXECUTOR=dummy-executor-v1 +EXECUTOR_STATE_DIR=/var/lib/unrip/executor-state + +# Repo-driven Hetzner bootstrap values live separately from the app .env. +# Copy scripts/hetzner/bootstrap-secrets.env.example to +# scripts/hetzner/bootstrap-secrets.env, fill in the values, then: +# source scripts/hetzner/bootstrap-secrets.env +# bash scripts/hetzner/bootstrap.sh +# +# The local-machine bootstrap flow is: +# 1. provide Hetzner token + SSH key path + DNS/ingress values + app/bootstrap secrets +# 2. run Terraform from infra/terraform/hetzner +# 3. wait for cloud-init/k3s readiness +# 4. fetch kubeconfig to .state/hetzner/kubeconfig.yaml +# 5. create Kubernetes Secrets from local values +# 6. build/import the current app image into k3s +# 7. 
apply repo Kubernetes manifests and let the bootstrap job create topics +# +# Expected bootstrap inputs: +# - HCLOUD_TOKEN +# - SSH_PUBLIC_KEY_PATH +# - TF_ADMIN_CIDR_BLOCKS +# - BASE_DOMAIN +# - FORGEJO_DOMAIN +# - FORGEJO_ROOT_URL +# - NEAR_INTENTS_API_KEY +# - FORGEJO_RUNNER_REGISTRATION_TOKEN +# +# Future k3s deployment should source the app values from Kubernetes Secret/ConfigMap. +# Hetzner bootstrap path clones the repo to /opt/unrip/repo for later deploy/k8s assets. diff --git a/.forgejo/workflows/deploy.yml b/.forgejo/workflows/deploy.yml new file mode 100644 index 0000000..4ae6953 --- /dev/null +++ b/.forgejo/workflows/deploy.yml @@ -0,0 +1,69 @@ +name: deploy + +on: + push: + branches: + - main + +jobs: + deploy: + runs-on: linux-amd64 + env: + IMAGE_TAG: ${{ github.sha }} + REGISTRY_HOST: ${{ vars.REGISTRY_HOST }} + PROJECT_NAME: ${{ vars.PROJECT_NAME || 'unrip' }} + PROJECT_NAMESPACE: ${{ vars.PROJECT_NAMESPACE || vars.PROJECT_NAME || 'unrip' }} + PROJECT_DEPLOYMENTS: ${{ vars.PROJECT_DEPLOYMENTS || 'near-intents-ingest,dummy-reactor,dummy-executor,dummy-consumer' }} + BUILDAH_ISOLATION: chroot + STORAGE_DRIVER: vfs + steps: + - name: Install tooling + run: | + apk add --no-cache buildah kubectl + + - name: Checkout + env: + REPO_URL: ${{ github.server_url }}/${{ github.repository }}.git + REPO_TOKEN: ${{ github.token }} + run: | + rm -rf .git + git init . 
+ git remote add origin "https://oauth2:${REPO_TOKEN}@${REPO_URL#https://}" + git fetch --depth=1 origin "$GITHUB_SHA" + git checkout --detach FETCH_HEAD + + - name: Load kubeconfig + run: | + mkdir -p "$HOME/.kube" + printf '%s' '${{ secrets.KUBECONFIG_B64 }}' | base64 -d > "$HOME/.kube/config" + kubectl get ns + + - name: Login to registry + run: | + buildah login -u '${{ secrets.REGISTRY_USERNAME }}' -p '${{ secrets.REGISTRY_PASSWORD }}' "$REGISTRY_HOST" + + - name: Resolve deployment settings + run: | + IMAGE="$REGISTRY_HOST/$PROJECT_NAME:$IMAGE_TAG" + { + echo "IMAGE=$IMAGE" + echo "PROJECT_NAMESPACE=$PROJECT_NAMESPACE" + echo "PROJECT_DEPLOYMENTS=$PROJECT_DEPLOYMENTS" + } >> "$GITHUB_ENV" + + - name: Build and push image + run: | + buildah bud --storage-driver "$STORAGE_DRIVER" -t "$IMAGE" . + buildah push --storage-driver "$STORAGE_DRIVER" "$IMAGE" "docker://$IMAGE" + + - name: Roll deployments to new image + run: | + IFS=',' read -r -a DEPLOYMENTS <<< "$PROJECT_DEPLOYMENTS" + + for deployment in "${DEPLOYMENTS[@]}"; do + deployment="$(echo "$deployment" | xargs)" + [ -n "$deployment" ] || continue + + kubectl -n "$PROJECT_NAMESPACE" set image "deployment/$deployment" app="$IMAGE" + kubectl -n "$PROJECT_NAMESPACE" rollout status "deployment/$deployment" --timeout=180s + done diff --git a/.gitignore b/.gitignore index c1e8ad7..9523a32 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,14 @@ .ant-colony/ +.state/ .venv/ __pycache__/ *.pyc .env +deploy/k8s/overlays/hetzner-single-node/secrets/*.env +deploy/k8s/overlays/hetzner-single-node/secrets/*.htpasswd +!deploy/k8s/overlays/hetzner-single-node/secrets/*.example +scripts/hetzner/bootstrap-secrets.env +infra/terraform/hetzner/.terraform/ +infra/terraform/hetzner/.terraform.lock.hcl +infra/terraform/hetzner/terraform.tfstate +infra/terraform/hetzner/terraform.tfstate.* diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..3451298 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,10 @@ +FROM 
node:22-bookworm-slim +WORKDIR /app + +COPY package.json package-lock.json ./ +RUN npm ci --omit=dev + +COPY . . + +ENV NODE_ENV=production +CMD ["node", "src/apps/dummy-consumer.mjs"] diff --git a/README.md b/README.md index cac6315..ecce6f1 100644 --- a/README.md +++ b/README.md @@ -1,35 +1,21 @@ # near-intents-monitor -Minimal event-driven POC for the first trading-system component: +Production-shaped first slice of the trading system: - **venue ingest**: NEAR Intents solver-bus quote flow -- **central bus**: Redpanda / Kafka-compatible broker -- **dummy reactor**: placeholder consumer for later trade-decision logic +- **bus**: Redpanda first, Kafka-compatible by design +- **reactor**: dummy decision engine emitting commands +- **executor**: dummy execution worker with durable idempotency state +- **result consumer**: downstream observer of execution outcomes -## Architecture - -```text -NEAR Intents WebSocket - | - v -src/apps/near-intents-ingest.mjs - | - +--> raw.near_intents.quote - | - +--> norm.swap_demand - | - v - src/apps/dummy-consumer.mjs -``` - -The ingest app connects to the NEAR Intents websocket, subscribes to `quote` and `quote_status`, normalizes quote demand, and publishes to a Kafka-compatible topic. 
- -## Project structure +## Canonical repo shape ```text src/ apps/ near-intents-ingest.mjs + dummy-reactor.mjs + dummy-executor.mjs dummy-consumer.mjs bus/ kafka/ @@ -37,98 +23,346 @@ src/ consumer.mjs core/ event-envelope.mjs + executor-state-store.mjs log.mjs pair-filter.mjs + schemas.mjs lib/ - env.mjs config.mjs + env.mjs venues/ near-intents/ ingest.mjs normalize.mjs ws.mjs +compose.yml +Dockerfile +docs/contracts.md +deploy/hetzner/README.md ``` -## Environment +## Event flow -Create `.env` in repo root: - -```env -NEAR_INTENTS_API_KEY=your_solver_jwt -NEAR_INTENTS_WS_URL=wss://solver-relay-v2.chaindefuser.com/ws -KAFKA_BROKERS=127.0.0.1:9092 -KAFKA_CLIENT_ID=trading-system -KAFKA_TOPIC_NORM_SWAP_DEMAND=norm.swap_demand -KAFKA_CONSUMER_GROUP_DUMMY=dummy-reactor-v1 +```text +NEAR Intents WebSocket + | + +--> raw.near_intents.quote + | + v +norm.swap_demand + | + v +cmd.execute_trade + | + v +exec.trade_result ``` -### Broker notes +Core rule: services do not call each other directly for trading flow; they communicate through bus topics only. -- `KAFKA_BROKERS` accepts a comma-separated broker list. -- Redpanda works because the apps use the Kafka protocol via `kafkajs`. -- `src/lib/config.mjs` is the shared config loader for both app entrypoints. -- The ingest app publishes normalized quote-demand events to `norm.swap_demand` by default. +## Contracts +See `docs/contracts.md`. -## Install +Current topics: +- `raw.near_intents.quote` +- `norm.swap_demand` +- `cmd.execute_trade` +- `exec.trade_result` + +## Primary deployment path: repo-driven Hetzner bootstrap + +The primary production path is no longer a Compose-only VM workflow. 
+ +The intended operating model is: +- Terraform provisions a Hetzner single-node environment +- cloud-init installs k3s automatically on first boot +- a local operator workstation performs the first repo-driven bootstrap +- Kubernetes manifests install Redpanda, the app workloads, Forgejo, runner, registry, and ingress-related components +- once the in-cluster Git + CI stack is alive, routine app deploys move to self-hosted CI + +This is a two-phase model: +- **Phase 0:** local workstation bootstrap of a brand-new cluster +- **Phase 1:** self-hosted Forgejo + runner takes over app delivery + +Compose still exists for local development and optional single-machine testing, but it is not the canonical production story. + +## Prerequisites for first deployment + +Install locally on the operator workstation: +- Terraform `>= 1.6` +- `kubectl` +- `docker` +- `curl` + +You also need: +- a Hetzner Cloud API token +- a local SSH public key file for Terraform node provisioning +- DNS control for your chosen base domain and Forgejo hostname +- preferably a Tailscale tailnet and auth key for private admin/control-plane access +- the repo checked out locally + +## Required bootstrap secrets and inputs + +Create the bootstrap env file: + +```bash +cp scripts/hetzner/bootstrap-secrets.env.example scripts/hetzner/bootstrap-secrets.env +``` + +Set at least: +- `HCLOUD_TOKEN` +- `SSH_PUBLIC_KEY_PATH` +- `PUBLIC_DOMAIN` +- recommended: + - `TAILSCALE_AUTH_KEY` + - `TAILSCALE_CONTROL_PLANE_HOSTNAME` +- optional fallback: + - `TF_ADMIN_CIDR_BLOCKS` +- `BASE_DOMAIN` +- `FORGEJO_DOMAIN` +- `FORGEJO_ROOT_URL` +- `REGISTRY_DOMAIN` +- `LETSENCRYPT_EMAIL` +- `REGISTRY_USERNAME` +- `REGISTRY_PASSWORD` +- `NEAR_INTENTS_API_KEY` +- `FORGEJO_RUNNER_REGISTRATION_TOKEN` +- optional DNS automation: + - Cloudflare: + - `CLOUDFLARE_API_TOKEN` + - `CLOUDFLARE_ZONE_ID` + - Porkbun: + - `PORKBUN_API_KEY` + - `PORKBUN_SECRET_API_KEY` + +Then load them: + +```bash +source 
scripts/hetzner/bootstrap-secrets.env +``` + +## First bootstrap sequence + +Run the end-to-end bootstrap from repo root: + +```bash +bash scripts/hetzner/bootstrap.sh +``` + +Current repo behavior of that script: +1. runs Terraform in `infra/terraform/hetzner` +2. optionally creates DNS records for the base, Forgejo, and registry hosts via Cloudflare or Porkbun +3. if configured, joins the node to Tailscale and prefers the Tailscale control-plane hostname for Kubernetes API access +4. waits for SSH and the k3s API endpoint to become ready +5. fetches the real k3s kubeconfig from the node and writes it to `.state/hetzner/kubeconfig.yaml` +6. renders the Hetzner single-node overlay from local operator inputs +7. creates registry pull/auth secrets +8. applies the Kubernetes bootstrap manifests +9. builds the app image locally and imports it into k3s on the node +10. performs the first rollout using the imported bootstrap image + +Use the generated kubeconfig afterward: + +```bash +export KUBECONFIG=$PWD/.state/hetzner/kubeconfig.yaml +kubectl get nodes -o wide +kubectl get pods -A +kubectl -n unrip get deploy,pods +kubectl -n forgejo get deploy,pods,svc +``` + +## What is deployed into k3s + +The repo-managed Kubernetes assets are under `deploy/k8s/`. 
+ +Current single-node target includes resources for: +- `unrip` workloads in namespace `unrip` +- Redpanda +- Forgejo +- Forgejo runner +- private registry +- ingress-nginx namespace/resources +- cert-manager namespace/resources +- ACME issuers and ingress definitions +- a bootstrap job for Redpanda topic creation + +Shared platform namespaces: +- `forgejo` +- `registry` +- `ingress-nginx` +- `cert-manager` + +Project-specific namespaces: +- `unrip` +- future projects should get their own namespace rather than sharing `unrip` + +Important current-state nuance: +- the bootstrap script currently applies `deploy/k8s/base` +- the longer-term intended target is `deploy/k8s/overlays/hetzner-single-node` + +## Executor persistence in k3s + +The executor is stateful by design because it persists idempotency/execution tracking. + +Current persistence boundary: +- app env uses `EXECUTOR_STATE_DIR=/var/lib/unrip/executor-state` +- in Kubernetes, the executor deployment mounts storage at that path +- the Hetzner single-node overlay pins storage to the k3s `local-path` storage class +- cloud-init also prepares the host directory boundary for executor state on first boot + +Operational meaning: +- executor state lives on node-backed storage in the single-node k3s environment +- if that PVC or underlying node storage is lost, duplicate-suppression history is lost too +- treat executor persistence as part of the minimal durable state of the cluster + +## Failure recovery and operator checks + +### If bootstrap fails before Terraform completes +Re-run after fixing the local input problem: +- missing token +- invalid CIDRs +- invalid SSH public key path + +If the infrastructure must be torn down: + +```bash +source scripts/hetzner/bootstrap-secrets.env +bash scripts/hetzner/destroy.sh +``` + +### If Terraform succeeds but Kubernetes is not ready +Check the public API and cluster state from the workstation: + +```bash +export KUBECONFIG=$PWD/.state/hetzner/kubeconfig.yaml +kubectl 
get nodes -o wide +kubectl get pods -A +kubectl get events -A --sort-by=.lastTimestamp | tail -n 50 +``` + +Typical next checks: +- cloud-init may still be finishing +- k3s may still be starting +- a workload may be crash-looping due to missing secret values or image-delivery issues + +### If workloads do not roll out +Inspect the affected namespace: + +```bash +kubectl -n unrip get pods +kubectl -n unrip describe pod +kubectl -n unrip logs deploy/dummy-executor --tail=100 +kubectl -n forgejo logs deploy/forgejo --tail=100 +``` + +### If you need to recreate secrets +The workstation bootstrap creates these Secrets: +- `unrip/unrip-secrets` +- `forgejo/forgejo-secrets` + +Verify them: + +```bash +kubectl -n unrip get secret unrip-secrets +kubectl -n forgejo get secret forgejo-secrets +``` + +### Current known limitations +One important gap has already been identified: +- bootstrap and CI are not yet fully production-hardened, even though the first deploy path now fetches the real kubeconfig and imports the bootstrap image directly into k3s + +Treat the current bootstrap as a repo-driven first-deploy path suitable for testing, with hardening still pending. 
+ +## Self-hosted CI handoff + +After cluster bootstrap: +- open Forgejo at `https://${FORGEJO_DOMAIN}` +- seed or push this repo into Forgejo +- create Forgejo repository secrets: + - `KUBECONFIG_B64` + - `REGISTRY_USERNAME` + - `REGISTRY_PASSWORD` +- create Forgejo repository variables: + - `REGISTRY_HOST=${REGISTRY_DOMAIN}` + - optional: `PROJECT_NAME=unrip` + - optional: `PROJECT_NAMESPACE=unrip` + - optional: `PROJECT_DEPLOYMENTS=near-intents-ingest,dummy-reactor,dummy-executor,dummy-consumer` +- push to `main` + +Routine application deploys then follow `.forgejo/workflows/deploy.yml`: +- build image as `REGISTRY_HOST/PROJECT_NAME:${GIT_SHA}` +- push to the private registry +- `kubectl set image` for each deployment listed in `PROJECT_DEPLOYMENTS` inside `PROJECT_NAMESPACE` +- wait for rollout + +If project variables are omitted, the workflow defaults to the current repo project: +- `PROJECT_NAME=unrip` +- `PROJECT_NAMESPACE=unrip` +- `PROJECT_DEPLOYMENTS=near-intents-ingest,dummy-reactor,dummy-executor,dummy-consumer` + +Infrastructure changes remain Terraform-driven from the operator workstation unless and until that responsibility is also automated. + +For the detailed operator runbooks, see: +- `docs/hetzner-k3s-bootstrap.md` +- `docs/hetzner-self-hosted-ci-runbook.md` +- `deploy/k8s/projects/README.md` +- `docs/next-session-architecture.md` + +## Local development with Compose + +Compose remains available for local development and debugging. 
```bash npm install +cp .env.example .env +# edit .env + +docker compose build +docker compose up -d ``` -## Run +Useful commands: -### Start NEAR Intents ingest - -Use the package script: +```bash +docker compose ps +docker compose logs -f +docker compose logs -f near-intents-ingest dummy-reactor dummy-executor dummy-consumer +docker compose restart dummy-executor +docker compose down +docker compose down -v +``` +### Individual services ```bash npm run near-intents:ingest +npm run dummy-reactor +npm run dummy-executor +npm run dummy-consumer ``` -Or run the app directly: - -```bash -node src/apps/near-intents-ingest.mjs -``` - -Optional exact-pair filter: - +Optional pair filter: ```bash npm run near-intents:ingest -- --pair 'asset_a->asset_b' ``` -Example: +## Idempotent executor behavior +- every command has a `command_id` +- commands carry `idempotency_key` and `execution_key` +- executor persists state under `EXECUTOR_STATE_DIR` +- completed commands are skipped after restart or replay -```bash -npm run near-intents:ingest -- --pair 'nep141:btc.omft.near->nep141:gnosis-0x420ca0f9b9b604ce0fd9c18ef134c705e5fa3430.omft.near' -``` +## Env -The filter is direction-agnostic, so `asset_a->asset_b` also matches `asset_b->asset_a`. - -### Start the dummy consumer - -Use the package script: - -```bash -npm run dummy-consumer -``` - -Or run the app directly: - -```bash -node src/apps/dummy-consumer.mjs -``` - -The dummy consumer subscribes to `norm.swap_demand`, logs the observed pair and quote id, and stands in for a future decision engine. - -## Scripts - -- `npm run near-intents:ingest` — start the websocket ingest and publish to Kafka/Redpanda topics -- `npm run dummy-consumer` — consume normalized demand events -- `npm start` — legacy wrapper that forwards into the ingest app - -## Notes - -- This repo is now bus-first: venue intake and downstream reaction are decoupled through Kafka-compatible topics. 
-- `index.mjs` remains only as a compatibility launch wrapper; operational docs should prefer `src/apps/*` entrypoints and npm scripts. -- Older single-file, Python, or TUI-only runtime instructions are obsolete for this repository state. +```env +NEAR_INTENTS_API_KEY=your_solver_jwt +NEAR_INTENTS_WS_URL=wss://solver-relay-v2.chaindefuser.com/ws +KAFKA_BROKERS=redpanda:9092 +KAFKA_CLIENT_ID=unrip +KAFKA_TOPIC_RAW_NEAR_INTENTS_QUOTE=raw.near_intents.quote +KAFKA_TOPIC_NORM_SWAP_DEMAND=norm.swap_demand +KAFKA_TOPIC_CMD_EXECUTE_TRADE=cmd.execute_trade +KAFKA_TOPIC_EXEC_TRADE_RESULT=exec.trade_result +KAFKA_CONSUMER_GROUP_DUMMY=dummy-reactor-v1 +KAFKA_CONSUMER_GROUP_EXECUTOR=dummy-executor-v1 +EXECUTOR_STATE_DIR=/var/lib/unrip/executor-state +``` \ No newline at end of file diff --git a/compose.yml b/compose.yml new file mode 100644 index 0000000..62809db --- /dev/null +++ b/compose.yml @@ -0,0 +1,81 @@ +# Local/dev runtime reference. Hetzner production bootstrap now starts from Terraform + cloud-init + k3s. +services: + redpanda: + image: docker.redpanda.com/redpandadata/redpanda:v24.3.9 + command: + - redpanda + - start + - --overprovisioned + - --smp + - "1" + - --memory + - "1G" + - --reserve-memory + - "0M" + - --node-id + - "0" + - --check=false + - --kafka-addr + - internal://0.0.0.0:9092,external://0.0.0.0:19092 + - --advertise-kafka-addr + - internal://redpanda:9092,external://127.0.0.1:19092 + - --pandaproxy-addr + - internal://0.0.0.0:8082 + - --advertise-pandaproxy-addr + - internal://redpanda:8082 + ports: + - "127.0.0.1:19092:19092" + volumes: + - redpanda-data:/var/lib/redpanda/data + healthcheck: + test: ["CMD-SHELL", "rpk cluster health | grep -q 'Healthy: *true'"] + interval: 10s + timeout: 5s + retries: 10 + start_period: 20s + + near-intents-ingest: + build: . + command: ["node", "src/apps/near-intents-ingest.mjs"] + env_file: + - .env + depends_on: + redpanda: + condition: service_healthy + restart: unless-stopped + + dummy-reactor: + build: . 
+ command: ["node", "src/apps/dummy-reactor.mjs"] + env_file: + - .env + depends_on: + redpanda: + condition: service_healthy + restart: unless-stopped + + dummy-executor: + build: . + command: ["node", "src/apps/dummy-executor.mjs"] + env_file: + - .env + depends_on: + redpanda: + condition: service_healthy + restart: unless-stopped + volumes: + - executor-state:/var/lib/unrip/executor-state + + dummy-consumer: + build: . + command: ["node", "src/apps/dummy-consumer.mjs"] + env_file: + - .env + depends_on: + redpanda: + condition: service_healthy + restart: unless-stopped + +volumes: + redpanda-data: + executor-state: diff --git a/deploy/hetzner/README.md b/deploy/hetzner/README.md new file mode 100644 index 0000000..381321d --- /dev/null +++ b/deploy/hetzner/README.md @@ -0,0 +1,275 @@ +# Hetzner single-node bootstrap (Terraform + cloud-init + k3s) + +This is the canonical first-production deployment path for the repo. + +A local operator workstation drives the first deployment end to end: +- Terraform provisions Hetzner infrastructure +- cloud-init installs k3s automatically on first boot +- the workstation waits for the public Kubernetes API +- the workstation creates initial Kubernetes Secrets +- the workstation applies repo-managed Kubernetes manifests +- the workstation performs the first image/bootstrap delivery attempt +- once Forgejo + runner are alive, routine app deploys are intended to move to self-hosted CI + +Compose remains available for local development, but it is not the primary production deployment model. 
+ +## Scope of this layer + +The foundation under `infra/terraform/hetzner` provisions: +- one Hetzner Cloud server +- one SSH key resource based on your local public key +- firewall rules for SSH, Kubernetes API, and HTTP/HTTPS ingress +- a private network attachment for future growth +- cloud-init user-data for unattended k3s installation and host preparation + +The repo bootstrap then applies the Hetzner single-node overlay under `deploy/k8s/overlays/hetzner-single-node`, which composes Kubernetes resources under `deploy/k8s/` for: +- shared platform namespaces and services +- Redpanda +- unrip workloads +- Forgejo +- Forgejo runner +- private registry +- ingress/TLS-related resources +- Redpanda topic bootstrap job + +## Prerequisites + +Install on the operator workstation: +- Terraform `>= 1.6` +- `kubectl` +- `docker` +- `curl` + +You also need: +- a Hetzner Cloud API token +- an SSH keypair already present locally +- access to DNS for your chosen domains +- admin CIDRs that can reach the future server on `22/tcp` and `6443/tcp` +- this repo checked out locally + +## Required bootstrap secrets and inputs + +Prepare the operator env file: + +```bash +cp scripts/hetzner/bootstrap-secrets.env.example scripts/hetzner/bootstrap-secrets.env +${EDITOR:-vi} scripts/hetzner/bootstrap-secrets.env +``` + +Set at least: +- `HCLOUD_TOKEN` +- `SSH_PUBLIC_KEY_PATH` +- `TF_ADMIN_CIDR_BLOCKS` +- `BASE_DOMAIN` +- `FORGEJO_DOMAIN` +- `FORGEJO_ROOT_URL` +- `NEAR_INTENTS_API_KEY` +- `FORGEJO_RUNNER_REGISTRATION_TOKEN` + +Load it into the current shell: + +```bash +source scripts/hetzner/bootstrap-secrets.env +``` + +## Canonical bootstrap sequence + +Run from repo root: + +```bash +bash scripts/hetzner/bootstrap.sh +``` + +Current behavior of the script: +1. validates local tooling +2. runs `terraform init` and `terraform apply` in `infra/terraform/hetzner` +3. reads Terraform outputs such as server IP and `k3s_api_url` +4. waits for the k3s API readiness endpoint +5. 
writes a local workstation kubeconfig to `.state/hetzner/kubeconfig.yaml` +6. writes overlay secret env input files and creates: + - `unrip/unrip-secrets` + - `unrip/unrip-registry-creds` + - `forgejo/forgejo-secrets` + - `registry/registry-secrets` +7. applies `deploy/k8s/platform/base/namespace.yaml` and `deploy/k8s/overlays/hetzner-single-node` +8. builds the repo bootstrap image locally +9. pushes it through the temporary local registry bridge using the active project name +10. updates and waits for rollout status in the active project namespace + +After the script finishes: + +```bash +export KUBECONFIG=$PWD/.state/hetzner/kubeconfig.yaml +kubectl get nodes -o wide +kubectl get pods -A +kubectl -n unrip get deploy,pods,jobs +kubectl -n forgejo get deploy,pods,svc +kubectl -n registry get pods,svc +``` + +## Current manifest target + +Important current-state detail: +- `scripts/hetzner/bootstrap.sh` now applies `deploy/k8s/platform/base/namespace.yaml` +- it then applies `deploy/k8s/overlays/hetzner-single-node` +- bootstrap naming no longer assumes legacy `trading-system` kubeconfig contexts, image tags, or rollout namespaces + +## Executor persistence in k3s + +The dummy executor persists durable idempotency state. + +Current persistence model: +- application path: `EXECUTOR_STATE_DIR=/var/lib/unrip/executor-state` +- cloud-init prepares the host boundary for executor storage on first boot +- Kubernetes mounts storage at that same path for the executor workload +- the Hetzner single-node overlay pins PVC-backed storage to k3s `local-path` + +Operational consequence: +- executor duplicate-suppression state lives on node-backed persistent storage +- replacing the node or deleting the PVC without migration loses that history +- treat executor state as required operational data, even though the executor is still a dummy implementation + +## Failure recovery runbook + +### A. 
Bootstrap fails before infrastructure exists +Typical causes: +- invalid `HCLOUD_TOKEN` +- wrong `SSH_PUBLIC_KEY_PATH` +- malformed `TF_ADMIN_CIDR_BLOCKS` + +Fix the input and rerun: + +```bash +source scripts/hetzner/bootstrap-secrets.env +bash scripts/hetzner/bootstrap.sh +``` + +If you need to destroy partially created infrastructure: + +```bash +source scripts/hetzner/bootstrap-secrets.env +bash scripts/hetzner/destroy.sh +``` + +### B. Terraform succeeds but cluster access is not usable +Verify the generated kubeconfig and cluster health: + +```bash +export KUBECONFIG=$PWD/.state/hetzner/kubeconfig.yaml +kubectl get nodes -o wide +kubectl get pods -A +kubectl get events -A --sort-by=.lastTimestamp | tail -n 50 +``` + +What to suspect first: +- cloud-init still running +- k3s still starting +- bootstrap kubeconfig/auth not fully aligned yet +- public API reachable, but workloads not yet healthy + +### C. Secrets were wrong or missing +The current bootstrap depends on: +- `${PROJECT_NAME:-unrip}-secrets` + - `NEAR_INTENTS_API_KEY` +- `forgejo-secrets` + - `root_url` + - `domain` + - `runner_registration_token` + +Verify: + +```bash +kubectl -n unrip get secret unrip-secrets +kubectl -n unrip get secret unrip-registry-creds +kubectl -n forgejo get secret forgejo-secrets +kubectl -n registry get secret registry-secrets +``` + +If needed, recreate them from the workstation before restarting the affected deployments. + +### D. 
Workloads are present but not healthy +Inspect by namespace: + +```bash +kubectl -n unrip get pods +kubectl -n unrip describe pod +kubectl -n unrip logs deploy/dummy-executor --tail=100 +kubectl -n forgejo logs deploy/forgejo --tail=100 +kubectl -n forgejo logs deploy/forgejo-runner --tail=100 +``` + +Useful rollout checks: + +```bash +kubectl -n unrip rollout status deployment/near-intents-ingest --timeout=300s +kubectl -n unrip rollout status deployment/dummy-reactor --timeout=300s +kubectl -n unrip rollout status deployment/dummy-executor --timeout=300s +kubectl -n unrip rollout status deployment/dummy-consumer --timeout=300s +kubectl -n forgejo rollout status deployment/forgejo --timeout=300s +kubectl -n forgejo rollout status deployment/forgejo-runner --timeout=300s +``` + +### E. Need to inspect Terraform outputs directly + +```bash +cd infra/terraform/hetzner +terraform output +terraform output server_ipv4 +terraform output server_private_ipv4 +terraform output k3s_api_url +terraform output kubeconfig_strategy +``` + +## Self-hosted CI handoff + +After the cluster is reachable and workloads are up: +1. reach Forgejo at the configured domain or by port-forward +2. perform the initial admin/bootstrap steps in Forgejo +3. create the target repository in Forgejo +4. push or mirror this repo into that Forgejo instance +5. confirm the runner is registered and healthy +6. 
move routine application deploys to the self-hosted pipeline, which now derives image naming and rollout targets from Forgejo repository variables instead of hard-coding the legacy project + +Current repo-state caveats already known: +- first bootstrap is repo-driven from the workstation +- the bootstrap path no longer relies on SSH/scp transport in control flow +- the kubeconfig/auth result is not yet fully production-hardened +- first rollout still uses a temporary local registry bridge; routine CI deploys are intended to be registry-native and the Forgejo workflow now defaults to `unrip` while allowing per-repo overrides for image name, namespace, and deployment list +- Forgejo admin creation, repo creation, and Actions configuration still require operator action after cluster bring-up +- DNS automation is currently wired for Cloudflare or Porkbun when the matching credentials are supplied during bootstrap +- TLS is expected to come from cert-manager + Let's Encrypt once ingress hostnames resolve publicly + +## Terraform-only usage + +If you only want the infra layer: + +```bash +cd infra/terraform/hetzner +export TF_VAR_hcloud_token="your_hcloud_api_token" +export TF_VAR_ssh_public_key="$(cat ~/.ssh/id_ed25519.pub)" +export TF_VAR_admin_cidr_blocks='["203.0.113.10/32"]' + +terraform init +terraform apply +``` + +Useful outputs: +- `server_ipv4` +- `server_private_ipv4` +- `server_name` +- `server_fqdn` +- `k3s_api_url` +- `kubeconfig_strategy` + +For CI/CD details, also see: +- `docs/hetzner-k3s-bootstrap.md` +- `docs/hetzner-self-hosted-ci-runbook.md` + +## Compose status + +Compose is still useful for: +- local development +- fast topology debugging +- non-production single-machine testing + +But it should be treated as optional/dev runtime support, not as the primary production deployment path. 
diff --git a/deploy/hetzner/cloud-init.k3s-first-node.yaml b/deploy/hetzner/cloud-init.k3s-first-node.yaml new file mode 100644 index 0000000..fbb8f9b --- /dev/null +++ b/deploy/hetzner/cloud-init.k3s-first-node.yaml @@ -0,0 +1,115 @@ +#cloud-config +package_update: true +package_upgrade: true +packages: + - ca-certificates + - curl + - git + - gnupg + - jq + - nfs-common + - open-iscsi + - apt-transport-https + - software-properties-common + - unattended-upgrades + - ufw + +write_files: + - path: /etc/sysctl.d/90-k3s-single-node.conf + permissions: '0644' + owner: root:root + content: | + vm.max_map_count=1048575 + fs.inotify.max_user_instances=8192 + fs.inotify.max_user_watches=1048576 + fs.file-max=1048576 + net.core.somaxconn=65535 + net.ipv4.ip_local_port_range=1024 65535 + net.ipv4.tcp_tw_reuse=1 + - path: /etc/rancher/k3s/config.yaml + permissions: '0600' + owner: root:root + content: | + write-kubeconfig-mode: "0640" + kube-apiserver-arg: + - anonymous-auth=false + protect-kernel-defaults: true + disable: + - traefik + - path: /usr/local/bin/post-k3s-bootstrap.sh + permissions: '0755' + owner: root:root + content: | + #!/usr/bin/env bash + set -euo pipefail + + install -d -m 0755 /var/lib/redpanda/data + install -d -m 0755 /var/lib/unrip/executor-state + chown root:root /var/lib/redpanda/data /var/lib/unrip/executor-state + + systemctl enable --now iscsid || true + + export KUBECONFIG=/etc/rancher/k3s/k3s.yaml + curl -fsSL https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash + + install -d -m 0755 /usr/local/share/unrip + cat >/usr/local/share/unrip/bootstrap-metadata.env <<'EOF' + BOOTSTRAP_MODE=k3s-single-node + BOOTSTRAP_PROJECT_NAME=unrip + BOOTSTRAP_PROJECT_NAMESPACE=unrip + K3S_KUBECONFIG=/opt/bootstrap/kubeconfig-internal.yaml + BOOTSTRAP_REPO_DIR=/opt/unrip/repo + BOOTSTRAP_MANIFEST_DIR=/opt/unrip/repo/deploy/k8s + GITOPS_HANDOFF=seed-self-hosted-git-and-runner + EOF + chmod 0644 /usr/local/share/unrip/bootstrap-metadata.env + 
+ install -d -m 0755 /opt/unrip + if [ ! -d /opt/unrip/repo/.git ]; then + git clone --depth 1 ${BOOTSTRAP_REPO_URL:-https://example.invalid/bootstrap-repo.git} /opt/unrip/repo || true + fi + + install -d -m 0755 /opt/bootstrap + cp /etc/rancher/k3s/k3s.yaml /opt/bootstrap/kubeconfig-internal.yaml + chmod 0640 /opt/bootstrap/kubeconfig-internal.yaml + chgrp k3s-readers /opt/bootstrap/kubeconfig-internal.yaml + sed -i 's/127.0.0.1/{{PRIVATE_IPV4}}/' /opt/bootstrap/kubeconfig-internal.yaml + + cat >/opt/bootstrap/README.txt <<'EOF' + This node was provisioned by Terraform + cloud-init. + Use /opt/bootstrap/kubeconfig-internal.yaml for automation. + Bootstrap metadata lives at /usr/local/share/unrip/bootstrap-metadata.env. + Future Kubernetes bootstrap assets should live under /opt/unrip/repo/deploy/k8s. + EOF + chmod 0644 /opt/bootstrap/README.txt + + if command -v kubectl >/dev/null 2>&1; then + kubectl get nodes -o wide >/opt/bootstrap/kubectl-get-nodes.txt + fi + + if id ubuntu >/dev/null 2>&1; then + usermod -aG k3s-readers ubuntu || true + install -d -o ubuntu -g ubuntu -m 0700 /home/ubuntu/.kube + cp /etc/rancher/k3s/k3s.yaml /home/ubuntu/.kube/config + chown ubuntu:ubuntu /home/ubuntu/.kube/config + chmod 0600 /home/ubuntu/.kube/config + sed -i 's/127.0.0.1/{{PRIVATE_IPV4}}/' /home/ubuntu/.kube/config + fi + +runcmd: + - sysctl --system + - systemctl enable unattended-upgrades + - systemctl enable --now ufw + - ufw default deny incoming + - ufw default allow outgoing + - ufw allow 22/tcp + - ufw allow 6443/tcp + - ufw allow 80/tcp + - ufw allow 443/tcp + - groupadd --system k3s-readers || true + - mkdir -p /etc/rancher/k3s + - curl -sfL https://get.k3s.io | INSTALL_K3S_EXEC='server --cluster-init --tls-san {{PUBLIC_IPV4}} --node-ip {{PRIVATE_IPV4}} --advertise-address {{PRIVATE_IPV4}} --flannel-backend vxlan --disable servicelb' sh - + - cloud-init status --wait + - /usr/local/bin/post-k3s-bootstrap.sh + +final_message: "cloud-init finished: k3s first node 
bootstrapped" diff --git a/deploy/k8s/README.md b/deploy/k8s/README.md new file mode 100644 index 0000000..6cb6803 --- /dev/null +++ b/deploy/k8s/README.md @@ -0,0 +1,42 @@ +# Kubernetes bootstrap assets + +This directory is the repo-driven deployment target for the single-node Hetzner+k3s bootstrap. + +## Layout +- `base/` — shared bootstrap manifests plus the current `unrip` project manifests +- `projects/` — conventions for hosting multiple isolated projects on the same cluster +- `overlays/hetzner-single-node/` — first-node overlay with concrete hostnames, local-path storage, and generated secret references +- `secrets/` — examples and instructions for supplying required secrets out-of-band + +## Shared cluster model +Shared platform namespaces: +- `forgejo` +- `registry` +- `ingress-nginx` +- `cert-manager` + +Project-specific namespaces: +- `unrip` +- future projects should get their own namespace instead of sharing `unrip` + +## Apply flow +After Terraform/cloud-init has produced a working kubeconfig, the canonical path is: + +```bash +bash scripts/hetzner/bootstrap.sh +``` + +That script renders the Hetzner overlay inputs, creates platform and project registry auth secrets using the active project naming, and applies: + +```bash +kubectl apply -k deploy/k8s/overlays/hetzner-single-node +``` + +## Secret management +The overlay intentionally references generated or pre-created Secrets instead of committing credentials: +- `unrip/unrip-secrets` +- `unrip/unrip-registry-creds` +- `forgejo/forgejo-secrets` +- `registry/registry-secrets` + +The bootstrap script creates them from local environment variables. By default it targets the `unrip` project, but its kubeconfig context name, bootstrap image tag, project secret env filename, project namespace, and project registry secret name are derived from `PROJECT_NAME`, `PROJECT_NAMESPACE`, and `CLUSTER_NAME` instead of hard-coding legacy `trading-system` values. 
diff --git a/deploy/k8s/base/kustomization.yaml b/deploy/k8s/base/kustomization.yaml new file mode 100644 index 0000000..28193ff --- /dev/null +++ b/deploy/k8s/base/kustomization.yaml @@ -0,0 +1,5 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +resources: + - ../platform/base + - ../projects/unrip/base diff --git a/deploy/k8s/overlays/hetzner-single-node/README.md b/deploy/k8s/overlays/hetzner-single-node/README.md new file mode 100644 index 0000000..b50ee63 --- /dev/null +++ b/deploy/k8s/overlays/hetzner-single-node/README.md @@ -0,0 +1,35 @@ +# Hetzner single-node overlay + +This overlay turns the shared platform and `unrip` project bases into a concrete first-node bootstrap target for the Terraform-provisioned k3s VM. + +## Before apply +Create real secret material from the examples: + +```bash +cp deploy/k8s/overlays/hetzner-single-node/secrets/unrip.env.example deploy/k8s/overlays/hetzner-single-node/secrets/unrip.env +cp deploy/k8s/overlays/hetzner-single-node/secrets/forgejo.env.example deploy/k8s/overlays/hetzner-single-node/secrets/forgejo.env +cp deploy/k8s/overlays/hetzner-single-node/secrets/registry.htpasswd.example deploy/k8s/overlays/hetzner-single-node/secrets/registry.htpasswd +``` + +Update: +- ingress hosts in `ingress-hosts.patch.yaml` +- ACME email in `issuer-email.patch.yaml` +- project secret values in `secrets/unrip.env` +- Forgejo secret values in `secrets/forgejo.env` +- registry htpasswd in `secrets/registry.htpasswd` + +## Apply +```bash +kubectl apply -k deploy/k8s/overlays/hetzner-single-node +``` + +## What gets installed +- shared platform namespaces for registry, ingress, cert-manager, and Forgejo +- project namespace `unrip` +- Redpanda plus a topic bootstrap job inside `unrip` +- app worker deployments referencing `unrip-secrets` +- Forgejo and Forgejo runner referencing `forgejo-secrets` +- private registry protected by htpasswd from `registry-secrets` +- nginx ingress and ACME issuers for TLS + +For future 
projects, do not reuse `unrip`; create a new project namespace and matching `<project>-config`, `<project>-secrets`, and `<project>-registry-creds` resources. diff --git a/deploy/k8s/overlays/hetzner-single-node/ingress-hosts.patch.yaml b/deploy/k8s/overlays/hetzner-single-node/ingress-hosts.patch.yaml new file mode 100644 index 0000000..a9d3999 --- /dev/null +++ b/deploy/k8s/overlays/hetzner-single-node/ingress-hosts.patch.yaml @@ -0,0 +1,43 @@ +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: forgejo + namespace: forgejo +spec: + tls: + - hosts: + - git.doran.133011.xyz + secretName: forgejo-tls + rules: + - host: git.doran.133011.xyz + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: forgejo + port: + number: 3000 +--- +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: registry + namespace: registry +spec: + tls: + - hosts: + - registry.doran.133011.xyz + secretName: registry-tls + rules: + - host: registry.doran.133011.xyz + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: registry + port: + number: 5000 diff --git a/deploy/k8s/overlays/hetzner-single-node/issuer-email.patch.yaml b/deploy/k8s/overlays/hetzner-single-node/issuer-email.patch.yaml new file mode 100644 index 0000000..6001807 --- /dev/null +++ b/deploy/k8s/overlays/hetzner-single-node/issuer-email.patch.yaml @@ -0,0 +1,15 @@ +apiVersion: cert-manager.io/v1 +kind: ClusterIssuer +metadata: + name: letsencrypt-staging +spec: + acme: + email: letsencryptemailfordoran@133011.xyz +--- +apiVersion: cert-manager.io/v1 +kind: ClusterIssuer +metadata: + name: letsencrypt-production +spec: + acme: + email: letsencryptemailfordoran@133011.xyz diff --git a/deploy/k8s/overlays/hetzner-single-node/kustomization.yaml b/deploy/k8s/overlays/hetzner-single-node/kustomization.yaml new file mode 100644 index 0000000..62b8e42 --- /dev/null +++ b/deploy/k8s/overlays/hetzner-single-node/kustomization.yaml @@ -0,0 +1,24 @@ +apiVersion:
kustomize.config.k8s.io/v1beta1 +kind: Kustomization +resources: + - ../../platform/base + - ../../projects/unrip/base +patches: + - path: ingress-hosts.patch.yaml + - path: issuer-email.patch.yaml + - path: storage-class.patch.yaml +secretGenerator: + - name: unrip-secrets + namespace: unrip + envs: + - secrets/unrip.env + - name: forgejo-secrets + namespace: forgejo + envs: + - secrets/forgejo.env + - name: registry-secrets + namespace: registry + files: + - htpasswd=secrets/registry.htpasswd +generatorOptions: + disableNameSuffixHash: true diff --git a/deploy/k8s/overlays/hetzner-single-node/secrets/forgejo.env.example b/deploy/k8s/overlays/hetzner-single-node/secrets/forgejo.env.example new file mode 100644 index 0000000..cf698eb --- /dev/null +++ b/deploy/k8s/overlays/hetzner-single-node/secrets/forgejo.env.example @@ -0,0 +1,3 @@ +root_url=https://git.unrip-bootstrap.example.com/ +domain=git.unrip-bootstrap.example.com +runner_registration_token=replace-me diff --git a/deploy/k8s/overlays/hetzner-single-node/secrets/registry.htpasswd.example b/deploy/k8s/overlays/hetzner-single-node/secrets/registry.htpasswd.example new file mode 100644 index 0000000..e605f80 --- /dev/null +++ b/deploy/k8s/overlays/hetzner-single-node/secrets/registry.htpasswd.example @@ -0,0 +1 @@ +bootstrap:$2y$05$replace-with-bcrypt-htpasswd diff --git a/deploy/k8s/overlays/hetzner-single-node/secrets/unrip.env.example b/deploy/k8s/overlays/hetzner-single-node/secrets/unrip.env.example new file mode 100644 index 0000000..3185cdd --- /dev/null +++ b/deploy/k8s/overlays/hetzner-single-node/secrets/unrip.env.example @@ -0,0 +1 @@ +NEAR_INTENTS_API_KEY=replace-me diff --git a/deploy/k8s/overlays/hetzner-single-node/storage-class.patch.yaml b/deploy/k8s/overlays/hetzner-single-node/storage-class.patch.yaml new file mode 100644 index 0000000..bff2297 --- /dev/null +++ b/deploy/k8s/overlays/hetzner-single-node/storage-class.patch.yaml @@ -0,0 +1,31 @@ +apiVersion: v1 +kind: PersistentVolumeClaim 
+metadata: + name: redpanda-data + namespace: unrip +spec: + storageClassName: local-path +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: executor-state + namespace: unrip +spec: + storageClassName: local-path +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: forgejo-data + namespace: forgejo +spec: + storageClassName: local-path +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: registry-data + namespace: registry +spec: + storageClassName: local-path diff --git a/deploy/k8s/platform/base/cert-manager.yaml b/deploy/k8s/platform/base/cert-manager.yaml new file mode 100644 index 0000000..a0a7eb8 --- /dev/null +++ b/deploy/k8s/platform/base/cert-manager.yaml @@ -0,0 +1,56 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: cert-manager + namespace: cert-manager +spec: + replicas: 1 + selector: + matchLabels: + app: cert-manager + template: + metadata: + labels: + app: cert-manager + spec: + containers: + - name: cert-manager + image: quay.io/jetstack/cert-manager-controller:v1.17.1 +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: cert-manager-webhook + namespace: cert-manager +spec: + replicas: 1 + selector: + matchLabels: + app: cert-manager-webhook + template: + metadata: + labels: + app: cert-manager-webhook + spec: + containers: + - name: webhook + image: quay.io/jetstack/cert-manager-webhook:v1.17.1 +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: cert-manager-cainjector + namespace: cert-manager +spec: + replicas: 1 + selector: + matchLabels: + app: cert-manager-cainjector + template: + metadata: + labels: + app: cert-manager-cainjector + spec: + containers: + - name: cainjector + image: quay.io/jetstack/cert-manager-cainjector:v1.17.1 diff --git a/deploy/k8s/platform/base/cluster-issuers.yaml b/deploy/k8s/platform/base/cluster-issuers.yaml new file mode 100644 index 0000000..54ddeb2 --- /dev/null +++ b/deploy/k8s/platform/base/cluster-issuers.yaml @@ -0,0 +1,29 @@ 
+apiVersion: cert-manager.io/v1 +kind: ClusterIssuer +metadata: + name: letsencrypt-staging +spec: + acme: + email: ops@example.invalid + server: https://acme-staging-v02.api.letsencrypt.org/directory + privateKeySecretRef: + name: letsencrypt-staging-account-key + solvers: + - http01: + ingress: + ingressClassName: traefik +--- +apiVersion: cert-manager.io/v1 +kind: ClusterIssuer +metadata: + name: letsencrypt-production +spec: + acme: + email: ops@example.invalid + server: https://acme-v02.api.letsencrypt.org/directory + privateKeySecretRef: + name: letsencrypt-production-account-key + solvers: + - http01: + ingress: + ingressClassName: traefik diff --git a/deploy/k8s/platform/base/coredns-public-dns.patch.yaml b/deploy/k8s/platform/base/coredns-public-dns.patch.yaml new file mode 100644 index 0000000..8cf2129 --- /dev/null +++ b/deploy/k8s/platform/base/coredns-public-dns.patch.yaml @@ -0,0 +1,29 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: coredns + namespace: kube-system +data: + Corefile: | + .:53 { + errors + health + ready + kubernetes cluster.local in-addr.arpa ip6.arpa { + pods insecure + fallthrough in-addr.arpa ip6.arpa + } + hosts /etc/coredns/NodeHosts { + ttl 60 + reload 15s + fallthrough + } + prometheus :9153 + cache 30 + loop + reload + loadbalance + import /etc/coredns/custom/*.override + forward . 
1.1.1.1 1.0.0.1 8.8.8.8 8.8.4.4 + } + import /etc/coredns/custom/*.server diff --git a/deploy/k8s/platform/base/coredns.yaml b/deploy/k8s/platform/base/coredns.yaml new file mode 100644 index 0000000..05be7fb --- /dev/null +++ b/deploy/k8s/platform/base/coredns.yaml @@ -0,0 +1,31 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: coredns + namespace: kube-system +data: + Corefile: | + .:53 { + errors + health + ready + kubernetes cluster.local in-addr.arpa ip6.arpa { + pods insecure + fallthrough in-addr.arpa ip6.arpa + } + hosts /etc/coredns/NodeHosts { + ttl 60 + reload 15s + fallthrough + } + prometheus :9153 + cache 30 + loop + reload + loadbalance + import /etc/coredns/custom/*.override + forward . 1.1.1.1 1.0.0.1 8.8.8.8 8.8.4.4 + } + import /etc/coredns/custom/*.server + NodeHosts: | + 10.30.1.10 unrip-1 diff --git a/deploy/k8s/platform/base/forgejo-rbac.yaml b/deploy/k8s/platform/base/forgejo-rbac.yaml new file mode 100644 index 0000000..dbe6068 --- /dev/null +++ b/deploy/k8s/platform/base/forgejo-rbac.yaml @@ -0,0 +1,30 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: forgejo-runner + namespace: forgejo +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: forgejo-runner-deployer +rules: + - apiGroups: ["apps"] + resources: ["deployments"] + verbs: ["get", "list", "watch", "patch", "update"] + - apiGroups: [""] + resources: ["pods", "pods/log", "services", "configmaps", "secrets"] + verbs: ["get", "list", "watch"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: forgejo-runner-deployer +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: forgejo-runner-deployer +subjects: + - kind: ServiceAccount + name: forgejo-runner + namespace: forgejo diff --git a/deploy/k8s/platform/base/forgejo-runner.yaml b/deploy/k8s/platform/base/forgejo-runner.yaml new file mode 100644 index 0000000..a4d9db2 --- /dev/null +++ 
b/deploy/k8s/platform/base/forgejo-runner.yaml @@ -0,0 +1,47 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: forgejo-runner + namespace: forgejo +spec: + replicas: 1 + selector: + matchLabels: + app: forgejo-runner + template: + metadata: + labels: + app: forgejo-runner + spec: + serviceAccountName: forgejo-runner + restartPolicy: Always + containers: + - name: runner + image: code.forgejo.org/forgejo/runner:6.3.1 + securityContext: + runAsUser: 0 + runAsGroup: 0 + env: + - name: FORGEJO_INSTANCE_URL + valueFrom: + secretKeyRef: + name: forgejo-secrets + key: root_url + - name: FORGEJO_RUNNER_REGISTRATION_TOKEN + valueFrom: + secretKeyRef: + name: forgejo-secrets + key: runner_registration_token + command: ["/bin/sh", "-lc"] + args: + - >- + if [ ! -f /data/.runner ]; then + forgejo-runner register --no-interactive --name k3s-runner --instance "$FORGEJO_INSTANCE_URL" --token "$FORGEJO_RUNNER_REGISTRATION_TOKEN" --labels "linux-amd64:host"; + fi && + forgejo-runner daemon --config /data/.runner + volumeMounts: + - name: runner-data + mountPath: /data + volumes: + - name: runner-data + emptyDir: {} diff --git a/deploy/k8s/platform/base/forgejo.yaml b/deploy/k8s/platform/base/forgejo.yaml new file mode 100644 index 0000000..81171cc --- /dev/null +++ b/deploy/k8s/platform/base/forgejo.yaml @@ -0,0 +1,76 @@ +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: forgejo-data + namespace: forgejo +spec: + accessModes: ["ReadWriteOnce"] + resources: + requests: + storage: 20Gi +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: forgejo + namespace: forgejo +spec: + replicas: 1 + selector: + matchLabels: + app: forgejo + template: + metadata: + labels: + app: forgejo + spec: + containers: + - name: forgejo + image: codeberg.org/forgejo/forgejo:10 + env: + - name: USER_UID + value: "1000" + - name: USER_GID + value: "1000" + - name: FORGEJO__server__ROOT_URL + valueFrom: + secretKeyRef: + name: forgejo-secrets + key: root_url + - name: 
FORGEJO__server__DOMAIN + valueFrom: + secretKeyRef: + name: forgejo-secrets + key: domain + - name: FORGEJO__security__INSTALL_LOCK + value: "true" + - name: FORGEJO__service__DISABLE_REGISTRATION + value: "true" + ports: + - name: http + containerPort: 3000 + - name: ssh + containerPort: 22 + volumeMounts: + - name: data + mountPath: /data + volumes: + - name: data + persistentVolumeClaim: + claimName: forgejo-data +--- +apiVersion: v1 +kind: Service +metadata: + name: forgejo + namespace: forgejo +spec: + selector: + app: forgejo + ports: + - name: http + port: 3000 + targetPort: 3000 + - name: ssh + port: 22 + targetPort: 22 diff --git a/deploy/k8s/platform/base/ingress-nginx.yaml b/deploy/k8s/platform/base/ingress-nginx.yaml new file mode 100644 index 0000000..51bd042 --- /dev/null +++ b/deploy/k8s/platform/base/ingress-nginx.yaml @@ -0,0 +1,73 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: ingress-nginx-controller + namespace: ingress-nginx +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: ingress-nginx + app.kubernetes.io/component: controller + template: + metadata: + labels: + app.kubernetes.io/name: ingress-nginx + app.kubernetes.io/component: controller + spec: + serviceAccountName: default + containers: + - name: controller + image: registry.k8s.io/ingress-nginx/controller:v1.12.1 + args: + - /nginx-ingress-controller + - --ingress-class=nginx + - --controller-class=k8s.io/ingress-nginx + - --publish-service=$(POD_NAMESPACE)/ingress-nginx-controller + - --election-id=ingress-nginx-leader + - --enable-ssl-passthrough + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + ports: + - name: http + containerPort: 80 + - name: https + containerPort: 443 + securityContext: + allowPrivilegeEscalation: true + capabilities: + add: ["NET_BIND_SERVICE"] + drop: ["ALL"] + readinessProbe: + httpGet: + path: /healthz + port: 
10254 + livenessProbe: + httpGet: + path: /healthz + port: 10254 +--- +apiVersion: v1 +kind: Service +metadata: + name: ingress-nginx-controller + namespace: ingress-nginx +spec: + type: LoadBalancer + selector: + app.kubernetes.io/name: ingress-nginx + app.kubernetes.io/component: controller + ports: + - name: http + port: 80 + targetPort: 80 + - name: https + port: 443 + targetPort: 443 diff --git a/deploy/k8s/platform/base/ingress.yaml b/deploy/k8s/platform/base/ingress.yaml new file mode 100644 index 0000000..5d5158f --- /dev/null +++ b/deploy/k8s/platform/base/ingress.yaml @@ -0,0 +1,49 @@ +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: forgejo + namespace: forgejo + annotations: + cert-manager.io/cluster-issuer: letsencrypt-production +spec: + ingressClassName: traefik + tls: + - hosts: + - git.example.invalid + secretName: forgejo-tls + rules: + - host: git.example.invalid + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: forgejo + port: + number: 3000 +--- +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: registry + namespace: registry + annotations: + cert-manager.io/cluster-issuer: letsencrypt-production +spec: + ingressClassName: traefik + tls: + - hosts: + - registry.example.invalid + secretName: registry-tls + rules: + - host: registry.example.invalid + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: registry + port: + number: 5000 diff --git a/deploy/k8s/platform/base/kustomization.yaml b/deploy/k8s/platform/base/kustomization.yaml new file mode 100644 index 0000000..55be84f --- /dev/null +++ b/deploy/k8s/platform/base/kustomization.yaml @@ -0,0 +1,11 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +resources: + - namespace.yaml + - forgejo.yaml + - forgejo-rbac.yaml + - forgejo-runner.yaml + - registry.yaml + - ingress.yaml + - cluster-issuers.yaml + - coredns.yaml diff --git a/deploy/k8s/platform/base/namespace.yaml 
b/deploy/k8s/platform/base/namespace.yaml new file mode 100644 index 0000000..eb8c814 --- /dev/null +++ b/deploy/k8s/platform/base/namespace.yaml @@ -0,0 +1,35 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: unrip + labels: + app.kubernetes.io/part-of: unrip + project.pi.io/type: project +--- +apiVersion: v1 +kind: Namespace +metadata: + name: forgejo + labels: + project.pi.io/type: platform +--- +apiVersion: v1 +kind: Namespace +metadata: + name: registry + labels: + project.pi.io/type: platform +--- +apiVersion: v1 +kind: Namespace +metadata: + name: ingress-nginx + labels: + project.pi.io/type: platform +--- +apiVersion: v1 +kind: Namespace +metadata: + name: cert-manager + labels: + project.pi.io/type: platform diff --git a/deploy/k8s/platform/base/registry.yaml b/deploy/k8s/platform/base/registry.yaml new file mode 100644 index 0000000..d408825 --- /dev/null +++ b/deploy/k8s/platform/base/registry.yaml @@ -0,0 +1,68 @@ +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: registry-data + namespace: registry +spec: + accessModes: ["ReadWriteOnce"] + resources: + requests: + storage: 20Gi +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: registry + namespace: registry +spec: + replicas: 1 + selector: + matchLabels: + app: registry + template: + metadata: + labels: + app: registry + spec: + containers: + - name: registry + image: registry:2 + env: + - name: REGISTRY_HTTP_ADDR + value: 0.0.0.0:5000 + - name: REGISTRY_STORAGE_DELETE_ENABLED + value: "true" + - name: REGISTRY_AUTH + value: htpasswd + - name: REGISTRY_AUTH_HTPASSWD_REALM + value: Trading System Registry + - name: REGISTRY_AUTH_HTPASSWD_PATH + value: /auth/htpasswd + ports: + - containerPort: 5000 + volumeMounts: + - name: data + mountPath: /var/lib/registry + - name: auth + mountPath: /auth + readOnly: true + volumes: + - name: data + persistentVolumeClaim: + claimName: registry-data + - name: auth + secret: + secretName: registry-secrets +--- +apiVersion: v1 +kind: 
Service +metadata: + name: registry + namespace: registry +spec: + selector: + app: registry + ports: + - name: http + port: 5000 + targetPort: 5000 diff --git a/deploy/k8s/projects/README.md b/deploy/k8s/projects/README.md new file mode 100644 index 0000000..96690a6 --- /dev/null +++ b/deploy/k8s/projects/README.md @@ -0,0 +1,35 @@ +# Projects on the shared cluster + +This cluster is intended to host multiple independent projects. + +## Pattern +- shared platform namespaces: + - `forgejo` + - `registry` + - `ingress-nginx` + - `cert-manager` +- per-project namespaces: + - `unrip` + - future examples: `project-foo`, `project-bar` + +## How to add another project +For each new project, create a project manifest set similar to `deploy/k8s/projects/unrip/base/unrip.yaml`: +- one namespace +- one project config map +- one secret name unique to the project +- one image pull secret unique to the project +- one executor/data PVC if needed +- deployments/services/ingresses only inside that namespace + +Recommended naming convention: +- namespace: project name, e.g.
`unrip` +- config map: `<project>-config` +- app secrets: `<project>-secrets` +- pull secret: `<project>-registry-creds` +- persistent host path/app state: `/var/lib/<project>/...` +- app image: `registry.<base-domain>/<project>:<tag>` + +## Current project in this repo +- project name: `unrip` +- namespace: `unrip` +- project manifest: `deploy/k8s/projects/unrip/base/unrip.yaml` diff --git a/deploy/k8s/projects/unrip/base/bootstrap-job.yaml b/deploy/k8s/projects/unrip/base/bootstrap-job.yaml new file mode 100644 index 0000000..7d8895b --- /dev/null +++ b/deploy/k8s/projects/unrip/base/bootstrap-job.yaml @@ -0,0 +1,18 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: redpanda-topic-bootstrap + namespace: unrip +spec: + template: + spec: + restartPolicy: OnFailure + containers: + - name: bootstrap-topics + image: docker.redpanda.com/redpandadata/redpanda:v24.3.9 + command: ["/bin/sh", "-lc"] + args: + - >- + rpk topic create raw.near_intents.quote norm.swap_demand cmd.execute_trade exec.trade_result + --brokers redpanda.unrip.svc.cluster.local:9092 + --partitions 1 --replicas 1 || true diff --git a/deploy/k8s/projects/unrip/base/kustomization.yaml b/deploy/k8s/projects/unrip/base/kustomization.yaml new file mode 100644 index 0000000..7f4febd --- /dev/null +++ b/deploy/k8s/projects/unrip/base/kustomization.yaml @@ -0,0 +1,6 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +resources: + - redpanda.yaml + - unrip.yaml + - bootstrap-job.yaml diff --git a/deploy/k8s/projects/unrip/base/redpanda.yaml b/deploy/k8s/projects/unrip/base/redpanda.yaml new file mode 100644 index 0000000..d41c0fd --- /dev/null +++ b/deploy/k8s/projects/unrip/base/redpanda.yaml @@ -0,0 +1,91 @@ +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: redpanda-data + namespace: unrip +spec: + accessModes: ["ReadWriteOnce"] + resources: + requests: + storage: 20Gi +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: redpanda + namespace: unrip +spec: + replicas: 1 + selector: + matchLabels: + app: redpanda + template: + metadata: +
labels: + app: redpanda + app.kubernetes.io/part-of: unrip + spec: + containers: + - name: redpanda + image: docker.redpanda.com/redpandadata/redpanda:v24.3.9 + args: + - redpanda + - start + - --overprovisioned + - --smp + - "1" + - --memory + - "1G" + - --reserve-memory + - "0M" + - --node-id + - "0" + - --check=false + - --set + - redpanda.auto_create_topics_enabled=false + - --kafka-addr + - internal://0.0.0.0:9092 + - --advertise-kafka-addr + - internal://redpanda.unrip.svc.cluster.local:9092 + - --pandaproxy-addr + - internal://0.0.0.0:8082 + - --advertise-pandaproxy-addr + - internal://redpanda.unrip.svc.cluster.local:8082 + ports: + - name: kafka + containerPort: 9092 + - name: proxy + containerPort: 8082 + readinessProbe: + tcpSocket: + port: 9092 + initialDelaySeconds: 10 + periodSeconds: 10 + livenessProbe: + tcpSocket: + port: 9092 + initialDelaySeconds: 30 + periodSeconds: 15 + volumeMounts: + - name: redpanda-data + mountPath: /var/lib/redpanda/data + volumes: + - name: redpanda-data + persistentVolumeClaim: + claimName: redpanda-data +--- +apiVersion: v1 +kind: Service +metadata: + name: redpanda + namespace: unrip +spec: + selector: + app: redpanda + ports: + - name: kafka + port: 9092 + targetPort: 9092 + - name: proxy + port: 8082 + targetPort: 8082 diff --git a/deploy/k8s/projects/unrip/base/unrip.yaml b/deploy/k8s/projects/unrip/base/unrip.yaml new file mode 100644 index 0000000..eeebae5 --- /dev/null +++ b/deploy/k8s/projects/unrip/base/unrip.yaml @@ -0,0 +1,152 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: unrip-config + namespace: unrip +data: + NEAR_INTENTS_WS_URL: wss://solver-relay-v2.chaindefuser.com/ws + KAFKA_BROKERS: redpanda.unrip.svc.cluster.local:9092 + KAFKA_CLIENT_ID: unrip + KAFKA_TOPIC_RAW_NEAR_INTENTS_QUOTE: raw.near_intents.quote + KAFKA_TOPIC_NORM_SWAP_DEMAND: norm.swap_demand + KAFKA_TOPIC_CMD_EXECUTE_TRADE: cmd.execute_trade + KAFKA_TOPIC_EXEC_TRADE_RESULT: exec.trade_result + KAFKA_CONSUMER_GROUP_DUMMY: 
dummy-reactor-v1 + KAFKA_CONSUMER_GROUP_EXECUTOR: dummy-executor-v1 + EXECUTOR_STATE_DIR: /var/lib/unrip/executor-state + PROJECT_NAME: unrip + PROJECT_NAMESPACE: unrip +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: executor-state + namespace: unrip +spec: + accessModes: ["ReadWriteOnce"] + resources: + requests: + storage: 5Gi +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: near-intents-ingest + namespace: unrip +spec: + replicas: 1 + selector: + matchLabels: + app: near-intents-ingest + template: + metadata: + labels: + app: near-intents-ingest + app.kubernetes.io/part-of: unrip + spec: + imagePullSecrets: + - name: unrip-registry-creds + containers: + - name: app + image: ghcr.io/example/unrip:bootstrap + imagePullPolicy: IfNotPresent + command: ["node", "src/apps/near-intents-ingest.mjs"] + envFrom: + - configMapRef: + name: unrip-config + - secretRef: + name: unrip-secrets +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: dummy-reactor + namespace: unrip +spec: + replicas: 1 + selector: + matchLabels: + app: dummy-reactor + template: + metadata: + labels: + app: dummy-reactor + app.kubernetes.io/part-of: unrip + spec: + imagePullSecrets: + - name: unrip-registry-creds + containers: + - name: app + image: ghcr.io/example/unrip:bootstrap + imagePullPolicy: IfNotPresent + command: ["node", "src/apps/dummy-reactor.mjs"] + envFrom: + - configMapRef: + name: unrip-config + - secretRef: + name: unrip-secrets +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: dummy-executor + namespace: unrip +spec: + replicas: 1 + selector: + matchLabels: + app: dummy-executor + template: + metadata: + labels: + app: dummy-executor + app.kubernetes.io/part-of: unrip + spec: + imagePullSecrets: + - name: unrip-registry-creds + containers: + - name: app + image: ghcr.io/example/unrip:bootstrap + imagePullPolicy: IfNotPresent + command: ["node", "src/apps/dummy-executor.mjs"] + envFrom: + - configMapRef: + name: unrip-config 
+ - secretRef: + name: unrip-secrets + volumeMounts: + - name: executor-state + mountPath: /var/lib/unrip/executor-state + volumes: + - name: executor-state + persistentVolumeClaim: + claimName: executor-state +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: dummy-consumer + namespace: unrip +spec: + replicas: 1 + selector: + matchLabels: + app: dummy-consumer + template: + metadata: + labels: + app: dummy-consumer + app.kubernetes.io/part-of: unrip + spec: + imagePullSecrets: + - name: unrip-registry-creds + containers: + - name: app + image: ghcr.io/example/unrip:bootstrap + imagePullPolicy: IfNotPresent + command: ["node", "src/apps/dummy-consumer.mjs"] + envFrom: + - configMapRef: + name: unrip-config + - secretRef: + name: unrip-secrets diff --git a/deploy/k8s/secrets/README.md b/deploy/k8s/secrets/README.md new file mode 100644 index 0000000..1805b71 --- /dev/null +++ b/deploy/k8s/secrets/README.md @@ -0,0 +1,29 @@ +# Required Kubernetes secrets + +Base manifests and the Hetzner single-node overlay both expect secrets to be supplied out-of-band. The Hetzner overlay generates `unrip/unrip-secrets`, `forgejo/forgejo-secrets`, and `registry/registry-secrets` from local files. + +## Required secrets +- `unrip/unrip-secrets` + - `NEAR_INTENTS_API_KEY` +- `forgejo/forgejo-secrets` + - `root_url` + - `domain` + - `runner_registration_token` +- `registry/registry-secrets` + - `htpasswd` + +## Overlay-driven generation +The `deploy/k8s/overlays/hetzner-single-node` overlay can generate these from local files via `secretGenerator`. 
+ +Example workflow: + +```bash +cp deploy/k8s/overlays/hetzner-single-node/secrets/unrip.env.example deploy/k8s/overlays/hetzner-single-node/secrets/unrip.env +cp deploy/k8s/overlays/hetzner-single-node/secrets/forgejo.env.example deploy/k8s/overlays/hetzner-single-node/secrets/forgejo.env +cp deploy/k8s/overlays/hetzner-single-node/secrets/registry.htpasswd.example deploy/k8s/overlays/hetzner-single-node/secrets/registry.htpasswd +kubectl apply -k deploy/k8s/overlays/hetzner-single-node +``` + +For future projects, follow the same convention with project-specific secret names in project-specific namespaces. + +Do not commit populated secret files. diff --git a/deploy/redpanda/rpk-topics.txt b/deploy/redpanda/rpk-topics.txt new file mode 100644 index 0000000..2ba6b79 --- /dev/null +++ b/deploy/redpanda/rpk-topics.txt @@ -0,0 +1,4 @@ +raw.near_intents.quote +norm.swap_demand +cmd.execute_trade +exec.trade_result diff --git a/docs/bootstrap-status-report.md b/docs/bootstrap-status-report.md new file mode 100644 index 0000000..f4f2226 --- /dev/null +++ b/docs/bootstrap-status-report.md @@ -0,0 +1,105 @@ +Status: partially successful, not fully healthy yet. + +What worked +- Hetzner VM provisioned +- k3s installed and running +- node is `Ready` +- namespaces created +- Forgejo is up +- registry is up +- Redpanda is up +- `near-intents-ingest` is up + +What is still broken +- `dummy-reactor`, `dummy-executor`, `dummy-consumer` are failing because Kafka/Redpanda topic metadata is not healthy yet: + - `This server does not host this topic-partition` +- ingress-nginx is crashing +- cert-manager webhook/cainjector are crashing +- so public HTTPS ingress is not ready +- therefore Git/registry/CI are not yet usable via domain names + +So the honest report is: +- cluster bootstrap succeeded +- platform/app stack is only partially healthy +- we still need another fix pass before calling this “working” + +How to interact with it right now + +1. 
Use kubectl +```bash +export KUBECONFIG=$PWD/.state/hetzner/kubeconfig.yaml +kubectl get nodes -o wide +kubectl get pods -A +kubectl -n unrip get pods +kubectl -n forgejo get pods,svc +kubectl -n registry get pods,svc +``` + +2. Access Forgejo right now +Since ingress is broken, use port-forward: +```bash +kubectl -n forgejo port-forward svc/forgejo 3000:3000 +``` +Then open: +```text +http://127.0.0.1:3000 +``` + +3. Access the registry right now +Also via port-forward: +```bash +kubectl -n registry port-forward svc/registry 5000:5000 +``` +Then from your machine: +```bash +docker login 127.0.0.1:5000 -u unrip +``` +And push/pull like: +```bash +docker tag unrip:bootstrap 127.0.0.1:5000/unrip:test +docker push 127.0.0.1:5000/unrip:test +``` + +4. Watch logs +```bash +kubectl -n unrip logs deploy/near-intents-ingest -f +kubectl -n unrip logs deploy/dummy-reactor -f +kubectl -n unrip logs deploy/dummy-executor -f +kubectl -n unrip logs deploy/dummy-consumer -f +kubectl -n forgejo logs deploy/forgejo -f +kubectl -n registry logs deploy/registry -f +``` + +How Git would work once Forgejo is usable +After port-forward or later ingress: +```bash +git remote add forgejo http://127.0.0.1:3000/<owner>/<repo>.git +git push forgejo main +``` + +How CI/CD is supposed to work +Intended flow: +1. code lives in Forgejo +2. Forgejo runner executes `.forgejo/workflows/deploy.yml` +3. workflow builds image +4. pushes image to registry +5. updates `unrip` deployments in Kubernetes + +Current reality: +- not ready yet +- because ingress/cert-manager are unhealthy +- and we haven’t verified a full Forgejo runner deploy cycle + +Bottom line +- Kubernetes cluster: yes +- server provisioning: yes +- basic platform pieces: partially +- usable Git/CI/CD stack: not yet +- unrip app pipeline: not yet + +Most important next fixes +1. fix k3s manifest/platform issues: + - ingress-nginx RBAC/crash + - cert-manager install/CRDs/RBAC +2. fix Redpanda/topic metadata issue so reactor/executor/consumer run +3.
only then wire Forgejo + registry + CI as usable diff --git a/docs/contracts.md b/docs/contracts.md new file mode 100644 index 0000000..f6e6fd0 --- /dev/null +++ b/docs/contracts.md @@ -0,0 +1,85 @@ +# Event contracts + +## Envelope +All bus messages use this envelope: + +```json +{ + "event_id": "string", + "event_type": "string", + "venue": "string", + "source": "string|null", + "schema_version": 1, + "observed_at": "ISO-8601|null", + "ingested_at": "ISO-8601", + "payload": {}, + "raw": {} +} +``` + +## Topics +Current canonical topic set: +- `raw.near_intents.quote` +- `norm.swap_demand` +- `cmd.execute_trade` +- `exec.trade_result` + +In Kubernetes bootstrap, Redpanda topic creation is currently handled by the repo-managed bootstrap job applied with the manifest set. + +## `raw.near_intents.quote` +- `event_type`: `near_intents_quote_raw` +- `payload.message`: original venue-native payload +- `raw`: original venue-native payload + +## `norm.swap_demand` +- `event_type`: `swap_demand` +- payload: + - `quote_id` + - `asset_in` + - `asset_out` + - `amount_in` + - `amount_out` + - `ttl_ms` + +## `cmd.execute_trade` +- `event_type`: `execute_trade` +- payload: + - `command_id` + - `idempotency_key` + - `execution_key` + - `quote_id` + - `asset_in` + - `asset_out` + - `amount_in` + - `amount_out` + - `reason` + +## `exec.trade_result` +- `event_type`: `trade_result` +- payload: + - `command_id` + - `idempotency_key` + - `execution_key` + - `quote_id` + - `status` + - `result_code` + - `note` + +## Executor idempotency model +- `command_id` is unique per trade command and currently deterministic as `cmd-${quote_id}` +- `idempotency_key` is stable for semantic duplicate detection and currently `${venue}:${quote_id}` +- `execution_key` is the stable partition key and currently `${venue}:${asset_in}->${asset_out}` +- executor persists command state on durable storage before publishing a result +- already-completed `command_id`s are skipped on replay or restart +- if a 
command is seen again after a persisted `processing` state, the executor emits a recovered result path instead of blindly duplicating work + +## Deployment and persistence implications +These contracts are tied to deployment behavior: +- executor duplicate suppression depends on durable persistence at `EXECUTOR_STATE_DIR` +- local Compose mounts that path for development/runtime testing +- the Hetzner single-node k3s path mounts persistent storage for the executor at `/var/lib/unrip/executor-state` +- in the current single-node target, that persistence is node-backed and should be treated as required operational state + +Operational consequence: +- deleting the executor PVC or losing the node without migration discards idempotency history +- that can allow already-seen commands to be treated as new after recovery diff --git a/docs/hetzner-k3s-bootstrap.md b/docs/hetzner-k3s-bootstrap.md new file mode 100644 index 0000000..b98816a --- /dev/null +++ b/docs/hetzner-k3s-bootstrap.md @@ -0,0 +1,141 @@ +# Hetzner + k3s + self-hosted Git/CI bootstrap + +Goal: provision and deploy everything from this repo to a single Hetzner machine with no manual server login. 
+ +## Stack +- Terraform provisions the Hetzner Cloud VM, private network, and firewall +- cloud-init installs Tailscale first when configured, then installs k3s automatically +- Kubernetes manifests deploy: + - Redpanda + - trading system services + - private registry + - Forgejo + - ingress-nginx + - cert-manager + - ACME issuers +- local bootstrap script: + - runs Terraform + - optionally creates DNS records via Cloudflare or Porkbun + - writes overlay secrets/host patches from local env + - applies the Hetzner single-node k8s overlay + - builds the current app image locally + - fetches the real kubeconfig from the node + - imports the bootstrap image into k3s for the first rollout + +## Files +- `infra/terraform/hetzner/` +- `deploy/k8s/base/` +- `deploy/k8s/overlays/hetzner-single-node/` +- `scripts/hetzner/bootstrap.sh` +- `scripts/hetzner/configure-cloudflare-dns.sh` +- `scripts/hetzner/destroy.sh` +- `scripts/k8s/logs.sh` +- `.forgejo/workflows/deploy.yml` + +## Required local tools +- `terraform` +- `kubectl` +- `docker` +- `curl` +- `python3` + +## Required local env +Start from: + +```bash +cp scripts/hetzner/bootstrap-secrets.env.example scripts/hetzner/bootstrap-secrets.env +source scripts/hetzner/bootstrap-secrets.env +``` + +Required values: +- `HCLOUD_TOKEN` +- `SSH_PUBLIC_KEY_PATH` +- `PUBLIC_DOMAIN` +- `BASE_DOMAIN` +- recommended Tailscale values: + - `TAILSCALE_AUTH_KEY` + - `TAILSCALE_CONTROL_PLANE_HOSTNAME` +- `FORGEJO_DOMAIN` +- `FORGEJO_ROOT_URL` +- `REGISTRY_DOMAIN` +- `LETSENCRYPT_EMAIL` +- `REGISTRY_USERNAME` +- `REGISTRY_PASSWORD` +- `NEAR_INTENTS_API_KEY` +- `FORGEJO_RUNNER_REGISTRATION_TOKEN` + +Optional for automatic DNS: +- Cloudflare: + - `CLOUDFLARE_API_TOKEN` + - `CLOUDFLARE_ZONE_ID` +- Porkbun: + - `PORKBUN_API_KEY` + - `PORKBUN_SECRET_API_KEY` + +## Bootstrap +```bash +bash scripts/hetzner/bootstrap.sh +``` + +Outputs: +- Hetzner VM created +- Tailscale joined if configured +- k3s installed +- kubeconfig written to 
`.state/hetzner/kubeconfig.yaml` +- overlay secrets and ingress host patches rendered from local env +- namespaces, Redpanda, app deployments, Forgejo, registry, ingress, cert-manager, and issuers applied +- bootstrap image built and first rollout triggered + +## Tailscale-first admin access +Recommended mode: +- public firewall exposes only `80/443` +- admin access uses Tailscale +- Kubernetes API uses the Tailscale hostname when `TAILSCALE_CONTROL_PLANE_HOSTNAME` is set + +`TF_ADMIN_CIDR_BLOCKS` remains only as a fallback if you intentionally want public admin/API exposure. + +## DNS and TLS +If DNS provider credentials are present, bootstrap updates: +- `${BASE_DOMAIN}` +- `git.${BASE_DOMAIN}` +- `registry.${BASE_DOMAIN}` + +Supported scripted providers: +- Cloudflare +- Porkbun + +TLS is handled in-cluster by cert-manager using Let's Encrypt issuers and the rendered ingress hosts. + +## Observe the cluster +```bash +KUBECONFIG=.state/hetzner/kubeconfig.yaml kubectl get pods -A +bash scripts/k8s/logs.sh +``` + +## Self-hosted CI/CD handoff +After bootstrap: +1. open Forgejo at `https://${FORGEJO_DOMAIN}` +2. seed or mirror this repo into Forgejo +3. add Forgejo Actions secrets: + - `KUBECONFIG_B64` + - `REGISTRY_USERNAME` + - `REGISTRY_PASSWORD` +4. add Forgejo Actions variable: + - `REGISTRY_HOST=${REGISTRY_DOMAIN}` +5. push to `main` + +The workflow then: +- builds the image +- pushes it to `https://${REGISTRY_DOMAIN}` +- updates the app deployments in `unrip` +- waits for rollout + +## Destroy everything +```bash +bash scripts/hetzner/destroy.sh +``` + +## Current limitations +- Forgejo admin bootstrap and repo seeding are still operator-driven after the first cluster bootstrap. +- bootstrap and CI authentication paths should still be hardened before production use. +- routine deploys are intended to be registry-native through Forgejo Actions, but that still needs a real-world verification pass. 
diff --git a/docs/hetzner-self-hosted-ci-runbook.md b/docs/hetzner-self-hosted-ci-runbook.md new file mode 100644 index 0000000..85ef50c --- /dev/null +++ b/docs/hetzner-self-hosted-ci-runbook.md @@ -0,0 +1,108 @@ +# Hetzner self-hosted CI/CD runbook + +This is the operator runbook for the handoff from local bootstrap to self-hosted Forgejo-based deployment. + +## Bootstrap prerequisites +From your workstation: + +```bash +cp scripts/hetzner/bootstrap-secrets.env.example scripts/hetzner/bootstrap-secrets.env +source scripts/hetzner/bootstrap-secrets.env +bash scripts/hetzner/bootstrap.sh +``` + +After that you should have: +- `.state/hetzner/kubeconfig.yaml` +- Forgejo reachable at `https://${FORGEJO_DOMAIN}` +- Registry reachable at `https://${REGISTRY_DOMAIN}` +- private admin/control-plane access over Tailscale if configured + +## Verify the cluster +```bash +export KUBECONFIG=$PWD/.state/hetzner/kubeconfig.yaml +kubectl get nodes -o wide +kubectl get pods -A +kubectl -n forgejo get deploy,pods,svc,ingress +kubectl -n registry get deploy,pods,svc,ingress +kubectl -n unrip get deploy,pods +``` + +## Seed the repo into Forgejo +Create the target repo in Forgejo, then from your workstation: + +```bash +git remote add forgejo https://${FORGEJO_DOMAIN}/<owner>/<repo>.git +git push forgejo main +``` + +## Configure Forgejo Actions secrets and variables +Create these repository secrets in Forgejo: +- `KUBECONFIG_B64` +- `REGISTRY_USERNAME` +- `REGISTRY_PASSWORD` + +Create these repository variables: +- `REGISTRY_HOST=${REGISTRY_DOMAIN}` +- optional: `PROJECT_NAME=unrip` +- optional: `PROJECT_NAMESPACE=unrip` +- optional: `PROJECT_DEPLOYMENTS=near-intents-ingest,dummy-reactor,dummy-executor,dummy-consumer` + +Generate `KUBECONFIG_B64` from the bootstrap kubeconfig: + +```bash +base64 -w0 .state/hetzner/kubeconfig.yaml +``` + +## Workflow behavior +The workflow in `.forgejo/workflows/deploy.yml` now: +1. installs `buildah` and `kubectl` on the Forgejo runner +2.
checks out the repo with the Forgejo job token +3. loads kubeconfig from `KUBECONFIG_B64` +4. logs into the private registry +5. builds `registry.<base-domain>/<project>:${GIT_SHA}` with `buildah` +6. pushes the image +7. updates each deployment listed in `PROJECT_DEPLOYMENTS` inside `PROJECT_NAMESPACE` +8. waits for rollout after each image update + +Default behavior if you do not set project variables: +- `PROJECT_NAME=unrip` +- `PROJECT_NAMESPACE=unrip` +- `PROJECT_DEPLOYMENTS=near-intents-ingest,dummy-reactor,dummy-executor,dummy-consumer` + +For a future project, reuse the same workflow by changing only the Forgejo repository variables instead of copying the workflow. + +The first bootstrap deploy is different from routine CI: +- bootstrap fetches the real kubeconfig from the node and imports a local bootstrap image directly into k3s +- routine CI is intended to push versioned images to the private registry + +## Trigger deploys +Push to `main` in Forgejo: + +```bash +git push forgejo main +``` + +## Observe deploys +```bash +export KUBECONFIG=$PWD/.state/hetzner/kubeconfig.yaml +kubectl -n unrip rollout status deployment/near-intents-ingest --timeout=300s +kubectl -n unrip rollout status deployment/dummy-reactor --timeout=300s +kubectl -n unrip rollout status deployment/dummy-executor --timeout=300s +kubectl -n unrip rollout status deployment/dummy-consumer --timeout=300s +kubectl -n unrip get pods -o wide +kubectl get events -A --sort-by=.lastTimestamp | tail -n 50 +``` + +## DNS and TLS +If DNS automation was enabled during bootstrap, A records for the base, Forgejo, and registry hosts are already managed from the repo-side bootstrap. + +Currently supported DNS providers: +- Cloudflare +- Porkbun + +TLS is issued by cert-manager using the rendered Let's Encrypt email and ingress hosts. + +## Current limitations +- Forgejo admin bootstrap and repository creation are not yet API-automated.
+- Forgejo repository secrets/variables still need to be populated before the first real deploy run. +- The runner currently uses host-mode jobs and installs `buildah`/`kubectl` at job start, which is functional but not yet optimized. diff --git a/docs/next-session-architecture.md b/docs/next-session-architecture.md new file mode 100644 index 0000000..20045ae --- /dev/null +++ b/docs/next-session-architecture.md @@ -0,0 +1,383 @@ +# Trading System Architecture Notes for Next Session + +## Objective +Build the first real version of the trading system as an event-driven, multi-service architecture. + +Current implemented seed: +- NEAR Intents ingest in Node.js +- Kafka-compatible bus usage via `kafkajs` +- dummy reactor / executor / result consumer loop + +Next session should continue from this architecture, not revert to a monolith, local-only script, or TUI. + +--- + +## Core Architecture +All components are independent services. +They communicate only through a central Kafka-compatible bus (Redpanda first, Kafka-compatible by design). + +### Service classes +- venue ingestors +- normalizers +- reactors / decision engines +- executors +- downstream consumers / monitors / archivers / replay tools + +### Service communication rule +No direct service-to-service calls for core trading flow. +Use bus topics only. + +--- + +## Venue-Oriented Structure +The system should be organized by venue. +Each venue can have different: +- ingest/feed mechanics +- normalization logic +- execution mechanics + +### Per-venue responsibilities +- `ingest` = venue-native intake +- `normalize` = convert venue-native payload into canonical internal event +- `execute` = venue-specific action logic + +Planned shape: +```text +src/ + apps/ + bus/ + core/ + venues/ + near-intents/ + ingest + normalize + execute +``` + +--- + +## Bus Choice +Use **Redpanda** first, but stay fully **Kafka-compatible**. 
+ +### Reason +Requirements: +- high throughput +- low latency +- retention +- replay +- multiple producers/consumers +- independent services +- future scale-out +- multi-language compatibility + +### Constraint +Do not use broker-specific features that make migration to Kafka difficult. +Use standard Kafka clients and semantics. + +--- + +## Data Model Principles +Kafka/Redpanda is the operational event backbone. + +### Event model rules +- append-only +- immutable events +- versioned schemas +- raw and normalized events both preserved + +### Every event should include +- `event_id` +- `event_type` +- `venue` +- `observed_at` / `ingested_at` +- `schema_version` +- `payload` +- optionally raw/original payload where appropriate + +### Raw vs normalized +Keep both. +- raw topics = exact venue-native source truth +- normalized topics = canonical research/trading inputs + +This is required for: +- replay +- debugging +- future backtesting +- future Spark/batch processing + +--- + +## Current/Planned Topic Flow +Minimal 3-stage pipeline: + +1. ingest publishes normalized demand +2. reactor publishes trade command +3. executor publishes trade result + +### Topic classes +- `raw.*` = raw venue-native events +- `norm.*` = canonical normalized market events +- `cmd.*` = execution commands +- `exec.*` = execution outcomes +- later `signal.*` if needed for reactor outputs before command stage + +### Current minimal topics +- `norm.swap_demand` +- `cmd.execute_trade` +- `exec.trade_result` + +### NEAR Intents +NEAR Intents source currently feeds quote-demand style events from solver-bus websocket. +This is a venue ingest source, not the whole trading system. + +--- + +## Execution Safety / Zero Downtime Requirements +This is critical. + +### Constraint +Multiple executors must never duplicate the same trade/action during deploys, restarts, or rebalances. + +### Must-have rules +1. Every execution command must carry a unique `command_id` +2. 
Commands must include deterministic idempotency information +3. Executors must be idempotent +4. Executors must belong to a consumer group per executor role +5. Commands should be partitioned by a stable execution key where ordering matters +6. Executor state must be persisted durably enough to detect duplicate command execution + +### Kafka consumer groups are not sufficient alone +They help assign work, but they do not guarantee no duplicate processing under restart/rebalance conditions. +Idempotency is still required. + +### Rolling updates / zero downtime +Executors must support: +- graceful shutdown +- stop taking new work before exit +- finish or safely recover in-flight work +- commit offsets only after safe execution state transition + +### Persistence implication +Executor idempotency state is not optional metadata. +It is operational state that must survive pod restarts. + +Current single-node k3s direction: +- executor state lives at `/var/lib/unrip/executor-state` +- Kubernetes mounts that path through persistent storage +- the Hetzner single-node overlay currently targets k3s `local-path` storage +- node loss without storage migration means duplicate-suppression history is lost + +--- + +## Deployment Target +### First deployment phase +- single machine on Hetzner +- but still multiple independent services +- no architecture shortcuts that prevent future clustering + +### Future target +- split across multiple machines +- cluster capable +- fault tolerant +- multi-node +- zero-downtime deploys + +### Deployment rules from day 1 +- every component is a separate container/service +- all config via env/config files +- communication over network/bus only +- persistent components use mounted volumes/PVCs +- no manual SSH-based operational workflow + +--- + +## Infrastructure / Ops Direction +Target environment: +- Hetzner +- self-hosted CI/CD +- provisioning by code +- no GitHub dependency + +### Desired stack direction +- Terraform for Hetzner provisioning 
+- Kubernetes-oriented target from the start +- self-hosted Git + CI/CD +- Kafka-compatible broker +- object storage later for long-term archived event history + +### Single-node first, future cluster later +The first version may run on one machine, but deployment structure should already match a future distributed system. + +### Current canonical operator path +The repo now documents and partially implements this path as the primary deployment workflow: + +#### Phase 0: workstation bootstrap +1. A local operator workstation prepares bootstrap secrets in `scripts/hetzner/bootstrap-secrets.env`. +2. The operator runs `bash scripts/hetzner/bootstrap.sh`. +3. Terraform provisions the server, firewall, network, and cloud-init user-data. +4. cloud-init installs k3s automatically and prepares persistence directories plus bootstrap artifacts. +5. The workstation waits for the public k3s API endpoint to report ready. +6. The workstation writes `.state/hetzner/kubeconfig.yaml`. +7. The workstation injects initial Kubernetes Secrets for app and Forgejo bootstrap. +8. The workstation applies repo-managed Kubernetes manifests under `deploy/k8s/`. +9. The workstation performs the first image/bootstrap delivery attempt for the app workloads. +10. The workstation verifies rollout status. + +#### Phase 1: self-hosted handoff +1. Forgejo becomes reachable in-cluster. +2. The operator completes initial Forgejo admin/repo setup. +3. This repo is pushed or mirrored into Forgejo. +4. The Forgejo runner becomes the routine app deployment mechanism. +5. Terraform remains the infra mutation entrypoint unless further automated later. + +### Failure-recovery expectation +The bootstrap path must be rerunnable from the workstation. 
+Docs should keep treating recovery as: +- fix local secrets/inputs +- rerun the bootstrap script +- inspect the cluster with the generated kubeconfig +- destroy/recreate infra with `scripts/hetzner/destroy.sh` only when required + +### Current repo-state caveats +The direction is clear, but the implementation is still mid-transition: +- the bootstrap script currently applies `deploy/k8s/base` directly rather than the Hetzner overlay +- kubeconfig/auth handling is not yet fully production-hardened +- first image delivery is still a bootstrap workaround rather than a final registry-native CI path +- Forgejo admin bootstrap, repo creation, and Actions configuration still require operator steps +- local Compose remains in the repo for development/testing, not as the canonical production path + +### Minimal repo layout target +```text +deploy/ + hetzner/ + README.md + k8s/ + base/ + overlays/ + hetzner-single-node/ +infra/ + terraform/ + hetzner/ +``` + +Guidelines: +- `infra/terraform/hetzner/` owns VM, firewall, networking, and cloud-init rendering +- `deploy/k8s/` owns Kubernetes-native manifests and overlays +- app runtime manifests should remain Kubernetes-native so they can later move from single-node k3s to a larger cluster with minimal rewrite +- secret material must not live in git in plaintext; bootstrap docs should describe workstation-driven injection or generated secret references + +--- + +## Local Development / Testing Direction +Do not assume manual multi-terminal operation long term. + +### Requirement +Need an orchestrated local/dev runtime. 
+ +### Local dev should preserve real boundaries +- separate services +- broker present +- env/config driven +- same event flow as production + +### Current local/dev answer +Compose is still acceptable for: +- developer laptops +- fast local iteration +- debugging event flow +- validating container boundaries before Kubernetes rollout + +But Compose should remain explicitly secondary to the repo-driven Hetzner + k3s path for production operations. + +### Testing layers +1. unit tests for normalizers / schema logic / helpers +2. integration tests against Kafka-compatible broker +3. replay/simulation tests using retained event streams + +--- + +## Spark Readiness +Do not add Spark now. +But keep the system Spark-compatible later by: +- preserving raw events +- preserving normalized events +- using immutable append-only event streams +- versioning schemas +- separating operational event log from future analytical processing + +Spark later would be for: +- large-scale backtesting +- feature generation +- archive processing +- multi-venue analytics + +--- + +## Immediate Next Engineering Tasks +Next session should focus on the following. + +### 1. Clean current repo structure +Remove duplicate/legacy paths and keep one canonical structure only. + +### 2. Keep/complete the 3-stage loop +- NEAR Intents ingest -> `norm.swap_demand` +- dummy reactor -> `cmd.execute_trade` +- dummy executor -> `exec.trade_result` +- downstream result consumer + +### 3. Define canonical schemas +Define concrete event schemas for: +- normalized swap demand +- execute trade command +- trade result + +### 4. Define executor idempotency model +Specify: +- `command_id` +- idempotency key rules +- execution state transition rules +- duplicate handling rules + +### 5. Move toward production-shaped deployment +Design for: +- one service per container +- single-node deployment first +- future multi-node split without app rewrite + +### 6. 
Harden provisioning/deployment path +Next infra work should continue improving: +- Hetzner provisioning by code +- workstation bootstrap rerunnability +- self-hosted CI/CD handoff +- registry-native image delivery +- overlay convergence for the Hetzner single-node target + +Status update: +- minimal Terraform exists under `infra/terraform/hetzner` +- first boot is cloud-init driven and installs k3s automatically +- bootstrap now starts from a local operator workstation rather than manual host login +- Kubernetes assets exist under `deploy/k8s` +- executor persistence boundaries are explicit for single-node k3s +- self-hosted CI handoff is documented, but still requires follow-up hardening + +--- + +## Non-Goals for Next Session +- no dashboards +- no UI/TUI +- no monolith convenience architecture +- no SQLite-first system of record +- no direct coupling between ingest, decision, and execution +- no temporary local-only shortcuts that block future cluster deployment + +--- + +## Guiding Principle +Build the single-node first version as if it is already a distributed system: +- separate services +- durable event bus +- replayable events +- explicit contracts +- idempotent execution +- production-compatible deployment boundaries +- bootstrapable from scratch without manual SSH-based host setup diff --git a/infra/terraform/hetzner/cloud-init.yaml.tftpl b/infra/terraform/hetzner/cloud-init.yaml.tftpl new file mode 100644 index 0000000..936fe3d --- /dev/null +++ b/infra/terraform/hetzner/cloud-init.yaml.tftpl @@ -0,0 +1,59 @@ +#cloud-config +package_update: true +package_upgrade: true +packages: + - curl + - git + - ca-certificates + - jq + - bash + - apt-transport-https +write_files: + - path: /etc/sysctl.d/90-unrip.conf + permissions: '0644' + content: | + vm.max_map_count = 262144 + - path: /usr/local/bin/bootstrap-unrip.sh + permissions: '0755' + content: | + #!/usr/bin/env bash + set -euo pipefail + + install -d -m 0755 /opt/unrip + if [ ! 
-d /opt/unrip/repo/.git ]; then + git clone --branch ${bootstrap_repo_branch} ${bootstrap_repo_url} /opt/unrip/repo + else + git -C /opt/unrip/repo fetch --all --prune + git -C /opt/unrip/repo checkout ${bootstrap_repo_branch} + git -C /opt/unrip/repo pull --ff-only origin ${bootstrap_repo_branch} + fi + + install -d -m 0755 /opt/unrip/bootstrap + cat >/opt/unrip/bootstrap/README.txt <<'EOF' + This node was provisioned by Terraform + cloud-init. + Future Kubernetes bootstrap assets should live in: + /opt/unrip/repo/${bootstrap_repo_path} + EOF + - path: /etc/rancher/k3s/config.yaml + permissions: '0644' + content: | + write-kubeconfig-mode: "0644" + node-name: ${node_name} + tls-san: + - ${public_domain} +%{ if tailscale_control_plane_hostname != "" ~} + - ${tailscale_control_plane_hostname} +%{ endif ~} + node-ip: ${private_ipv4_address} + advertise-address: ${private_ipv4_address} + disable: + - servicelb +runcmd: + - sysctl --system +%{ if tailscale_enabled && tailscale_auth_key != "" ~} + - curl -fsSL https://tailscale.com/install.sh | sh + - tailscale up --auth-key=${tailscale_auth_key} --ssh --hostname=${node_name} +%{ endif ~} + - curl -sfL https://get.k3s.io | INSTALL_K3S_CHANNEL=${k3s_channel} sh -s - server + - /usr/local/bin/bootstrap-unrip.sh +final_message: "k3s bootstrap finished for ${node_name}" diff --git a/infra/terraform/hetzner/main.tf b/infra/terraform/hetzner/main.tf new file mode 100644 index 0000000..93f5b55 --- /dev/null +++ b/infra/terraform/hetzner/main.tf @@ -0,0 +1,48 @@ +resource "hcloud_ssh_key" "automation" { + name = "${var.name}-automation" + public_key = var.ssh_public_key +} + +resource "hcloud_network" "trading_system" { + name = "${var.name}-network" + ip_range = var.network_cidr +} + +resource "hcloud_network_subnet" "trading_system" { + network_id = hcloud_network.trading_system.id + type = "cloud" + network_zone = var.network_zone + ip_range = var.subnet_cidr +} + +resource "hcloud_server" "trading_system" { + name = 
var.name + image = var.image + server_type = var.server_type + location = var.location + ssh_keys = [hcloud_ssh_key.automation.id] + firewall_ids = [hcloud_firewall.trading_system.id] + + public_net { + ipv4_enabled = true + ipv6_enabled = true + } + + network { + network_id = hcloud_network.trading_system.id + ip = var.private_ipv4_address + } + + user_data = templatefile("${path.module}/cloud-init.yaml.tftpl", { + k3s_channel = var.k3s_channel + node_name = var.name + private_ipv4_address = var.private_ipv4_address + public_domain = var.public_domain + bootstrap_repo_url = var.bootstrap_repo_url + bootstrap_repo_branch = var.bootstrap_repo_branch + bootstrap_repo_path = var.bootstrap_repo_path + tailscale_enabled = var.tailscale_enabled + tailscale_auth_key = var.tailscale_auth_key + tailscale_control_plane_hostname = var.tailscale_control_plane_hostname + }) +} diff --git a/infra/terraform/hetzner/network.tf b/infra/terraform/hetzner/network.tf new file mode 100644 index 0000000..25eab4d --- /dev/null +++ b/infra/terraform/hetzner/network.tf @@ -0,0 +1,44 @@ +resource "hcloud_firewall" "trading_system" { + name = "${var.name}-firewall" + + dynamic "rule" { + for_each = length(var.admin_cidr_blocks) > 0 ? [22] : [] + content { + direction = "in" + protocol = "tcp" + port = tostring(rule.value) + source_ips = var.admin_cidr_blocks + } + } + + rule { + direction = "in" + protocol = "tcp" + port = "80" + source_ips = ["0.0.0.0/0", "::/0"] + } + + rule { + direction = "in" + protocol = "tcp" + port = "443" + source_ips = ["0.0.0.0/0", "::/0"] + } + + dynamic "rule" { + for_each = length(var.admin_cidr_blocks) > 0 ? 
[6443] : [] + content { + direction = "in" + protocol = "tcp" + port = tostring(rule.value) + source_ips = var.admin_cidr_blocks + } + } + + rule { + direction = "in" + protocol = "icmp" + source_ips = ["0.0.0.0/0", "::/0"] + destination_ips = [] + } +} diff --git a/infra/terraform/hetzner/outputs.tf b/infra/terraform/hetzner/outputs.tf new file mode 100644 index 0000000..745b574 --- /dev/null +++ b/infra/terraform/hetzner/outputs.tf @@ -0,0 +1,35 @@ +output "server_ipv4" { + value = hcloud_server.trading_system.ipv4_address +} + +output "server_ipv6" { + value = hcloud_server.trading_system.ipv6_address +} + +output "server_name" { + value = hcloud_server.trading_system.name +} + +output "server_private_ipv4" { + value = var.private_ipv4_address +} + +output "server_fqdn" { + value = var.public_domain +} + +output "k3s_api_url" { + value = var.tailscale_control_plane_hostname != "" ? "https://${var.tailscale_control_plane_hostname}:6443" : "https://${hcloud_server.trading_system.ipv4_address}:6443" +} + +output "kubeconfig_strategy" { + value = var.tailscale_enabled ? "Use Tailscale for private Kubernetes API access; avoid public SSH/Kubernetes exposure in the canonical flow." : "Use the public Kubernetes API endpoint with an operator-supplied bootstrap credential; avoid SSH/scp kubeconfig retrieval in the canonical flow." 
+} + +output "bootstrap_repo_checkout" { + value = "/opt/unrip/repo" +} + +output "bootstrap_marker_file" { + value = "/opt/unrip/bootstrap/README.txt" +} diff --git a/infra/terraform/hetzner/providers.tf b/infra/terraform/hetzner/providers.tf new file mode 100644 index 0000000..379976b --- /dev/null +++ b/infra/terraform/hetzner/providers.tf @@ -0,0 +1,14 @@ +terraform { + required_version = ">= 1.6.0" + + required_providers { + hcloud = { + source = "hetznercloud/hcloud" + version = "~> 1.49" + } + } +} + +provider "hcloud" { + token = var.hcloud_token +} diff --git a/infra/terraform/hetzner/variables.tf b/infra/terraform/hetzner/variables.tf new file mode 100644 index 0000000..64ec6e6 --- /dev/null +++ b/infra/terraform/hetzner/variables.tf @@ -0,0 +1,111 @@ +variable "hcloud_token" { + description = "Hetzner Cloud API token" + type = string + sensitive = true +} + +variable "name" { + description = "Server name" + type = string + default = "unrip-1" +} + +variable "location" { + description = "Hetzner location" + type = string + default = "nbg1" +} + +variable "server_type" { + description = "Hetzner server type" + type = string + default = "cpx32" +} + +variable "image" { + description = "Hetzner image" + type = string + default = "ubuntu-24.04" +} + +variable "ssh_public_key" { + description = "Public SSH key content used for automation access" + type = string +} + +variable "admin_cidr_blocks" { + description = "CIDR blocks allowed to access SSH and K8s API when public admin access is enabled" + type = list(string) + default = [] +} + +variable "tailscale_enabled" { + description = "Install and use Tailscale for admin/control-plane access" + type = bool + default = true +} + +variable "tailscale_auth_key" { + description = "Tailscale auth key used for unattended node join" + type = string + sensitive = true + default = "" +} + +variable "tailscale_control_plane_hostname" { + description = "Expected Tailscale DNS name for the node; if set, bootstrap prefers 
it for kube access" + type = string + default = "" +} + +variable "k3s_channel" { + description = "K3s release channel" + type = string + default = "stable" +} + +variable "network_cidr" { + description = "Private network CIDR for the single-node cluster" + type = string + default = "10.30.0.0/16" +} + +variable "subnet_cidr" { + description = "Private subnet CIDR for the server attachment" + type = string + default = "10.30.1.0/24" +} + +variable "network_zone" { + description = "Hetzner network zone" + type = string + default = "eu-central" +} + +variable "private_ipv4_address" { + description = "Static private IPv4 assigned to the node on the Hetzner network" + type = string + default = "10.30.1.10" +} + +variable "public_domain" { + description = "Public DNS name pointing at the server IPv4/IPv6; used for ingress/TLS" + type = string +} + +variable "bootstrap_repo_url" { + description = "Git repository URL cloned onto the node for GitOps/bootstrap assets" + type = string +} + +variable "bootstrap_repo_branch" { + description = "Branch checked out for the bootstrap repository" + type = string + default = "main" +} + +variable "bootstrap_repo_path" { + description = "Repository subdirectory expected to contain future Kubernetes bootstrap manifests/scripts" + type = string + default = "deploy/k8s" +} diff --git a/package.json b/package.json index cf29be7..73915ca 100644 --- a/package.json +++ b/package.json @@ -5,6 +5,8 @@ "type": "module", "scripts": { "near-intents:ingest": "node src/apps/near-intents-ingest.mjs", + "dummy-reactor": "node src/apps/dummy-reactor.mjs", + "dummy-executor": "node src/apps/dummy-executor.mjs", "dummy-consumer": "node src/apps/dummy-consumer.mjs", "start": "node index.mjs" }, diff --git a/scripts/hetzner/bootstrap-secrets.env.example b/scripts/hetzner/bootstrap-secrets.env.example new file mode 100644 index 0000000..b09ddd0 --- /dev/null +++ b/scripts/hetzner/bootstrap-secrets.env.example @@ -0,0 +1,48 @@ +# Copy this file to 
scripts/hetzner/bootstrap-secrets.env and fill in the values. +# Then run: source scripts/hetzner/bootstrap-secrets.env + +export HCLOUD_TOKEN=replace_me +export SSH_PUBLIC_KEY_PATH="$HOME/.ssh/id_ed25519.pub" + +# Optional project override. Defaults target the built-in unrip project overlay. +export PROJECT_NAME=unrip +export PROJECT_NAMESPACE=unrip +# export PROJECT_OVERLAY_DIR="$PWD/deploy/k8s/overlays/hetzner-single-node" +# export PROJECT_KUSTOMIZE_PATH="../../projects/unrip/base" +# export PROJECT_SECRET_NAME=unrip-secrets +# export PROJECT_SECRET_ENV_BASENAME=unrip.env +# export PROJECT_REGISTRY_SECRET_NAME=unrip-registry-creds +# export PROJECT_IMAGE_REPOSITORY=unrip +# export PROJECT_DEPLOYMENTS="near-intents-ingest dummy-reactor dummy-executor dummy-consumer" + +# Tailscale-first admin access (recommended) +export TAILSCALE_AUTH_KEY= +# optional override; leave empty to auto-discover the node via local `tailscale status --json` +export TAILSCALE_CONTROL_PLANE_HOSTNAME= + +# Optional fallback if you want public admin ports instead of Tailscale +export TF_ADMIN_CIDR_BLOCKS='[]' + +# Public naming for ingress/TLS +export PUBLIC_DOMAIN=unrip-bootstrap.example.com +export BASE_DOMAIN=example.com +export FORGEJO_DOMAIN=git.example.com +export FORGEJO_ROOT_URL=https://git.example.com/ +export REGISTRY_DOMAIN=registry.example.com +export LETSENCRYPT_EMAIL=ops@example.com + +# Optional DNS automation: choose one provider +# Cloudflare +export CLOUDFLARE_API_TOKEN= +export CLOUDFLARE_ZONE_ID= +# Porkbun +export PORKBUN_API_KEY= +export PORKBUN_SECRET_API_KEY= + +# Registry auth for CI/CD and image pulls +export REGISTRY_USERNAME=unrip +export REGISTRY_PASSWORD=replace_me + +# Application and bootstrap secrets +export NEAR_INTENTS_API_KEY=replace_me +export FORGEJO_RUNNER_REGISTRATION_TOKEN=replace_me diff --git a/scripts/hetzner/bootstrap.sh b/scripts/hetzner/bootstrap.sh new file mode 100755 index 0000000..de9a6c6 --- /dev/null +++ b/scripts/hetzner/bootstrap.sh 
@@ -0,0 +1,302 @@ +#!/usr/bin/env bash +set -euo pipefail + +ROOT_DIR=$(cd "$(dirname "$0")/../.." && pwd) +TF_DIR="$ROOT_DIR/infra/terraform/hetzner" +STATE_DIR="$ROOT_DIR/.state/hetzner" +KUBECONFIG_PATH="$STATE_DIR/kubeconfig.yaml" +OVERLAY_DIR="$ROOT_DIR/deploy/k8s/overlays/hetzner-single-node" +DEFAULT_PROJECT_NAME="unrip" +DEFAULT_PROJECT_NAMESPACE="$DEFAULT_PROJECT_NAME" +mkdir -p "$STATE_DIR" + +require() { + command -v "$1" >/dev/null 2>&1 || { echo "missing command: $1" >&2; exit 1; } +} + +wait_for_url() { + local url="$1" + local label="$2" + local max_attempts="${3:-120}" + local sleep_seconds="${4:-5}" + local attempt=1 + + until curl -kfsS "$url" >/dev/null 2>&1; do + if (( attempt >= max_attempts )); then + echo "timed out waiting for ${label}: ${url}" >&2 + return 1 + fi + if (( attempt == 1 || attempt % 6 == 0 )); then + echo "waiting for ${label} (${attempt}/${max_attempts})..." + fi + sleep "$sleep_seconds" + attempt=$((attempt + 1)) + done +} + +wait_for_ssh() { + local target="$1" + local max_attempts="${2:-120}" + local sleep_seconds="${3:-5}" + local attempt=1 + + until ssh -i "$SSH_PRIVATE_KEY_PATH" -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o ConnectTimeout=5 "$target" 'echo ssh-ready' >/dev/null 2>&1; do + if (( attempt >= max_attempts )); then + echo "timed out waiting for ssh: ${target}" >&2 + return 1 + fi + if (( attempt == 1 || attempt % 6 == 0 )); then + echo "waiting for ssh (${attempt}/${max_attempts})..." 
+ fi + sleep "$sleep_seconds" + attempt=$((attempt + 1)) + done +} + +wait_for_tailscale_node() { + local host_name="$1" + local max_attempts="${2:-120}" + local sleep_seconds="${3:-5}" + local attempt=1 + + command -v tailscale >/dev/null 2>&1 || { + echo "tailscale CLI is required locally for tailscale-first bootstrap" >&2 + return 1 + } + + while true; do + local discovered + discovered=$(tailscale status --json 2>/dev/null | python3 - "$host_name" <<'PY' +import json,sys +host=sys.argv[1] +try: + data=json.load(sys.stdin) +except Exception: + print("") + raise SystemExit(0) +peers=data.get('Peer',{}) +matches=[] +for peer in peers.values(): + if peer.get('HostName') == host: + matches.append(peer) +for peer in sorted(matches, key=lambda p: ((p.get('Online') is True), p.get('DNSName') or ''), reverse=True): + if peer.get('Online'): + dns=(peer.get('DNSName') or '').rstrip('.') + if dns: + print(dns) + raise SystemExit(0) +for peer in sorted(matches, key=lambda p: p.get('DNSName') or '', reverse=True): + if peer.get('TailscaleIPs'): + print(peer['TailscaleIPs'][0]) + raise SystemExit(0) +print("") +PY +) + if [[ -n "$discovered" ]]; then + printf '%s\n' "$discovered" + return 0 + fi + if (( attempt >= max_attempts )); then + echo "timed out waiting for tailscale node: ${host_name}" >&2 + return 1 + fi + if (( attempt == 1 || attempt % 6 == 0 )); then + echo "waiting for tailscale node ${host_name} (${attempt}/${max_attempts})..." 
>&2 + fi + sleep "$sleep_seconds" + attempt=$((attempt + 1)) + done +} + +require terraform +require kubectl +require docker +require curl +require python3 +require ssh +require realpath + +: "${HCLOUD_TOKEN:?set HCLOUD_TOKEN}" +: "${SSH_PUBLIC_KEY_PATH:?set SSH_PUBLIC_KEY_PATH}" +: "${PUBLIC_DOMAIN:?set PUBLIC_DOMAIN}" +: "${LETSENCRYPT_EMAIL:?set LETSENCRYPT_EMAIL}" +: "${TAILSCALE_AUTH_KEY:=}" +: "${TAILSCALE_CONTROL_PLANE_HOSTNAME:=}" +: "${NEAR_INTENTS_API_KEY:?set NEAR_INTENTS_API_KEY}" +: "${BASE_DOMAIN:?set BASE_DOMAIN}" +: "${FORGEJO_DOMAIN:=git.${BASE_DOMAIN}}" +: "${FORGEJO_ROOT_URL:=https://${FORGEJO_DOMAIN}/}" +: "${REGISTRY_DOMAIN:=registry.${BASE_DOMAIN}}" +: "${REGISTRY_USERNAME:?set REGISTRY_USERNAME}" +: "${REGISTRY_PASSWORD:?set REGISTRY_PASSWORD}" +: "${FORGEJO_RUNNER_REGISTRATION_TOKEN:?set FORGEJO_RUNNER_REGISTRATION_TOKEN}" +: "${TF_ADMIN_CIDR_BLOCKS:=}" +: "${PROJECT_NAME:=$DEFAULT_PROJECT_NAME}" +: "${PROJECT_NAMESPACE:=$DEFAULT_PROJECT_NAMESPACE}" +: "${PROJECT_OVERLAY_DIR:=$OVERLAY_DIR}" +: "${BOOTSTRAP_NODE_NAME:=unrip-1}" +: "${SKIP_TERRAFORM_APPLY:=0}" +: "${PROJECT_KUSTOMIZE_PATH:=../../projects/${PROJECT_NAME}/base}" +: "${PROJECT_SECRET_NAME:=${PROJECT_NAME}-secrets}" +: "${PROJECT_SECRET_ENV_BASENAME:=${PROJECT_NAME}.env}" +: "${PROJECT_REGISTRY_SECRET_NAME:=${PROJECT_NAME}-registry-creds}" +: "${PROJECT_IMAGE_REPOSITORY:=${PROJECT_NAME}}" +: "${PROJECT_DEPLOYMENTS:=near-intents-ingest dummy-reactor dummy-executor dummy-consumer}" + +BOOTSTRAP_IMAGE="${PROJECT_IMAGE_REPOSITORY}:bootstrap" +PROJECT_SECRET_ENV_PATH="$PROJECT_OVERLAY_DIR/secrets/$PROJECT_SECRET_ENV_BASENAME" +GENERATED_OVERLAY_DIR="$STATE_DIR/generated-overlay" +GENERATED_OVERLAY_KUSTOMIZATION="$GENERATED_OVERLAY_DIR/kustomization.yaml" + +SSH_PUBLIC_KEY=$(cat "$SSH_PUBLIC_KEY_PATH") +SSH_PRIVATE_KEY_PATH="${SSH_PUBLIC_KEY_PATH%.pub}" +if [[ ! 
-f "$SSH_PRIVATE_KEY_PATH" ]]; then + echo "missing ssh private key for bootstrap: $SSH_PRIVATE_KEY_PATH" >&2 + exit 1 +fi +TF_VARS=( + -var "hcloud_token=$HCLOUD_TOKEN" + -var "ssh_public_key=$SSH_PUBLIC_KEY" + -var "public_domain=$PUBLIC_DOMAIN" + -var "bootstrap_repo_url=local-bootstrap" + -var "tailscale_auth_key=$TAILSCALE_AUTH_KEY" + -var "tailscale_control_plane_hostname=$TAILSCALE_CONTROL_PLANE_HOSTNAME" +) +if [[ -n "$TF_ADMIN_CIDR_BLOCKS" && "$TF_ADMIN_CIDR_BLOCKS" != '[]' ]]; then + TF_VARS+=(-var "admin_cidr_blocks=$TF_ADMIN_CIDR_BLOCKS") +fi + +if [[ -n "$TAILSCALE_AUTH_KEY" ]]; then + bash "$ROOT_DIR/scripts/hetzner/print-tailscale-firewall-note.sh" +fi + +terraform -chdir="$TF_DIR" init +if [[ "$SKIP_TERRAFORM_APPLY" != "1" ]]; then + terraform -chdir="$TF_DIR" apply -auto-approve "${TF_VARS[@]}" +fi + +SERVER_IP=$(terraform -chdir="$TF_DIR" output -raw server_ipv4) +K3S_API_URL=$(terraform -chdir="$TF_DIR" output -raw k3s_api_url) +if [[ -n "$TAILSCALE_AUTH_KEY" ]]; then + DISCOVERED_TAILSCALE_HOST="${TAILSCALE_CONTROL_PLANE_HOSTNAME:-$(wait_for_tailscale_node "$BOOTSTRAP_NODE_NAME")}" + SSH_TARGET="root@${DISCOVERED_TAILSCALE_HOST}" + K3S_API_URL="https://${DISCOVERED_TAILSCALE_HOST}:6443" +else + SSH_TARGET="root@${SERVER_IP}" +fi + +if [[ -n "${CLOUDFLARE_API_TOKEN:-}" && -n "${CLOUDFLARE_ZONE_ID:-}" ]]; then + if ! SERVER_IP="$SERVER_IP" BASE_DOMAIN="$BASE_DOMAIN" bash "$ROOT_DIR/scripts/hetzner/configure-cloudflare-dns.sh"; then + echo "warning: cloudflare DNS automation failed; continuing without automated DNS" >&2 + fi +elif [[ -n "${PORKBUN_API_KEY:-}" && -n "${PORKBUN_SECRET_API_KEY:-}" ]]; then + if ! SERVER_IP="$SERVER_IP" BASE_DOMAIN="$BASE_DOMAIN" bash "$ROOT_DIR/scripts/hetzner/configure-porkbun-dns.sh"; then + echo "warning: porkbun DNS automation failed; continuing without automated DNS" >&2 + fi +fi + +wait_for_ssh "$SSH_TARGET" +echo "waiting for Kubernetes API on $K3S_API_URL..." 
+wait_for_url "${K3S_API_URL}/readyz" "k3s API readiness" + +ssh -i "$SSH_PRIVATE_KEY_PATH" -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null "$SSH_TARGET" 'sudo cat /etc/rancher/k3s/k3s.yaml' \ + | sed "s|https://127.0.0.1:6443|${K3S_API_URL}|" > "$KUBECONFIG_PATH" +export KUBECONFIG="$KUBECONFIG_PATH" + +mkdir -p "$PROJECT_OVERLAY_DIR/secrets" "$GENERATED_OVERLAY_DIR" +cat > "$PROJECT_SECRET_ENV_PATH" < "$PROJECT_OVERLAY_DIR/secrets/forgejo.env" </dev/null + else + api POST "/zones/$CLOUDFLARE_ZONE_ID/dns_records" "$payload" >/dev/null + fi +} + +upsert_record A "$BASE_DOMAIN" "$SERVER_IP" false +upsert_record A "git.$BASE_DOMAIN" "$SERVER_IP" false +upsert_record A "registry.$BASE_DOMAIN" "$SERVER_IP" false + +echo "cloudflare dns updated for $BASE_DOMAIN, git.$BASE_DOMAIN, registry.$BASE_DOMAIN" diff --git a/scripts/hetzner/configure-porkbun-dns.sh b/scripts/hetzner/configure-porkbun-dns.sh new file mode 100755 index 0000000..f549d82 --- /dev/null +++ b/scripts/hetzner/configure-porkbun-dns.sh @@ -0,0 +1,71 @@ +#!/usr/bin/env bash +set -euo pipefail + +require() { + command -v "$1" >/dev/null 2>&1 || { echo "missing command: $1" >&2; exit 1; } +} + +require curl +require python3 + +: "${PORKBUN_API_KEY:?set PORKBUN_API_KEY}" +: "${PORKBUN_SECRET_API_KEY:?set PORKBUN_SECRET_API_KEY}" +: "${BASE_DOMAIN:?set BASE_DOMAIN}" +: "${SERVER_IP:?set SERVER_IP}" + +api_base="https://api.porkbun.com/api/json/v3" + +root_name="" +git_name="git" +registry_name="registry" + +payload() { + local content="$1" + printf '{"apikey":"%s","secretapikey":"%s","content":"%s","ttl":"600"}' \ + "$PORKBUN_API_KEY" "$PORKBUN_SECRET_API_KEY" "$content" +} + +list_records() { + curl -sSf "$api_base/dns/retrieve/$BASE_DOMAIN" \ + -H 'Content-Type: application/json' \ + --data "{\"apikey\":\"$PORKBUN_API_KEY\",\"secretapikey\":\"$PORKBUN_SECRET_API_KEY\"}" +} + +upsert_a_record() { + local name="$1" + local fqdn="$BASE_DOMAIN" + [[ -n "$name" ]] && fqdn="$name.$BASE_DOMAIN" + + 
local record_id + record_id=$(python3 - "$fqdn" "$(list_records)" <<'PY' +import json,sys +fqdn=sys.argv[1] +data=json.loads(sys.argv[2]) +for rec in data.get('records', []): + if rec.get('type') == 'A' and rec.get('name') == fqdn: + print(rec.get('id','')) + break +PY +) + + if [[ -n "$record_id" ]]; then + curl -fsS "$api_base/dns/edit/$BASE_DOMAIN/$record_id" \ + -H 'Content-Type: application/json' \ + --data "$(payload "$SERVER_IP")" >/dev/null + echo "updated A $fqdn -> $SERVER_IP" + else + local body + body=$(printf '{"apikey":"%s","secretapikey":"%s","name":"%s","type":"A","content":"%s","ttl":"600"}' \ + "$PORKBUN_API_KEY" "$PORKBUN_SECRET_API_KEY" "$name" "$SERVER_IP") + curl -fsS "$api_base/dns/create/$BASE_DOMAIN" \ + -H 'Content-Type: application/json' \ + --data "$body" >/dev/null + echo "created A $fqdn -> $SERVER_IP" + fi +} + +upsert_a_record "$root_name" +upsert_a_record "$git_name" +upsert_a_record "$registry_name" + +echo "porkbun dns updated for $BASE_DOMAIN, git.$BASE_DOMAIN, registry.$BASE_DOMAIN" diff --git a/scripts/hetzner/destroy.sh b/scripts/hetzner/destroy.sh new file mode 100755 index 0000000..6f61bb1 --- /dev/null +++ b/scripts/hetzner/destroy.sh @@ -0,0 +1,28 @@ +#!/usr/bin/env bash +set -euo pipefail + +ROOT_DIR=$(cd "$(dirname "$0")/../.." 
&& pwd) +TF_DIR="$ROOT_DIR/infra/terraform/hetzner" + +: "${HCLOUD_TOKEN:?set HCLOUD_TOKEN}" +: "${SSH_PUBLIC_KEY_PATH:?set SSH_PUBLIC_KEY_PATH}" +: "${PUBLIC_DOMAIN:=bootstrap.example.com}" +: "${TAILSCALE_AUTH_KEY:=}" +: "${TAILSCALE_CONTROL_PLANE_HOSTNAME:=}" +: "${TF_ADMIN_CIDR_BLOCKS:=}" + +SSH_PUBLIC_KEY=$(cat "$SSH_PUBLIC_KEY_PATH") +TF_VARS=( + -var "hcloud_token=$HCLOUD_TOKEN" + -var "ssh_public_key=$SSH_PUBLIC_KEY" + -var "public_domain=$PUBLIC_DOMAIN" + -var "tailscale_auth_key=$TAILSCALE_AUTH_KEY" + -var "tailscale_control_plane_hostname=$TAILSCALE_CONTROL_PLANE_HOSTNAME" +) + +if [[ -n "$TF_ADMIN_CIDR_BLOCKS" && "$TF_ADMIN_CIDR_BLOCKS" != '[]' ]]; then + TF_VARS+=( -var "admin_cidr_blocks=$TF_ADMIN_CIDR_BLOCKS" ) +fi + +terraform -chdir="$TF_DIR" init +terraform -chdir="$TF_DIR" destroy -auto-approve "${TF_VARS[@]}" diff --git a/scripts/hetzner/print-tailscale-firewall-note.sh b/scripts/hetzner/print-tailscale-firewall-note.sh new file mode 100755 index 0000000..74d0934 --- /dev/null +++ b/scripts/hetzner/print-tailscale-firewall-note.sh @@ -0,0 +1,8 @@ +#!/usr/bin/env bash +set -euo pipefail +cat <<'EOF' +Tailscale-first mode: +- public firewall should expose only 80/443 +- SSH and Kubernetes API should be reached over Tailscale +- ensure your workstation is authenticated to the same tailnet before bootstrap continues +EOF diff --git a/scripts/k8s/logs.sh b/scripts/k8s/logs.sh new file mode 100755 index 0000000..1bb9403 --- /dev/null +++ b/scripts/k8s/logs.sh @@ -0,0 +1,13 @@ +#!/usr/bin/env bash +set -euo pipefail + +KUBECONFIG_PATH=${KUBECONFIG:-$(pwd)/.state/hetzner/kubeconfig.yaml} +PROJECT_NAMESPACE=${PROJECT_NAMESPACE:-unrip} +export KUBECONFIG="$KUBECONFIG_PATH" + +kubectl -n "$PROJECT_NAMESPACE" get pods +kubectl -n "$PROJECT_NAMESPACE" logs -l app=near-intents-ingest --tail=100 -f & +kubectl -n "$PROJECT_NAMESPACE" logs -l app=dummy-reactor --tail=100 -f & +kubectl -n "$PROJECT_NAMESPACE" logs -l app=dummy-executor --tail=100 -f & +kubectl -n 
"$PROJECT_NAMESPACE" logs -l app=dummy-consumer --tail=100 -f & +wait diff --git a/src/apps/dummy-consumer.mjs b/src/apps/dummy-consumer.mjs index 82a0aea..247e61c 100644 --- a/src/apps/dummy-consumer.mjs +++ b/src/apps/dummy-consumer.mjs @@ -2,25 +2,24 @@ import process from 'node:process'; import { createConsumer } from '../bus/kafka/consumer.mjs'; import { logStatus } from '../core/log.mjs'; +import { parseEventMessage } from '../core/event-envelope.mjs'; +import { assertTradeResult } from '../core/schemas.mjs'; import { loadConfig } from '../lib/config.mjs'; const config = loadConfig(); const consumer = await createConsumer({ - groupId: config.kafkaConsumerGroupDummy, + groupId: `${config.kafkaConsumerGroupExecutor}-results-view`, brokers: config.kafkaBrokers, clientId: config.kafkaClientId, }); -await consumer.subscribe({ topic: config.kafkaTopicNormSwapDemand, fromBeginning: false }); -logStatus( - `dummy consumer subscribed to ${config.kafkaTopicNormSwapDemand} as ${config.kafkaConsumerGroupDummy}`, -); +await consumer.subscribe({ topic: config.kafkaTopicExecTradeResult, fromBeginning: false }); +logStatus(`result consumer subscribed to ${config.kafkaTopicExecTradeResult}`); process.on('SIGINT', async () => { await consumer.disconnect(); process.exit(0); }); - process.on('SIGTERM', async () => { await consumer.disconnect(); process.exit(0); @@ -29,18 +28,15 @@ process.on('SIGTERM', async () => { await consumer.run({ eachMessage: async ({ message }) => { if (!message.value) return; - let event; try { - event = JSON.parse(message.value.toString()); + event = parseEventMessage(message.value.toString()); } catch { - logStatus('dummy consumer received non-JSON message; skipping'); + logStatus('result consumer received non-JSON message; skipping'); return; } - - const payload = event?.payload || {}; - const pair = `${payload.assetIn || '?'} -> ${payload.assetOut || '?'}`; - const quoteId = payload.quoteId || event.eventId || '?'; - console.log(`[dummy-reactor] 
observed ${pair} quote_id=${quoteId} | would decide later`); + assertTradeResult(event); + const payload = event.payload; + console.log(`[result] command_id=${payload.command_id} quote_id=${payload.quote_id} status=${payload.status} result_code=${payload.result_code || 'n/a'}`); }, }); diff --git a/src/apps/dummy-executor.mjs b/src/apps/dummy-executor.mjs new file mode 100644 index 0000000..0992381 --- /dev/null +++ b/src/apps/dummy-executor.mjs @@ -0,0 +1,93 @@ +import process from 'node:process'; + +import { createConsumer } from '../bus/kafka/consumer.mjs'; +import { createProducer } from '../bus/kafka/producer.mjs'; +import { buildEventEnvelope, parseEventMessage } from '../core/event-envelope.mjs'; +import { createExecutorStateStore } from '../core/executor-state-store.mjs'; +import { logStatus } from '../core/log.mjs'; +import { assertExecuteTradeCommand, assertTradeResult } from '../core/schemas.mjs'; +import { loadConfig } from '../lib/config.mjs'; + +const config = loadConfig(); + +const consumer = await createConsumer({ + groupId: config.kafkaConsumerGroupExecutor, + brokers: config.kafkaBrokers, + clientId: config.kafkaClientId, +}); +const producer = await createProducer({ + brokers: config.kafkaBrokers, + clientId: config.kafkaClientId, +}); + +const stateStore = createExecutorStateStore({ stateDir: config.executorStateDir }); + +await consumer.subscribe({ topic: config.kafkaTopicCmdExecuteTrade, fromBeginning: false }); +logStatus(`dummy executor subscribed to ${config.kafkaTopicCmdExecuteTrade} as ${config.kafkaConsumerGroupExecutor}`); +logStatus(`dummy executor will publish results to ${config.kafkaTopicExecTradeResult}; state_dir=${config.executorStateDir}`); + +async function shutdown() { + await consumer.disconnect(); + await producer.disconnect(); + process.exit(0); +} +process.on('SIGINT', shutdown); +process.on('SIGTERM', shutdown); + +await consumer.run({ + eachMessage: async ({ message }) => { + if (!message.value) return; + + let event; + 
try { + event = parseEventMessage(message.value.toString()); + } catch { + logStatus('dummy executor received non-JSON message; skipping'); + return; + } + + assertExecuteTradeCommand(event); + + const payload = event.payload; + const commandId = payload.command_id; + const existing = stateStore.get(commandId); + if (existing?.status === 'completed') { + logStatus(`dummy executor skipping duplicate command_id=${commandId}`); + return; + } + + stateStore.markProcessing(commandId, { + idempotency_key: payload.idempotency_key, + execution_key: payload.execution_key, + quote_id: payload.quote_id, + }); + + const pair = `${payload.asset_in} -> ${payload.asset_out}`; + const result = buildEventEnvelope({ + source: 'dummy-executor', + venue: event.venue || 'near-intents', + eventType: 'trade_result', + eventId: `exec-${commandId}`, + observedAt: event.observed_at, + payload: { + command_id: commandId, + idempotency_key: payload.idempotency_key, + execution_key: payload.execution_key, + quote_id: payload.quote_id, + status: 'simulated_sent', + result_code: existing?.status === 'processing' ? 
'recovered_inflight' : 'sent', + note: 'dummy executor placeholder result', + }, + }); + assertTradeResult(result); + + await producer.sendJson(config.kafkaTopicExecTradeResult, result, { key: payload.execution_key }); + stateStore.markCompleted(commandId, { + idempotency_key: payload.idempotency_key, + execution_key: payload.execution_key, + quote_id: payload.quote_id, + result_event_id: result.event_id, + }); + console.log(`[dummy-executor] result emitted ${pair} quote_id=${payload.quote_id} command_id=${commandId} status=simulated_sent`); + }, +}); diff --git a/src/apps/dummy-reactor.mjs b/src/apps/dummy-reactor.mjs new file mode 100644 index 0000000..1a8b5ab --- /dev/null +++ b/src/apps/dummy-reactor.mjs @@ -0,0 +1,75 @@ +import process from 'node:process'; + +import { createConsumer } from '../bus/kafka/consumer.mjs'; +import { createProducer } from '../bus/kafka/producer.mjs'; +import { logStatus } from '../core/log.mjs'; +import { loadConfig } from '../lib/config.mjs'; +import { buildEventEnvelope, parseEventMessage } from '../core/event-envelope.mjs'; +import { assertExecuteTradeCommand, assertNormalizedSwapDemand } from '../core/schemas.mjs'; + +const config = loadConfig(); + +const consumer = await createConsumer({ + groupId: config.kafkaConsumerGroupDummy, + brokers: config.kafkaBrokers, + clientId: config.kafkaClientId, +}); +const producer = await createProducer({ + brokers: config.kafkaBrokers, + clientId: config.kafkaClientId, +}); + +await consumer.subscribe({ topic: config.kafkaTopicNormSwapDemand, fromBeginning: false }); +logStatus(`dummy reactor subscribed to ${config.kafkaTopicNormSwapDemand} as ${config.kafkaConsumerGroupDummy}`); +logStatus(`dummy reactor will publish commands to ${config.kafkaTopicCmdExecuteTrade}`); + +async function shutdown() { + await consumer.disconnect(); + await producer.disconnect(); + process.exit(0); +} +process.on('SIGINT', shutdown); +process.on('SIGTERM', shutdown); + +await consumer.run({ + eachMessage: async 
({ message }) => { + if (!message.value) return; + + let event; + try { + event = parseEventMessage(message.value.toString()); + } catch { + logStatus('dummy reactor received non-JSON message; skipping'); + return; + } + + assertNormalizedSwapDemand(event); + + const payload = event.payload; + const pair = `${payload.asset_in} -> ${payload.asset_out}`; + const quoteId = payload.quote_id; + const commandId = `cmd-${quoteId}`; + const command = buildEventEnvelope({ + source: 'dummy-reactor', + venue: event.venue || 'near-intents', + eventType: 'execute_trade', + eventId: commandId, + observedAt: event.observed_at, + payload: { + command_id: commandId, + idempotency_key: `${event.venue || 'near-intents'}:${quoteId}`, + execution_key: `${event.venue || 'near-intents'}:${payload.asset_in}->${payload.asset_out}`, + quote_id: quoteId, + asset_in: payload.asset_in, + asset_out: payload.asset_out, + amount_in: payload.amount_in, + amount_out: payload.amount_out, + reason: 'dummy reactor placeholder decision', + }, + }); + assertExecuteTradeCommand(command); + + await producer.sendJson(config.kafkaTopicCmdExecuteTrade, command, { key: command.payload.execution_key }); + console.log(`[dummy-reactor] command emitted ${pair} quote_id=${quoteId} command_id=${commandId}`); + }, +}); diff --git a/src/apps/near-intents-ingest.mjs b/src/apps/near-intents-ingest.mjs index edd9c8a..27f51c1 100644 --- a/src/apps/near-intents-ingest.mjs +++ b/src/apps/near-intents-ingest.mjs @@ -18,7 +18,7 @@ const producer = await createProducer({ brokers: config.kafkaBrokers, clientId: config.kafkaClientId, }); -logStatus(`kafka producer connected; topic=${config.kafkaTopicNormSwapDemand}`); +logStatus(`kafka producer connected; raw_topic=${config.kafkaTopicRawNearIntentsQuote}; normalized_topic=${config.kafkaTopicNormSwapDemand}`); if (pairFilter) logStatus(`pair filter enabled: ${pairFilter[0]} <-> ${pairFilter[1]}`); process.on('SIGINT', async () => { @@ -36,5 +36,6 @@ await startNearIntentsWs({ 
wsUrl: config.nearIntentsWsUrl, pairFilter, producer, - topic: config.kafkaTopicNormSwapDemand, + rawTopic: config.kafkaTopicRawNearIntentsQuote, + normalizedTopic: config.kafkaTopicNormSwapDemand, }); diff --git a/src/bus/kafka.mjs b/src/bus/kafka.mjs deleted file mode 100644 index 091dc27..0000000 --- a/src/bus/kafka.mjs +++ /dev/null @@ -1,27 +0,0 @@ -import { Kafka } from 'kafkajs'; - -function brokersFromEnv() { - return (process.env.KAFKA_BROKERS || '127.0.0.1:9092') - .split(',') - .map((x) => x.trim()) - .filter(Boolean); -} - -export function createKafka() { - return new Kafka({ - clientId: process.env.KAFKA_CLIENT_ID || 'trading-system', - brokers: brokersFromEnv(), - }); -} - -export async function createProducer() { - const producer = createKafka().producer(); - await producer.connect(); - return producer; -} - -export async function createConsumer({ groupId }) { - const consumer = createKafka().consumer({ groupId }); - await consumer.connect(); - return consumer; -} diff --git a/src/bus/kafka/consumer.mjs b/src/bus/kafka/consumer.mjs index ceb886c..c9d7b47 100644 --- a/src/bus/kafka/consumer.mjs +++ b/src/bus/kafka/consumer.mjs @@ -1,6 +1,6 @@ import { Kafka } from 'kafkajs'; -function createKafka({ brokers = ['127.0.0.1:9092'], clientId = 'trading-system' } = {}) { +function createKafka({ brokers = ['127.0.0.1:9092'], clientId = 'unrip' } = {}) { return new Kafka({ clientId, brokers }); } diff --git a/src/bus/kafka/producer.mjs b/src/bus/kafka/producer.mjs index f674440..da4d55f 100644 --- a/src/bus/kafka/producer.mjs +++ b/src/bus/kafka/producer.mjs @@ -1,6 +1,6 @@ import { Kafka } from 'kafkajs'; -function createKafka({ brokers = ['127.0.0.1:9092'], clientId = 'trading-system' } = {}) { +function createKafka({ brokers = ['127.0.0.1:9092'], clientId = 'unrip' } = {}) { return new Kafka({ clientId, brokers }); } @@ -8,7 +8,7 @@ export async function createProducer(options = {}) { const producer = createKafka(options).producer(); await 
producer.connect(); return { - async sendJson(topic, event, { key = event?.eventId ?? event?.key ?? null } = {}) { + async sendJson(topic, event, { key = event?.event_id ?? event?.key ?? null } = {}) { await producer.send({ topic, messages: [{ key, value: JSON.stringify(event) }], diff --git a/src/core/env.mjs b/src/core/env.mjs deleted file mode 100644 index 9dc6325..0000000 --- a/src/core/env.mjs +++ /dev/null @@ -1,13 +0,0 @@ -import fs from 'node:fs'; - -export function loadDotenv(path = '.env') { - if (!fs.existsSync(path)) return; - const lines = fs.readFileSync(path, 'utf8').split(/\r?\n/); - for (const raw of lines) { - const line = raw.trim(); - if (!line || line.startsWith('#') || !line.includes('=')) continue; - const [key, ...rest] = line.split('='); - const value = rest.join('=').trim().replace(/^['"]|['"]$/g, ''); - if (!(key.trim() in process.env)) process.env[key.trim()] = value; - } -} diff --git a/src/core/event-envelope.mjs b/src/core/event-envelope.mjs index 632dc6a..38961cd 100644 --- a/src/core/event-envelope.mjs +++ b/src/core/event-envelope.mjs @@ -1,14 +1,41 @@ import crypto from 'node:crypto'; -export function makeEventEnvelope({ venue, eventType, payload, raw = null, key = null }) { +export function buildEventEnvelope({ + eventType, + venue, + payload, + source, + eventId = crypto.randomUUID(), + schemaVersion = 1, + observedAt = null, + ingestedAt = new Date(), + raw = null, +}) { + if (!eventType) throw new Error('Missing eventType'); + if (!venue) throw new Error('Missing venue'); + if (payload == null) throw new Error('Missing payload'); + return { - event_id: crypto.randomUUID(), - schema_version: 1, - venue, - event_type: eventType, - observed_at: new Date().toISOString(), - key, + event_id: String(eventId), + event_type: String(eventType), + venue: String(venue), + source: source ? 
String(source) : null, + schema_version: Number(schemaVersion), + observed_at: toIsoStringOrNull(observedAt), + ingested_at: toIsoStringOrNull(ingestedAt) ?? new Date().toISOString(), payload, raw, }; } + +export function parseEventMessage(value) { + const event = typeof value === 'string' ? JSON.parse(value) : value; + if (!event || typeof event !== 'object') throw new Error('Event must be an object'); + return event; +} + +function toIsoStringOrNull(value) { + if (value == null) return null; + const date = value instanceof Date ? value : new Date(value); + return Number.isNaN(date.getTime()) ? null : date.toISOString(); +} diff --git a/src/core/executor-state-store.mjs b/src/core/executor-state-store.mjs new file mode 100644 index 0000000..b208c10 --- /dev/null +++ b/src/core/executor-state-store.mjs @@ -0,0 +1,49 @@ +import fs from 'node:fs'; +import path from 'node:path'; + +export function createExecutorStateStore({ stateDir, fileName = 'commands.json' }) { + fs.mkdirSync(stateDir, { recursive: true }); + const filePath = path.join(stateDir, fileName); + const state = loadState(filePath); + + return { + get(commandId) { + return state[commandId] || null; + }, + markProcessing(commandId, metadata) { + state[commandId] = { + ...(state[commandId] || {}), + ...metadata, + status: 'processing', + updated_at: new Date().toISOString(), + }; + persistState(filePath, state); + return state[commandId]; + }, + markCompleted(commandId, metadata) { + state[commandId] = { + ...(state[commandId] || {}), + ...metadata, + status: 'completed', + updated_at: new Date().toISOString(), + }; + persistState(filePath, state); + return state[commandId]; + }, + }; +} + +function loadState(filePath) { + if (!fs.existsSync(filePath)) return {}; + try { + return JSON.parse(fs.readFileSync(filePath, 'utf8')); + } catch { + return {}; + } +} + +function persistState(filePath, state) { + const tempPath = `${filePath}.tmp`; + fs.writeFileSync(tempPath, JSON.stringify(state, null, 2)); + 
fs.renameSync(tempPath, filePath); +} diff --git a/src/core/schemas.mjs b/src/core/schemas.mjs new file mode 100644 index 0000000..50ca90d --- /dev/null +++ b/src/core/schemas.mjs @@ -0,0 +1,63 @@ +function requireString(value, field) { + if (typeof value !== 'string' || value.length === 0) throw new Error(`Missing ${field}`); +} + +function requireObject(value, field) { + if (!value || typeof value !== 'object' || Array.isArray(value)) throw new Error(`Missing ${field}`); +} + +export function assertEventEnvelope(event) { + requireObject(event, 'event'); + requireString(event.event_id, 'event.event_id'); + requireString(event.event_type, 'event.event_type'); + requireString(event.venue, 'event.venue'); + if (event.source != null) requireString(event.source, 'event.source'); + if (typeof event.schema_version !== 'number') throw new Error('Missing event.schema_version'); + requireString(event.ingested_at, 'event.ingested_at'); + requireObject(event.payload, 'event.payload'); + return event; +} + +export function assertNormalizedSwapDemand(event) { + assertEventEnvelope(event); + if (event.event_type !== 'swap_demand') throw new Error(`Unexpected event_type: ${event.event_type}`); + + const payload = event.payload; + requireString(payload.quote_id, 'payload.quote_id'); + requireString(payload.asset_in, 'payload.asset_in'); + requireString(payload.asset_out, 'payload.asset_out'); + if (payload.amount_in != null) requireString(payload.amount_in, 'payload.amount_in'); + if (payload.amount_out != null) requireString(payload.amount_out, 'payload.amount_out'); + if (payload.ttl_ms != null) requireString(payload.ttl_ms, 'payload.ttl_ms'); + return event; +} + +export function assertExecuteTradeCommand(event) { + assertEventEnvelope(event); + if (event.event_type !== 'execute_trade') throw new Error(`Unexpected event_type: ${event.event_type}`); + + const payload = event.payload; + requireString(payload.command_id, 'payload.command_id'); + 
requireString(payload.idempotency_key, 'payload.idempotency_key'); + requireString(payload.execution_key, 'payload.execution_key'); + requireString(payload.quote_id, 'payload.quote_id'); + requireString(payload.asset_in, 'payload.asset_in'); + requireString(payload.asset_out, 'payload.asset_out'); + if (payload.amount_in != null) requireString(payload.amount_in, 'payload.amount_in'); + if (payload.amount_out != null) requireString(payload.amount_out, 'payload.amount_out'); + return event; +} + +export function assertTradeResult(event) { + assertEventEnvelope(event); + if (event.event_type !== 'trade_result') throw new Error(`Unexpected event_type: ${event.event_type}`); + + const payload = event.payload; + requireString(payload.command_id, 'payload.command_id'); + requireString(payload.idempotency_key, 'payload.idempotency_key'); + requireString(payload.execution_key, 'payload.execution_key'); + requireString(payload.quote_id, 'payload.quote_id'); + requireString(payload.status, 'payload.status'); + if (payload.result_code != null) requireString(payload.result_code, 'payload.result_code'); + return event; +} diff --git a/src/lib/config.mjs b/src/lib/config.mjs index 9d7b604..4e261cb 100644 --- a/src/lib/config.mjs +++ b/src/lib/config.mjs @@ -3,9 +3,14 @@ import { loadDotenv } from './env.mjs'; const DEFAULTS = { nearIntentsWsUrl: 'wss://solver-relay-v2.chaindefuser.com/ws', kafkaBrokers: ['127.0.0.1:9092'], - kafkaClientId: 'trading-system', + kafkaClientId: 'unrip', + kafkaTopicRawNearIntentsQuote: 'raw.near_intents.quote', kafkaTopicNormSwapDemand: 'norm.swap_demand', + kafkaTopicCmdExecuteTrade: 'cmd.execute_trade', + kafkaTopicExecTradeResult: 'exec.trade_result', kafkaConsumerGroupDummy: 'dummy-reactor-v1', + kafkaConsumerGroupExecutor: 'dummy-executor-v1', + executorStateDir: './var/executor-state', }; function splitCsv(value) { @@ -16,6 +21,12 @@ function splitCsv(value) { } export function loadConfig({ envPath = '.env' } = {}) { + // Runtime config stays 
environment-first so the same app build works for: + // - local `.env` development + // - Docker/Compose + // - Kubernetes Secret/ConfigMap injection during Hetzner bootstrap + // This is what lets the local workstation bootstrap provision infra and then + // deploy the exact same image into k3s without app-level config rewrites. loadDotenv(envPath); return { @@ -25,9 +36,19 @@ export function loadConfig({ envPath = '.env' } = {}) { ? splitCsv(process.env.KAFKA_BROKERS) : DEFAULTS.kafkaBrokers, kafkaClientId: process.env.KAFKA_CLIENT_ID || DEFAULTS.kafkaClientId, + kafkaTopicRawNearIntentsQuote: + process.env.KAFKA_TOPIC_RAW_NEAR_INTENTS_QUOTE || DEFAULTS.kafkaTopicRawNearIntentsQuote, kafkaTopicNormSwapDemand: process.env.KAFKA_TOPIC_NORM_SWAP_DEMAND || DEFAULTS.kafkaTopicNormSwapDemand, + kafkaTopicCmdExecuteTrade: + process.env.KAFKA_TOPIC_CMD_EXECUTE_TRADE || DEFAULTS.kafkaTopicCmdExecuteTrade, + kafkaTopicExecTradeResult: + process.env.KAFKA_TOPIC_EXEC_TRADE_RESULT || DEFAULTS.kafkaTopicExecTradeResult, kafkaConsumerGroupDummy: process.env.KAFKA_CONSUMER_GROUP_DUMMY || DEFAULTS.kafkaConsumerGroupDummy, + kafkaConsumerGroupExecutor: + process.env.KAFKA_CONSUMER_GROUP_EXECUTOR || DEFAULTS.kafkaConsumerGroupExecutor, + executorStateDir: + process.env.EXECUTOR_STATE_DIR || DEFAULTS.executorStateDir, }; } diff --git a/src/lib/env.mjs b/src/lib/env.mjs index 9dc6325..1397e77 100644 --- a/src/lib/env.mjs +++ b/src/lib/env.mjs @@ -1,5 +1,9 @@ import fs from 'node:fs'; +// `.env` loading is a local/dev convenience only. +// In the repo-driven Hetzner+k3s bootstrap flow, Kubernetes injects runtime +// environment variables from Secrets/ConfigMaps and already-present process.env +// values always win over anything on disk. 
export function loadDotenv(path = '.env') { if (!fs.existsSync(path)) return; const lines = fs.readFileSync(path, 'utf8').split(/\r?\n/); diff --git a/src/lib/event-envelope.mjs b/src/lib/event-envelope.mjs deleted file mode 100644 index 445fb1a..0000000 --- a/src/lib/event-envelope.mjs +++ /dev/null @@ -1,37 +0,0 @@ -export function buildEventEnvelope({ - source, - venue, - eventType, - eventId, - occurredAt = null, - ingestedAt = new Date(), - payload, -}) { - if (!source) throw new Error('Missing source'); - if (!venue) throw new Error('Missing venue'); - if (!eventType) throw new Error('Missing eventType'); - if (!eventId) throw new Error('Missing eventId'); - - const ingestedDate = parseDate(ingestedAt) ?? new Date(); - - return { - source: String(source), - venue: String(venue), - eventType: String(eventType), - eventId: String(eventId), - occurredAt: toIsoStringOrNull(occurredAt), - ingestedAt: ingestedDate.toISOString(), - payload, - }; -} - -function toIsoStringOrNull(value) { - const date = parseDate(value); - return date ? date.toISOString() : null; -} - -function parseDate(value) { - if (value == null) return null; - const date = value instanceof Date ? value : new Date(value); - return Number.isNaN(date.getTime()) ? null : date; -} diff --git a/src/venues/near-intents/normalize.mjs b/src/venues/near-intents/normalize.mjs index fc8ab07..8bd60cc 100644 --- a/src/venues/near-intents/normalize.mjs +++ b/src/venues/near-intents/normalize.mjs @@ -1,4 +1,21 @@ -import { buildEventEnvelope } from '../../lib/event-envelope.mjs'; +import { buildEventEnvelope } from '../../core/event-envelope.mjs'; + +export function buildNearIntentsRawEnvelope(message, { ingestedAt = new Date() } = {}) { + const raw = isRecord(message) ? 
message : {}; + const quoteId = first(raw, ['quote_id', 'quoteRequestId', 'request_id', 'id', 'quote_hash']); + const occurredAt = first(raw, ['created_at', 'createdAt', 'timestamp', 'ts']); + + return buildEventEnvelope({ + source: 'near-intents.ws', + venue: 'near-intents', + eventType: 'near_intents_quote_raw', + eventId: quoteId || `near-intents-raw-${ingestedAt.getTime()}`, + observedAt: occurredAt, + ingestedAt, + payload: { message: raw }, + raw, + }); +} export function buildNearIntentsQuoteEnvelope(message, { ingestedAt = new Date() } = {}) { const raw = isRecord(message) ? message : {}; @@ -10,11 +27,12 @@ export function buildNearIntentsQuoteEnvelope(message, { ingestedAt = new Date() return buildEventEnvelope({ source: 'near-intents.ws', venue: 'near-intents', - eventType: 'quote', - eventId: payload.quoteId, - occurredAt, + eventType: 'swap_demand', + eventId: payload.quote_id, + observedAt: occurredAt, ingestedAt, payload, + raw, }); } @@ -25,12 +43,12 @@ export function normalizeNearIntentsQuote(message) { if (!quoteId || !assetIn || !assetOut) return null; return { - quoteId: String(quoteId), - assetIn: String(assetIn), - assetOut: String(assetOut), - amountIn: stringify(first(message, ['exact_amount_in', 'sellAmount', 'amount_in'])), - amountOut: stringify(first(message, ['exact_amount_out', 'buyAmount', 'amount_out', 'expectedOut', 'quoted_amount_out'])), - ttlMs: stringify(first(message, ['min_deadline_ms', 'ttl_ms', 'deadline_ms'])), + quote_id: String(quoteId), + asset_in: String(assetIn), + asset_out: String(assetOut), + amount_in: stringify(first(message, ['exact_amount_in', 'sellAmount', 'amount_in'])), + amount_out: stringify(first(message, ['exact_amount_out', 'buyAmount', 'amount_out', 'expectedOut', 'quoted_amount_out'])), + ttl_ms: stringify(first(message, ['min_deadline_ms', 'ttl_ms', 'deadline_ms'])), }; } diff --git a/src/venues/near-intents/ws.mjs b/src/venues/near-intents/ws.mjs index 8032a50..f18ef54 100644 --- 
a/src/venues/near-intents/ws.mjs +++ b/src/venues/near-intents/ws.mjs @@ -1,6 +1,7 @@ import { matchesPairFilter } from '../../core/pair-filter.mjs'; import { logStatus, startIdleHeartbeat } from '../../core/log.mjs'; -import { buildNearIntentsQuoteEnvelope } from './normalize.mjs'; +import { assertNormalizedSwapDemand } from '../../core/schemas.mjs'; +import { buildNearIntentsQuoteEnvelope, buildNearIntentsRawEnvelope } from './normalize.mjs'; const DEFAULT_WS_URL = 'wss://solver-relay-v2.chaindefuser.com/ws'; const QUOTE_SUB_ID = 1; @@ -11,7 +12,8 @@ export async function startNearIntentsWs({ wsUrl = DEFAULT_WS_URL, pairFilter, producer, - topic, + rawTopic, + normalizedTopic, onPublish = defaultOnPublish, }) { if (!apiKey) throw new Error('Missing NEAR_INTENTS_API_KEY'); @@ -63,17 +65,20 @@ export async function startNearIntentsWs({ if (quoteSubscriptionId && subscription && subscription !== quoteSubscriptionId) return; if (publishLocked) return; + const rawEnvelope = buildNearIntentsRawEnvelope(merged); const envelope = buildNearIntentsQuoteEnvelope(merged); if (!envelope) return; + assertNormalizedSwapDemand(envelope); - const assetIn = envelope.payload?.assetIn; - const assetOut = envelope.payload?.assetOut; + const assetIn = envelope.payload?.asset_in; + const assetOut = envelope.payload?.asset_out; if (!assetIn || !assetOut) return; if (!matchesPairFilter(assetIn, assetOut, pairFilter)) return; publishLocked = true; try { - await producer.sendJson(topic, envelope, { key: envelope.eventId }); + await producer.sendJson(rawTopic, rawEnvelope, { key: rawEnvelope.event_id }); + await producer.sendJson(normalizedTopic, envelope, { key: envelope.payload.quote_id }); publishedCount += 1; onPublish(envelope, publishedCount); } catch (error) {