From 15ec32bece9ba399db07becdf400e8eb44e1b223 Mon Sep 17 00:00:00 2001 From: Philipp Date: Sun, 29 Mar 2026 13:45:34 +0200 Subject: [PATCH] chore: reconcile hetzner bootstrap docs and state --- .env.example | 2 +- README.md | 266 +++----------- deploy/hetzner/README.md | 326 +++++------------- deploy/hetzner/cloud-init.k3s-first-node.yaml | 11 +- deploy/k8s/README.md | 9 +- .../overlays/hetzner-single-node/README.md | 88 ++++- .../secrets/forgejo.env.example | 1 - deploy/k8s/platform/base/forgejo-runner.yaml | 40 ++- deploy/k8s/platform/base/ingress-nginx.yaml | 73 ---- deploy/k8s/platform/base/namespace.yaml | 9 +- deploy/k8s/projects/README.md | 9 +- docs/bootstrap-status-report.md | 115 +----- docs/hetzner-k3s-bootstrap.md | 7 +- docs/hetzner-rebuild-pipeline.md | 117 +++++++ infra/terraform/hetzner/cloud-init.yaml.tftpl | 19 +- infra/terraform/hetzner/main.tf | 2 - infra/terraform/hetzner/outputs.tf | 4 - infra/terraform/hetzner/variables.tf | 13 +- scripts/hetzner/bootstrap-secrets.env.example | 8 +- scripts/hetzner/bootstrap.sh | 2 +- scripts/hetzner/configure-cloudflare-dns.sh | 26 +- scripts/hetzner/destroy.sh | 1 - 22 files changed, 415 insertions(+), 733 deletions(-) delete mode 100644 deploy/k8s/platform/base/ingress-nginx.yaml create mode 100644 docs/hetzner-rebuild-pipeline.md diff --git a/.env.example b/.env.example index 3a968f1..040b24b 100644 --- a/.env.example +++ b/.env.example @@ -36,4 +36,4 @@ EXECUTOR_STATE_DIR=/var/lib/unrip/executor-state # - optional DNS provider creds via *_PASS or direct env vars # # Future k3s deployment should source the app values from Kubernetes Secret/ConfigMap. -# Hetzner bootstrap path clones the repo to /opt/unrip/repo for later deploy/k8s assets. +# Hetzner provisioning is workstation-driven after Terraform; cloud-init no longer clones this repo onto the node. 
diff --git a/README.md b/README.md index ecce6f1..03a6b24 100644 --- a/README.md +++ b/README.md @@ -38,7 +38,6 @@ src/ compose.yml Dockerfile docs/contracts.md -deploy/hetzner/README.md ``` ## Event flow @@ -69,136 +68,65 @@ Current topics: - `cmd.execute_trade` - `exec.trade_result` -## Primary deployment path: repo-driven Hetzner bootstrap +## Canonical deployment path -The primary production path is no longer a Compose-only VM workflow. +The canonical production path is the repo-driven Hetzner + k3s bootstrap flow. +Compose still exists for local development and optional single-machine testing, but it is not the primary production story. -The intended operating model is: -- Terraform provisions a Hetzner single-node environment -- cloud-init installs k3s automatically on first boot -- a local operator workstation performs the first repo-driven bootstrap -- Kubernetes manifests install Redpanda, the app workloads, Forgejo, runner, registry, and ingress-related components -- once the in-cluster Git + CI stack is alive, routine app deploys move to self-hosted CI - -This is a two-phase model: -- **Phase 0:** local workstation bootstrap of a brand-new cluster -- **Phase 1:** self-hosted Forgejo + runner takes over app delivery - -Compose still exists for local development and optional single-machine testing, but it is not the canonical production story. 
- -## Prerequisites for first deployment - -Install locally on the operator workstation: -- Terraform `>= 1.6` -- `kubectl` -- `docker` -- `curl` - -You also need: -- a Hetzner Cloud API token -- a local SSH public key file for Terraform node provisioning -- DNS control for your chosen base domain and Forgejo hostname -- preferably a Tailscale tailnet and auth key for private admin/control-plane access -- the repo checked out locally - -## Required bootstrap secrets and inputs - -Create the bootstrap env file: - -```bash -cp scripts/hetzner/bootstrap-secrets.env.example scripts/hetzner/bootstrap-secrets.env -``` - -Set at least: -- `HCLOUD_TOKEN` -- `SSH_PUBLIC_KEY_PATH` -- `PUBLIC_DOMAIN` -- recommended: - - `TAILSCALE_AUTH_KEY` - - `TAILSCALE_CONTROL_PLANE_HOSTNAME` -- optional fallback: - - `TF_ADMIN_CIDR_BLOCKS` -- `BASE_DOMAIN` -- `FORGEJO_DOMAIN` -- `FORGEJO_ROOT_URL` -- `REGISTRY_DOMAIN` -- `LETSENCRYPT_EMAIL` -- `REGISTRY_USERNAME` -- `REGISTRY_PASSWORD` -- `NEAR_INTENTS_API_KEY` -- `FORGEJO_RUNNER_REGISTRATION_TOKEN` -- optional DNS automation: - - Cloudflare: - - `CLOUDFLARE_API_TOKEN` - - `CLOUDFLARE_ZONE_ID` - - Porkbun: - - `PORKBUN_API_KEY` - - `PORKBUN_SECRET_API_KEY` - -Then load them: - -```bash -source scripts/hetzner/bootstrap-secrets.env -``` - -## First bootstrap sequence - -Run the end-to-end bootstrap from repo root: - -```bash -bash scripts/hetzner/bootstrap.sh -``` - -Current repo behavior of that script: -1. runs Terraform in `infra/terraform/hetzner` -2. optionally creates DNS records for the base, Forgejo, and registry hosts via Cloudflare or Porkbun -3. if configured, joins the node to Tailscale and prefers the Tailscale control-plane hostname for Kubernetes API access -4. waits for SSH and the k3s API endpoint to become ready -5. fetches the real k3s kubeconfig from the node and writes it to `.state/hetzner/kubeconfig.yaml` -6. renders the Hetzner single-node overlay from local operator inputs -7. creates registry pull/auth secrets -8. 
applies the Kubernetes bootstrap manifests -9. builds the app image locally and imports it into k3s on the node -10. performs the first rollout using the imported bootstrap image - -Use the generated kubeconfig afterward: - -```bash -export KUBECONFIG=$PWD/.state/hetzner/kubeconfig.yaml -kubectl get nodes -o wide -kubectl get pods -A -kubectl -n unrip get deploy,pods -kubectl -n forgejo get deploy,pods,svc -``` - -## What is deployed into k3s - -The repo-managed Kubernetes assets are under `deploy/k8s/`. - -Current single-node target includes resources for: +Current single-node cluster stack includes: - `unrip` workloads in namespace `unrip` - Redpanda - Forgejo - Forgejo runner - private registry -- ingress-nginx namespace/resources -- cert-manager namespace/resources -- ACME issuers and ingress definitions -- a bootstrap job for Redpanda topic creation +- cert-manager +- Traefik via the k3s bundled ingress controller +- Grafana +- Loki +- Promtail +- Headlamp -Shared platform namespaces: -- `forgejo` -- `registry` -- `ingress-nginx` -- `cert-manager` +### Bootstrap entrypoint -Project-specific namespaces: -- `unrip` -- future projects should get their own namespace rather than sharing `unrip` +```bash +cp scripts/hetzner/bootstrap-secrets.env.example scripts/hetzner/bootstrap-secrets.env +source scripts/hetzner/bootstrap-secrets.env +bash scripts/hetzner/bootstrap.sh +``` -Important current-state nuance: -- the bootstrap script currently applies `deploy/k8s/base` -- the longer-term intended target is `deploy/k8s/overlays/hetzner-single-node` +The bootstrap script now: +1. provisions or updates Hetzner infra with Terraform +2. optionally manages DNS via Cloudflare or Porkbun +3. prefers Tailscale for admin/control-plane access when configured +4. fetches kubeconfig from the node into `.state/hetzner/kubeconfig.yaml` +5. renders `.state/hetzner/generated-overlay/` from repo manifests plus local secrets +6. applies platform and project resources to k3s +7. 
bootstraps Forgejo admin, runner, repo, and Actions configuration +8. seeds this repo into Forgejo +9. lets Forgejo Actions perform the default build/push/deploy path +10. stores the generated Headlamp login token in `pass` when `HEADLAMP_ADMIN_TOKEN_PASS` is configured + +Detailed bootstrap and destroy documentation lives in: +- `docs/hetzner-k3s-bootstrap.md` +- `docs/hetzner-self-hosted-ci-runbook.md` +- `docs/k8s-observability.md` +- `deploy/hetzner/README.md` +- `deploy/k8s/README.md` +- `deploy/k8s/overlays/hetzner-single-node/README.md` + +### Runtime surfaces + +- Forgejo: `https://git.doran.133011.xyz/` +- Registry: `https://registry.doran.133011.xyz/` +- Grafana: `https://grafana.doran.133011.xyz/` +- Headlamp: `https://headlamp.doran.133011.xyz/` + +### Operator notes + +- Ingress is Traefik-based. The old ingress-nginx path is obsolete. +- Grafana is for historical log search. +- Headlamp is for browsing workloads, pods, events, and pod logs. +- Use `pass`-backed `*_PASS` variables for secrets whenever possible. 
## Executor persistence in k3s @@ -208,106 +136,12 @@ Current persistence boundary: - app env uses `EXECUTOR_STATE_DIR=/var/lib/unrip/executor-state` - in Kubernetes, the executor deployment mounts storage at that path - the Hetzner single-node overlay pins storage to the k3s `local-path` storage class -- cloud-init also prepares the host directory boundary for executor state on first boot Operational meaning: - executor state lives on node-backed storage in the single-node k3s environment - if that PVC or underlying node storage is lost, duplicate-suppression history is lost too - treat executor persistence as part of the minimal durable state of the cluster -## Failure recovery and operator checks - -### If bootstrap fails before Terraform completes -Re-run after fixing the local input problem: -- missing token -- invalid CIDRs -- invalid SSH public key path - -If the infrastructure must be torn down: - -```bash -source scripts/hetzner/bootstrap-secrets.env -bash scripts/hetzner/destroy.sh -``` - -### If Terraform succeeds but Kubernetes is not ready -Check the public API and cluster state from the workstation: - -```bash -export KUBECONFIG=$PWD/.state/hetzner/kubeconfig.yaml -kubectl get nodes -o wide -kubectl get pods -A -kubectl get events -A --sort-by=.lastTimestamp | tail -n 50 -``` - -Typical next checks: -- cloud-init may still be finishing -- k3s may still be starting -- a workload may be crash-looping due to missing secret values or image-delivery issues - -### If workloads do not roll out -Inspect the affected namespace: - -```bash -kubectl -n unrip get pods -kubectl -n unrip describe pod -kubectl -n unrip logs deploy/dummy-executor --tail=100 -kubectl -n forgejo logs deploy/forgejo --tail=100 -``` - -### If you need to recreate secrets -The workstation bootstrap creates these Secrets: -- `unrip/unrip-secrets` -- `forgejo/forgejo-secrets` - -Verify them: - -```bash -kubectl -n unrip get secret unrip-secrets -kubectl -n forgejo get secret forgejo-secrets 
-``` - -### Current known limitations -Current colony state already identified an important gap: -- bootstrap and CI are not yet fully production-hardened, even though the first deploy path now fetches the real kubeconfig and imports the bootstrap image directly into k3s - -Treat the current bootstrap as a repo-driven first-deploy path suitable for testing, with hardening still pending. - -## Self-hosted CI handoff - -After cluster bootstrap: -- open Forgejo at `https://${FORGEJO_DOMAIN}` -- seed or push this repo into Forgejo -- create Forgejo repository secrets: - - `KUBECONFIG_B64` - - `REGISTRY_USERNAME` - - `REGISTRY_PASSWORD` -- create Forgejo repository variables: - - `REGISTRY_HOST=${REGISTRY_DOMAIN}` - - optional: `PROJECT_NAME=unrip` - - optional: `PROJECT_NAMESPACE=unrip` - - optional: `PROJECT_DEPLOYMENTS=near-intents-ingest,dummy-reactor,dummy-executor,dummy-consumer` -- push to `main` - -Routine application deploys then follow `.forgejo/workflows/deploy.yml`: -- build image as `REGISTRY_HOST/PROJECT_NAME:${GIT_SHA}` -- push to the private registry -- `kubectl set image` for each deployment listed in `PROJECT_DEPLOYMENTS` inside `PROJECT_NAMESPACE` -- wait for rollout - -If project variables are omitted, the workflow defaults to the current repo project: -- `PROJECT_NAME=unrip` -- `PROJECT_NAMESPACE=unrip` -- `PROJECT_DEPLOYMENTS=near-intents-ingest,dummy-reactor,dummy-executor,dummy-consumer` - -Infrastructure changes remain Terraform-driven from the operator workstation unless and until that responsibility is also automated. - -For the detailed operator runbooks, see: -- `docs/hetzner-k3s-bootstrap.md` -- `docs/hetzner-self-hosted-ci-runbook.md` -- `deploy/k8s/projects/README.md` -- `docs/next-session-architecture.md` - ## Local development with Compose Compose remains available for local development and debugging. 
@@ -365,4 +199,4 @@ KAFKA_TOPIC_EXEC_TRADE_RESULT=exec.trade_result KAFKA_CONSUMER_GROUP_DUMMY=dummy-reactor-v1 KAFKA_CONSUMER_GROUP_EXECUTOR=dummy-executor-v1 EXECUTOR_STATE_DIR=/var/lib/unrip/executor-state -``` \ No newline at end of file +``` diff --git a/deploy/hetzner/README.md b/deploy/hetzner/README.md index 381321d..668c696 100644 --- a/deploy/hetzner/README.md +++ b/deploy/hetzner/README.md @@ -1,275 +1,105 @@ -# Hetzner single-node bootstrap (Terraform + cloud-init + k3s) +# Hetzner single-node bootstrap -This is the canonical first-production deployment path for the repo. +This repo’s canonical infrastructure path is: -A local operator workstation drives the first deployment end to end: -- Terraform provisions Hetzner infrastructure -- cloud-init installs k3s automatically on first boot -- the workstation waits for the public Kubernetes API -- the workstation creates initial Kubernetes Secrets -- the workstation applies repo-managed Kubernetes manifests -- the workstation performs the first image/bootstrap delivery attempt -- once Forgejo + runner are alive, routine app deploys are intended to move to self-hosted CI +1. provision one Hetzner VM with Terraform +2. let cloud-init install k3s (and optionally Tailscale) +3. run `scripts/hetzner/bootstrap.sh` from the operator workstation +4. apply repo-managed platform + project manifests +5. bootstrap Forgejo, the runner, repo secrets/variables, and the first CI-driven deploy -Compose remains available for local development, but it is not the primary production deployment model. 
+## Source of truth -## Scope of this layer +Use these docs first: -The foundation under `infra/terraform/hetzner` provisions: -- one Hetzner Cloud server -- one SSH key resource based on your local public key -- firewall rules for SSH, Kubernetes API, and HTTP/HTTPS ingress -- a private network attachment for future growth -- cloud-init user-data for unattended k3s installation and host preparation +- `docs/hetzner-k3s-bootstrap.md` — bootstrap + destroy + required env +- `docs/hetzner-self-hosted-ci-runbook.md` — Forgejo/runner/CI flow +- `docs/k8s-observability.md` — Grafana, Loki, Promtail, Headlamp +- `deploy/k8s/README.md` — Kubernetes layout +- `deploy/k8s/overlays/hetzner-single-node/README.md` — overlay details -The repo bootstrap then applies the Hetzner single-node overlay under `deploy/k8s/overlays/hetzner-single-node`, which composes Kubernetes resources under `deploy/k8s/` for: -- shared platform namespaces and services -- Redpanda -- unrip workloads +## Current architecture + +Infrastructure under `infra/terraform/hetzner/` provisions: +- one Hetzner VM +- one firewall +- one private network attachment +- cloud-init for unattended k3s install + +Kubernetes platform services deployed from this repo: - Forgejo - Forgejo runner - private registry -- ingress/TLS-related resources -- Redpanda topic bootstrap job +- cert-manager +- Traefik via k3s bundled ingress controller +- Grafana +- Loki +- Promtail +- Headlamp -## Prerequisites +Project services deployed from this repo: +- Redpanda +- `near-intents-ingest` +- `dummy-reactor` +- `dummy-executor` +- `dummy-consumer` -Install on the operator workstation: -- Terraform `>= 1.6` -- `kubectl` -- `docker` -- `curl` +## Bootstrap model -You also need: -- a Hetzner Cloud API token -- an SSH keypair already present locally -- access to DNS for your chosen domains -- admin CIDRs that can reach the future server on `22/tcp` and `6443/tcp` -- this repo checked out locally +The current bootstrap is 
workstation-driven after Terraform. +cloud-init does **not** clone this repo onto the node. -## Required bootstrap secrets and inputs +`scripts/hetzner/bootstrap.sh` now: +- loads config and secrets from `scripts/hetzner/bootstrap-secrets.env` +- resolves `*_PASS` values through `pass` +- runs Terraform +- configures DNS through Cloudflare or Porkbun when credentials are present +- fetches kubeconfig from the node +- renders `.state/hetzner/generated-overlay/` +- applies platform + project manifests +- bootstraps Forgejo admin/user/repo/runner state +- seeds the repo into Forgejo +- lets Forgejo Actions perform the routine image build + deploy path by default -Prepare the operator env file: +Legacy local-image bootstrap still exists, but the default/steady-state path is Forgejo Actions. + +## Required operator inputs + +Create and source: ```bash cp scripts/hetzner/bootstrap-secrets.env.example scripts/hetzner/bootstrap-secrets.env -${EDITOR:-vi} scripts/hetzner/bootstrap-secrets.env -``` - -Set at least: -- `HCLOUD_TOKEN` -- `SSH_PUBLIC_KEY_PATH` -- `TF_ADMIN_CIDR_BLOCKS` -- `BASE_DOMAIN` -- `FORGEJO_DOMAIN` -- `FORGEJO_ROOT_URL` -- `NEAR_INTENTS_API_KEY` -- `FORGEJO_RUNNER_REGISTRATION_TOKEN` - -Load it into the current shell: - -```bash source scripts/hetzner/bootstrap-secrets.env ``` -## Canonical bootstrap sequence +At minimum you need: +- Hetzner credentials +- SSH public key path +- public domain settings +- registry credentials +- app secret(s) +- Forgejo admin credentials +- Grafana admin credentials -Run from repo root: +Recommended: +- Tailscale auth key for private admin/control-plane access +- DNS provider credentials +- `pass`-backed secret refs instead of raw env values -```bash -bash scripts/hetzner/bootstrap.sh -``` +## Current live/public surfaces -Current behavior of the script: -1. validates local tooling -2. runs `terraform init` and `terraform apply` in `infra/terraform/hetzner` -3. 
reads Terraform outputs such as server IP and `k3s_api_url` -4. waits for the k3s API readiness endpoint -5. writes a local workstation kubeconfig to `.state/hetzner/kubeconfig.yaml` -6. writes overlay secret env input files and creates: - - `unrip/unrip-secrets` - - `unrip/unrip-registry-creds` - - `forgejo/forgejo-secrets` - - `registry/registry-secrets` -7. applies `deploy/k8s/platform/base/namespace.yaml` and `deploy/k8s/overlays/hetzner-single-node` -8. builds the repo bootstrap image locally -9. pushes it through the temporary local registry bridge using the active project name -10. updates and waits for rollout status in the active project namespace +- Forgejo: `https://git.doran.133011.xyz/` +- Registry: `https://registry.doran.133011.xyz/` +- Grafana: `https://grafana.doran.133011.xyz/` +- Headlamp: `https://headlamp.doran.133011.xyz/` -After the script finishes: +## Notes -```bash -export KUBECONFIG=$PWD/.state/hetzner/kubeconfig.yaml -kubectl get nodes -o wide -kubectl get pods -A -kubectl -n unrip get deploy,pods,jobs -kubectl -n forgejo get deploy,pods,svc -kubectl -n registry get pods,svc -``` +- The Forgejo runner no longer reads a pre-seeded `runner_registration_token` from a secret. Bootstrap generates a one-time token in-cluster and persists the runner config on the Forgejo PVC. +- Registry auth is created imperatively during bootstrap from `REGISTRY_USERNAME` and `REGISTRY_PASSWORD`; manual overlay applies must provide `registry.htpasswd` themselves. +- Headlamp login uses a generated Kubernetes service-account token; bootstrap stores it in `pass` when `HEADLAMP_ADMIN_TOKEN_PASS` is configured. +- Ingress is Traefik-based. The old `ingress-nginx` path is obsolete. 
-## Current manifest target +## Status -Important current-state detail: -- `scripts/hetzner/bootstrap.sh` now applies `deploy/k8s/platform/base/namespace.yaml` -- it then applies `deploy/k8s/overlays/hetzner-single-node` -- bootstrap naming no longer assumes legacy `trading-system` kubeconfig contexts, image tags, or rollout namespaces - -## Executor persistence in k3s - -The dummy executor persists durable idempotency state. - -Current persistence model: -- application path: `EXECUTOR_STATE_DIR=/var/lib/unrip/executor-state` -- cloud-init prepares the host boundary for executor storage on first boot -- Kubernetes mounts storage at that same path for the executor workload -- the Hetzner single-node overlay pins PVC-backed storage to k3s `local-path` - -Operational consequence: -- executor duplicate-suppression state lives on node-backed persistent storage -- replacing the node or deleting the PVC without migration loses that history -- treat executor state as required operational data, even though the executor is still a dummy implementation - -## Failure recovery runbook - -### A. Bootstrap fails before infrastructure exists -Typical causes: -- invalid `HCLOUD_TOKEN` -- wrong `SSH_PUBLIC_KEY_PATH` -- malformed `TF_ADMIN_CIDR_BLOCKS` - -Fix the input and rerun: - -```bash -source scripts/hetzner/bootstrap-secrets.env -bash scripts/hetzner/bootstrap.sh -``` - -If you need to destroy partially created infrastructure: - -```bash -source scripts/hetzner/bootstrap-secrets.env -bash scripts/hetzner/destroy.sh -``` - -### B. 
Terraform succeeds but cluster access is not usable -Verify the generated kubeconfig and cluster health: - -```bash -export KUBECONFIG=$PWD/.state/hetzner/kubeconfig.yaml -kubectl get nodes -o wide -kubectl get pods -A -kubectl get events -A --sort-by=.lastTimestamp | tail -n 50 -``` - -What to suspect first: -- cloud-init still running -- k3s still starting -- bootstrap kubeconfig/auth not fully aligned yet -- public API reachable, but workloads not yet healthy - -### C. Secrets were wrong or missing -The current bootstrap depends on: -- `${PROJECT_NAME:-unrip}-secrets` - - `NEAR_INTENTS_API_KEY` -- `forgejo-secrets` - - `root_url` - - `domain` - - `runner_registration_token` - -Verify: - -```bash -kubectl -n unrip get secret unrip-secrets -kubectl -n unrip get secret unrip-registry-creds -kubectl -n forgejo get secret forgejo-secrets -kubectl -n registry get secret registry-secrets -``` - -If needed, recreate them from the workstation before restarting the affected deployments. - -### D. Workloads are present but not healthy -Inspect by namespace: - -```bash -kubectl -n unrip get pods -kubectl -n unrip describe pod -kubectl -n unrip logs deploy/dummy-executor --tail=100 -kubectl -n forgejo logs deploy/forgejo --tail=100 -kubectl -n forgejo logs deploy/forgejo-runner --tail=100 -``` - -Useful rollout checks: - -```bash -kubectl -n unrip rollout status deployment/near-intents-ingest --timeout=300s -kubectl -n unrip rollout status deployment/dummy-reactor --timeout=300s -kubectl -n unrip rollout status deployment/dummy-executor --timeout=300s -kubectl -n unrip rollout status deployment/dummy-consumer --timeout=300s -kubectl -n forgejo rollout status deployment/forgejo --timeout=300s -kubectl -n forgejo rollout status deployment/forgejo-runner --timeout=300s -``` - -### E. 
Need to inspect Terraform outputs directly - -```bash -cd infra/terraform/hetzner -terraform output -terraform output server_ipv4 -terraform output server_private_ipv4 -terraform output k3s_api_url -terraform output kubeconfig_strategy -``` - -## Self-hosted CI handoff - -After the cluster is reachable and workloads are up: -1. reach Forgejo at the configured domain or by port-forward -2. perform the initial admin/bootstrap steps in Forgejo -3. create the target repository in Forgejo -4. push or mirror this repo into that Forgejo instance -5. confirm the runner is registered and healthy -6. move routine application deploys to the self-hosted pipeline, which now derives image naming and rollout targets from Forgejo repository variables instead of hard-coding the legacy project - -Current repo-state caveats already known: -- first bootstrap is repo-driven from the workstation -- the bootstrap path no longer relies on SSH/scp transport in control flow -- the kubeconfig/auth result is not yet fully production-hardened -- first rollout still uses a temporary local registry bridge; routine CI deploys are intended to be registry-native and the Forgejo workflow now defaults to `unrip` while allowing per-repo overrides for image name, namespace, and deployment list -- Forgejo admin creation, repo creation, and Actions configuration still require operator action after cluster bring-up -- DNS automation is currently wired for Cloudflare when credentials are supplied during bootstrap -- TLS is expected to come from cert-manager + Let's Encrypt once ingress hostnames resolve publicly - -## Terraform-only usage - -If you only want the infra layer: - -```bash -cd infra/terraform/hetzner -export TF_VAR_hcloud_token="" -export TF_VAR_ssh_public_key="$(cat ~/.ssh/id_ed25519.pub)" -export TF_VAR_admin_cidr_blocks='["203.0.113.10/32"]' - -terraform init -terraform apply -``` - -Useful outputs: -- `server_ipv4` -- `server_private_ipv4` -- `server_name` -- `server_fqdn` -- `k3s_api_url` 
-- `kubeconfig_strategy` - -For CI/CD details, also see: -- `docs/hetzner-k3s-bootstrap.md` -- `docs/hetzner-self-hosted-ci-runbook.md` - -## Compose status - -Compose is still useful for: -- local development -- fast topology debugging -- non-production single-machine testing - -But it should be treated as optional/dev runtime support, not as the primary production deployment path. +This path has been rebuilt successfully and the cluster is operational, but if you want the strongest reproducibility guarantee after any new platform addition, do one more full destroy/rebuild rehearsal. diff --git a/deploy/hetzner/cloud-init.k3s-first-node.yaml b/deploy/hetzner/cloud-init.k3s-first-node.yaml index fbb8f9b..ac764df 100644 --- a/deploy/hetzner/cloud-init.k3s-first-node.yaml +++ b/deploy/hetzner/cloud-init.k3s-first-node.yaml @@ -4,7 +4,6 @@ package_upgrade: true packages: - ca-certificates - curl - - git - gnupg - jq - nfs-common @@ -58,17 +57,11 @@ write_files: BOOTSTRAP_PROJECT_NAME=unrip BOOTSTRAP_PROJECT_NAMESPACE=unrip K3S_KUBECONFIG=/opt/bootstrap/kubeconfig-internal.yaml - BOOTSTRAP_REPO_DIR=/opt/unrip/repo - BOOTSTRAP_MANIFEST_DIR=/opt/unrip/repo/deploy/k8s + BOOTSTRAP_MANIFEST_SOURCE=operator-workstation GITOPS_HANDOFF=seed-self-hosted-git-and-runner EOF chmod 0644 /usr/local/share/unrip/bootstrap-metadata.env - install -d -m 0755 /opt/unrip - if [ ! -d /opt/unrip/repo/.git ]; then - git clone --depth 1 ${BOOTSTRAP_REPO_URL:-https://example.invalid/bootstrap-repo.git} /opt/unrip/repo || true - fi - install -d -m 0755 /opt/bootstrap cp /etc/rancher/k3s/k3s.yaml /opt/bootstrap/kubeconfig-internal.yaml chmod 0640 /opt/bootstrap/kubeconfig-internal.yaml @@ -79,7 +72,7 @@ write_files: This node was provisioned by Terraform + cloud-init. Use /opt/bootstrap/kubeconfig-internal.yaml for automation. Bootstrap metadata lives at /usr/local/share/unrip/bootstrap-metadata.env. - Future Kubernetes bootstrap assets should live under /opt/unrip/repo/deploy/k8s. 
+ Kubernetes bootstrap assets are applied from the operator workstation after provisioning. EOF chmod 0644 /opt/bootstrap/README.txt diff --git a/deploy/k8s/README.md b/deploy/k8s/README.md index bb83127..86daa54 100644 --- a/deploy/k8s/README.md +++ b/deploy/k8s/README.md @@ -13,9 +13,10 @@ Shared platform namespaces: - `forgejo` - `registry` - `observability` (`grafana`, `loki`, `promtail`, `headlamp`) -- `ingress-nginx` - `cert-manager` +Ingress is provided by the Traefik controller bundled with k3s. Base and overlay manifests therefore target `ingressClassName: traefik` instead of installing ingress-nginx. + Project-specific namespaces: - `unrip` - future projects should get their own namespace instead of sharing `unrip` @@ -27,7 +28,9 @@ After Terraform/cloud-init has produced a working kubeconfig, the canonical path bash scripts/hetzner/bootstrap.sh ``` -That script renders the Hetzner overlay inputs, creates platform and project registry auth secrets using the active project naming, and applies: +That script renders the Hetzner overlay inputs, creates platform and project registry auth secrets using the active project naming, and applies the generated bootstrap overlay under `.state/hetzner/generated-overlay/`. + +For a manual, fully checked-in apply path, use: ```bash kubectl apply -k deploy/k8s/overlays/hetzner-single-node @@ -41,4 +44,4 @@ The overlay intentionally references generated or pre-created Secrets instead of - `observability/observability-secrets` - `registry/registry-secrets` -The bootstrap script creates them from local environment variables. By default it targets the `unrip` project, but its kubeconfig context name, bootstrap image tag, project secret env filename, project namespace, and project registry secret name are derived from `PROJECT_NAME`, `PROJECT_NAMESPACE`, and `CLUSTER_NAME` instead of hard-coding legacy `trading-system` values. +The bootstrap script creates them from local environment variables and `pass`-resolved secrets. 
By default it targets the `unrip` project, but project secret env filenames, namespaces, image names, rollout targets, and registry pull-secret names are derived from `PROJECT_NAME` and `PROJECT_NAMESPACE` instead of hard-coding legacy `trading-system` values. diff --git a/deploy/k8s/overlays/hetzner-single-node/README.md b/deploy/k8s/overlays/hetzner-single-node/README.md index b50ee63..5f78a4b 100644 --- a/deploy/k8s/overlays/hetzner-single-node/README.md +++ b/deploy/k8s/overlays/hetzner-single-node/README.md @@ -2,34 +2,106 @@ This overlay turns the shared platform and `unrip` project bases into a concrete first-node bootstrap target for the Terraform-provisioned k3s VM. -## Before apply -Create real secret material from the examples: +The checked-in overlay is the declarative template. For first-cluster bootstrap, `scripts/hetzner/bootstrap.sh` renders a generated overlay under `.state/hetzner/generated-overlay/` and applies that generated copy as the source of truth for the run. + +## Two ways to use this overlay + +### 1. Recommended: `scripts/hetzner/bootstrap.sh` +This is the intended operator workflow for a fresh Hetzner cluster. The bootstrap script renders secret and patch inputs from local env and `pass`, creates imperative registry secrets, and applies a generated Kustomize overlay. + +That generated overlay now imports the platform resources from `deploy/k8s/platform/base/kustomization.yaml`, so new checked-in platform components such as observability manifests are included automatically during bootstrap instead of being silently skipped by a hard-coded file list. 
+ +Bootstrap overwrites these operator-worktree files on each run: +- `deploy/k8s/overlays/hetzner-single-node/secrets/unrip.env` +- `deploy/k8s/overlays/hetzner-single-node/secrets/forgejo.env` +- `deploy/k8s/overlays/hetzner-single-node/secrets/observability.env` + +Bootstrap also renders and applies generated copies of these patch files under `.state/hetzner/generated-overlay/` instead of modifying the checked-in overlay files directly: +- `ingress-hosts.patch.yaml` +- `issuer-email.patch.yaml` +- `storage-class.patch.yaml` + +Secret/config sources when using bootstrap: +- from `pass` or direct env overrides via `scripts/hetzner/bootstrap-secrets.env`: + - `HCLOUD_TOKEN` + - `TAILSCALE_AUTH_KEY` + - `CLOUDFLARE_API_TOKEN` + - `CLOUDFLARE_ZONE_ID` + - `PORKBUN_API_KEY` + - `PORKBUN_SECRET_API_KEY` + - `REGISTRY_PASSWORD` + - `NEAR_INTENTS_API_KEY` + - `FORGEJO_ADMIN_PASSWORD` + - optional `GRAFANA_ADMIN_PASSWORD` (bootstrap generates one if omitted) +- from plain env/non-secret config in `scripts/hetzner/bootstrap-secrets.env`: + - `PUBLIC_DOMAIN`, `BASE_DOMAIN`, `FORGEJO_DOMAIN`, `FORGEJO_ROOT_URL`, `REGISTRY_DOMAIN`, `GRAFANA_DOMAIN`, `GRAFANA_ROOT_URL`, `HEADLAMP_DOMAIN` + - default hostname model under `PUBLIC_DOMAIN`: `git.${PUBLIC_DOMAIN}`, `registry.${PUBLIC_DOMAIN}`, `grafana.${PUBLIC_DOMAIN}`, `headlamp.${PUBLIC_DOMAIN}` + - `LETSENCRYPT_EMAIL` + - `REGISTRY_USERNAME` + - `FORGEJO_ADMIN_USERNAME`, `FORGEJO_ADMIN_EMAIL` + - optional `GRAFANA_ADMIN_USERNAME` (defaults to `admin`) + - optional project overrides such as `PROJECT_NAME`, `PROJECT_NAMESPACE`, and `PROJECT_SECRET_ENV_BASENAME` + +Bootstrap materializes Kubernetes inputs like this: +- `secrets/unrip.env` gets `NEAR_INTENTS_API_KEY` +- `secrets/forgejo.env` gets only `root_url` and `domain` +- `secrets/observability.env` gets `grafana_admin_user`, `grafana_admin_password`, and `grafana_root_url` +- generated overlay Kustomize secret generators create `observability-secrets` in namespace 
`observability` alongside the project and Forgejo secrets +- `registry-secrets` in namespace `registry` is created imperatively from `REGISTRY_USERNAME` and `REGISTRY_PASSWORD` +- `<project>-registry-creds` image pull secret is created imperatively in the project namespace from the same registry credentials + +Note: the Forgejo runner no longer reads `runner_registration_token` from `forgejo-secrets`. `scripts/hetzner/bootstrap.sh` generates a one-time runner token in-cluster, registers the runner, and writes `/data/forgejo-runner/.runner` on the shared Forgejo PVC before restarting the runner deployment. + +### 2. Manual: `kubectl apply -k` +Use this only if you intentionally want to manage the checked-in overlay inputs yourself. In manual mode, the checked-in overlay remains the source of truth; in bootstrap mode, the generated overlay is the source of truth for what gets applied. + +Before apply, create or edit real local input files: ```bash cp deploy/k8s/overlays/hetzner-single-node/secrets/unrip.env.example deploy/k8s/overlays/hetzner-single-node/secrets/unrip.env cp deploy/k8s/overlays/hetzner-single-node/secrets/forgejo.env.example deploy/k8s/overlays/hetzner-single-node/secrets/forgejo.env +cp deploy/k8s/overlays/hetzner-single-node/secrets/observability.env.example deploy/k8s/overlays/hetzner-single-node/secrets/observability.env cp deploy/k8s/overlays/hetzner-single-node/secrets/registry.htpasswd.example deploy/k8s/overlays/hetzner-single-node/secrets/registry.htpasswd ``` -Update: -- ingress hosts in `ingress-hosts.patch.yaml` +Then update: +- ingress hosts in `ingress-hosts.patch.yaml` for Forgejo, Registry, Grafana, and Headlamp - ACME email in `issuer-email.patch.yaml` - project secret values in `secrets/unrip.env` -- Forgejo secret values in `secrets/forgejo.env` -- registry htpasswd in `secrets/registry.htpasswd` +- Forgejo secret values in `secrets/forgejo.env` (`root_url` and `domain` only) +- observability secret values in `secrets/observability.env` 
(`grafana_admin_user`, `grafana_admin_password`, `grafana_root_url`) + +Important manual-mode caveat: +- `kubectl apply -k deploy/k8s/overlays/hetzner-single-node` creates only the Kustomize-managed secrets from the checked-in files (`unrip-secrets`, `forgejo-secrets`, `observability-secrets`, and `registry-secrets` when `secrets/registry.htpasswd` exists) +- it does **not** create the project docker-registry pull secret +- if you skip `scripts/hetzner/bootstrap.sh`, you must create that pull secret separately before expecting image pulls or CI builds to work ## Apply +Bootstrap path: +```bash +bash scripts/hetzner/bootstrap.sh +``` + +Manual path: ```bash kubectl apply -k deploy/k8s/overlays/hetzner-single-node ``` ## What gets installed -- shared platform namespaces for registry, ingress, cert-manager, and Forgejo +- shared platform namespaces for registry, ingress, cert-manager, Forgejo, and observability - project namespace `unrip` - Redpanda plus a topic bootstrap job inside `unrip` - app worker deployments referencing `unrip-secrets` - Forgejo and Forgejo runner referencing `forgejo-secrets` -- private registry protected by htpasswd from `registry-secrets` +- private registry workload, which still requires the imperative `registry-secrets` auth secret to be created separately unless you used `scripts/hetzner/bootstrap.sh` - nginx ingress and ACME issuers for TLS +- observability ingress for Grafana and Headlamp, plus local-path PVC overrides for Grafana and Loki + +## Observability UI exposure policy +- Grafana and Headlamp are both wired into the Hetzner ingress/domain model. +- Use `grafana.${PUBLIC_DOMAIN}` / `headlamp.${PUBLIC_DOMAIN}` or explicit `GRAFANA_DOMAIN` / `HEADLAMP_DOMAIN` values. +- Grafana is the historical log search UI backed by Loki. +- Headlamp is the Kubernetes cluster UI for workloads, events, and pod logs. 
+- Grafana is authenticated through `observability-secrets`; Headlamp is authenticated with the generated Kubernetes service-account token that bootstrap stores in `pass` when `HEADLAMP_ADMIN_TOKEN_PASS` is configured. For future projects, do not reuse `unrip`; create a new project namespace and matching `-config`, `-secrets`, and `-registry-creds` resources. diff --git a/deploy/k8s/overlays/hetzner-single-node/secrets/forgejo.env.example b/deploy/k8s/overlays/hetzner-single-node/secrets/forgejo.env.example index cf698eb..e877222 100644 --- a/deploy/k8s/overlays/hetzner-single-node/secrets/forgejo.env.example +++ b/deploy/k8s/overlays/hetzner-single-node/secrets/forgejo.env.example @@ -1,3 +1,2 @@ root_url=https://git.unrip-bootstrap.example.com/ domain=git.unrip-bootstrap.example.com -runner_registration_token=replace-me diff --git a/deploy/k8s/platform/base/forgejo-runner.yaml b/deploy/k8s/platform/base/forgejo-runner.yaml index a4d9db2..610de7c 100644 --- a/deploy/k8s/platform/base/forgejo-runner.yaml +++ b/deploy/k8s/platform/base/forgejo-runner.yaml @@ -15,6 +15,20 @@ spec: spec: serviceAccountName: forgejo-runner restartPolicy: Always + initContainers: + - name: wait-for-runner-config + image: busybox:1.36 + command: ["/bin/sh", "-ec"] + args: + - >- + until [ -s /data/.runner ]; do + echo "waiting for bootstrap to write /data/.runner"; + sleep 5; + done + volumeMounts: + - name: forgejo-data + mountPath: /data + subPath: forgejo-runner containers: - name: runner image: code.forgejo.org/forgejo/runner:6.3.1 @@ -22,26 +36,18 @@ spec: runAsUser: 0 runAsGroup: 0 env: - - name: FORGEJO_INSTANCE_URL - valueFrom: - secretKeyRef: - name: forgejo-secrets - key: root_url - - name: FORGEJO_RUNNER_REGISTRATION_TOKEN - valueFrom: - secretKeyRef: - name: forgejo-secrets - key: runner_registration_token + - name: FORGEJO_RUNNER_CONFIG + value: /data/.runner command: ["/bin/sh", "-lc"] args: - >- - if [ ! 
-f /data/.runner ]; then - forgejo-runner register --no-interactive --name k3s-runner --instance "$FORGEJO_INSTANCE_URL" --token "$FORGEJO_RUNNER_REGISTRATION_TOKEN" --labels "linux-amd64:host"; - fi && - forgejo-runner daemon --config /data/.runner + test -s "$FORGEJO_RUNNER_CONFIG" && + forgejo-runner daemon --config "$FORGEJO_RUNNER_CONFIG" volumeMounts: - - name: runner-data + - name: forgejo-data mountPath: /data + subPath: forgejo-runner volumes: - - name: runner-data - emptyDir: {} + - name: forgejo-data + persistentVolumeClaim: + claimName: forgejo-data diff --git a/deploy/k8s/platform/base/ingress-nginx.yaml b/deploy/k8s/platform/base/ingress-nginx.yaml deleted file mode 100644 index 51bd042..0000000 --- a/deploy/k8s/platform/base/ingress-nginx.yaml +++ /dev/null @@ -1,73 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: ingress-nginx-controller - namespace: ingress-nginx -spec: - replicas: 1 - selector: - matchLabels: - app.kubernetes.io/name: ingress-nginx - app.kubernetes.io/component: controller - template: - metadata: - labels: - app.kubernetes.io/name: ingress-nginx - app.kubernetes.io/component: controller - spec: - serviceAccountName: default - containers: - - name: controller - image: registry.k8s.io/ingress-nginx/controller:v1.12.1 - args: - - /nginx-ingress-controller - - --ingress-class=nginx - - --controller-class=k8s.io/ingress-nginx - - --publish-service=$(POD_NAMESPACE)/ingress-nginx-controller - - --election-id=ingress-nginx-leader - - --enable-ssl-passthrough - env: - - name: POD_NAME - valueFrom: - fieldRef: - fieldPath: metadata.name - - name: POD_NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - ports: - - name: http - containerPort: 80 - - name: https - containerPort: 443 - securityContext: - allowPrivilegeEscalation: true - capabilities: - add: ["NET_BIND_SERVICE"] - drop: ["ALL"] - readinessProbe: - httpGet: - path: /healthz - port: 10254 - livenessProbe: - httpGet: - path: /healthz - port: 10254 
---- -apiVersion: v1 -kind: Service -metadata: - name: ingress-nginx-controller - namespace: ingress-nginx -spec: - type: LoadBalancer - selector: - app.kubernetes.io/name: ingress-nginx - app.kubernetes.io/component: controller - ports: - - name: http - port: 80 - targetPort: 80 - - name: https - port: 443 - targetPort: 443 diff --git a/deploy/k8s/platform/base/namespace.yaml b/deploy/k8s/platform/base/namespace.yaml index c556251..2242eb8 100644 --- a/deploy/k8s/platform/base/namespace.yaml +++ b/deploy/k8s/platform/base/namespace.yaml @@ -22,18 +22,13 @@ metadata: --- apiVersion: v1 kind: Namespace -metadata: - name: ingress-nginx - labels: - project.pi.io/type: platform ---- -apiVersion: v1 -kind: Namespace metadata: name: cert-manager labels: project.pi.io/type: platform --- +# Ingress is provided by the Traefik controller bundled with k3s. +# No separate ingress-nginx namespace is created by this base. apiVersion: v1 kind: Namespace metadata: diff --git a/deploy/k8s/projects/README.md b/deploy/k8s/projects/README.md index 96690a6..2113bda 100644 --- a/deploy/k8s/projects/README.md +++ b/deploy/k8s/projects/README.md @@ -6,14 +6,17 @@ This cluster is intended to host multiple independent projects. 
- shared platform namespaces: - `forgejo` - `registry` - - `ingress-nginx` + - `observability` - `cert-manager` +- shared ingress model: + - use the k3s-bundled Traefik controller + - project Ingress resources should set `ingressClassName: traefik` - per-project namespaces: - `unrip` - future examples: `project-foo`, `project-bar` ## How to add another project -For each new project, create a project manifest set similar to `deploy/k8s/base/unrip.yaml`: +For each new project, create a project manifest set similar to `deploy/k8s/projects/unrip/base/`: - one namespace - one project config map - one secret name unique to the project @@ -32,4 +35,4 @@ Recommended naming convention: ## Current project in this repo - project name: `unrip` - namespace: `unrip` -- project manifest: `deploy/k8s/base/unrip.yaml` +- project manifest: `deploy/k8s/projects/unrip/base/` diff --git a/docs/bootstrap-status-report.md b/docs/bootstrap-status-report.md index f4f2226..9edd155 100644 --- a/docs/bootstrap-status-report.md +++ b/docs/bootstrap-status-report.md @@ -1,105 +1,18 @@ -Status: partially successful, not fully healthy yet. +# Historical bootstrap status report -What worked -- Hetzner VM provisioned -- k3s installed and running -- node is `Ready` -- namespaces created -- Forgejo is up -- registry is up -- Redpanda is up -- `near-intents-ingest` is up +This file is retained only as an archive of an early, partially successful bootstrap attempt. +It does **not** describe the current cluster state or the current canonical bootstrap flow. 
-What is still broken -- `dummy-reactor`, `dummy-executor`, `dummy-consumer` are failing because Kafka/Redpanda topic metadata is not healthy yet: - - `This server does not host this topic-partition` -- ingress-nginx is crashing -- cert-manager webhook/cainjector are crashing -- so public HTTPS ingress is not ready -- therefore Git/registry/CI are not yet usable via domain names +For current operator documentation, use: +- `docs/hetzner-k3s-bootstrap.md` +- `docs/hetzner-self-hosted-ci-runbook.md` +- `docs/k8s-observability.md` +- `docs/hetzner-rebuild-pipeline.md` -So the honest report is: -- cluster bootstrap succeeded -- platform/app stack is only partially healthy -- we still need another fix pass before calling this “working” +Current reality has moved past the failures described in the old report: +- Traefik is the active ingress path +- cert-manager is healthy +- Forgejo, registry, Grafana, and Headlamp are reachable +- Forgejo Actions is the default deployment path -How to interact with it right now - -1. Use kubectl -```bash -export KUBECONFIG=$PWD/.state/hetzner/kubeconfig.yaml -kubectl get nodes -o wide -kubectl get pods -A -kubectl -n unrip get pods -kubectl -n forgejo get pods,svc -kubectl -n registry get pods,svc -``` - -2. Access Forgejo right now -Since ingress is broken, use port-forward: -```bash -kubectl -n forgejo port-forward svc/forgejo 3000:3000 -``` -Then open: -```text -http://127.0.0.1:3000 -``` - -3. Access the registry right now -Also via port-forward: -```bash -kubectl -n registry port-forward svc/registry 5000:5000 -``` -Then from your machine: -```bash -docker login 127.0.0.1:5000 -u unrip -``` -And push/pull like: -```bash -docker tag unrip:bootstrap 127.0.0.1:5000/unrip:test -docker push 127.0.0.1:5000/unrip:test -``` - -4. 
Watch logs -```bash -kubectl -n unrip logs deploy/near-intents-ingest -f -kubectl -n unrip logs deploy/dummy-reactor -f -kubectl -n unrip logs deploy/dummy-executor -f -kubectl -n unrip logs deploy/dummy-consumer -f -kubectl -n forgejo logs deploy/forgejo -f -kubectl -n registry logs deploy/registry -f -``` - -How Git would work once Forgejo is usable -After port-forward or later ingress: -```bash -git remote add forgejo http://127.0.0.1:3000//.git -git push forgejo main -``` - -How CI/CD is supposed to work -Intended flow: -1. code lives in Forgejo -2. Forgejo runner executes `.forgejo/workflows/deploy.yml` -3. workflow builds image -4. pushes image to registry -5. updates `unrip` deployments in Kubernetes - -Current reality: -- not ready yet -- because ingress/cert-manager are unhealthy -- and we haven’t verified a full Forgejo runner deploy cycle - -Bottom line -- Kubernetes cluster: yes -- server provisioning: yes -- basic platform pieces: partially -- usable Git/CI/CD stack: not yet -- unrip app pipeline: not yet - -Most important next fixes -1. fix k3s manifest/platform issues: - - ingress-nginx RBAC/crash - - cert-manager install/CRDs/RBAC -2. fix Redpanda/topic metadata issue so reactor/executor/consumer run -3. only then wire Forgejo + registry + CI as usable +If you need a historical failure log, use Git history for earlier revisions of this file. 
diff --git a/docs/hetzner-k3s-bootstrap.md b/docs/hetzner-k3s-bootstrap.md index 321c2af..0e2a0e7 100644 --- a/docs/hetzner-k3s-bootstrap.md +++ b/docs/hetzner-k3s-bootstrap.md @@ -102,6 +102,7 @@ Required values: - `FORGEJO_ADMIN_PASSWORD_PASS` or `FORGEJO_ADMIN_PASSWORD` - `GRAFANA_ADMIN_USERNAME` (defaults to `admin`) - `GRAFANA_ADMIN_PASSWORD_PASS` or `GRAFANA_ADMIN_PASSWORD` +- optional `HEADLAMP_ADMIN_TOKEN_PASS` for storing the generated Headlamp login token back into `pass` - optional repo settings: `FORGEJO_REPO_OWNER`, `FORGEJO_REPO_NAME`, `FORGEJO_REPO_PRIVATE` Optional for automatic DNS: @@ -127,6 +128,8 @@ Outputs: - overlay secrets and ingress host patches rendered from local env / `pass` - `.state/hetzner/generated-overlay/` rendered and applied as the canonical bootstrap manifest set for that run - namespaces, Redpanda, app deployments, Forgejo, registry, Traefik-targeted ingress resources, cert-manager, issuers, and any additional platform resources referenced by `deploy/k8s/platform/base/kustomization.yaml` applied +- Headlamp is deployed and wired to the configured public hostname model +- bootstrap stores the generated Headlamp service-account token in `pass` when `HEADLAMP_ADMIN_TOKEN_PASS` is configured - Forgejo admin account created automatically if missing - Forgejo runner registration is generated automatically from inside the Forgejo pod and the resulting `/data/.runner` config is stored under the shared `forgejo-data` persistent volume used by the runner deployment - Forgejo repository created automatically in either the admin user's namespace or a pre-existing organization named by `FORGEJO_REPO_OWNER` @@ -155,7 +158,7 @@ Supported scripted providers: - Porkbun TLS is handled in-cluster by cert-manager using Let's Encrypt issuers and the rendered ingress hosts. -Grafana is the default observability UI wired into the public hostname model. Keep Grafana authenticated. 
+Grafana and Headlamp are both wired into the public hostname model by default. Keep Grafana authenticated, and treat the Headlamp token as an operator credential. The platform base assumes the default k3s Traefik ingress controller is present; it does not install ingress-nginx. For clean-cluster applies, the base kustomization now includes cert-manager before the `ClusterIssuer` resources so the issuer CRs can be created in the same bootstrap flow. @@ -214,7 +217,7 @@ bash scripts/hetzner/destroy.sh `destroy.sh` reads `HCLOUD_TOKEN`, optional `TAILSCALE_AUTH_KEY`, optional DNS provider credentials, and optional Forgejo admin credentials via the same `*_PASS` mapping mechanism as bootstrap. It uses the same Terraform inputs as bootstrap for the infrastructure resources, then can optionally: -- delete the scripted DNS records for `${BASE_DOMAIN}`, `git.${BASE_DOMAIN}`, `registry.${BASE_DOMAIN}`, and `grafana.${BASE_DOMAIN}` +- delete the scripted DNS records for `${PUBLIC_DOMAIN}`, `git.${PUBLIC_DOMAIN}`, `registry.${PUBLIC_DOMAIN}`, `grafana.${PUBLIC_DOMAIN}`, and `headlamp.${PUBLIC_DOMAIN}` - remove local bootstrap artifacts under `.state/hetzner/`, `deploy/k8s/overlays/hetzner-single-node/generated/`, and the local Terraform working/state files in `infra/terraform/hetzner/` - delete the bootstrap-managed Forgejo repository via the Forgejo API diff --git a/docs/hetzner-rebuild-pipeline.md b/docs/hetzner-rebuild-pipeline.md new file mode 100644 index 0000000..1ea3dcb --- /dev/null +++ b/docs/hetzner-rebuild-pipeline.md @@ -0,0 +1,117 @@ +# Hetzner rebuild pipeline map + +This document summarizes the currently intended rebuild flow for the repo-driven Hetzner single-node cluster. + +It is a companion to the operator runbooks, not a competing source of truth. +Use these first for exact commands and required env: + +- `docs/hetzner-k3s-bootstrap.md` +- `docs/hetzner-self-hosted-ci-runbook.md` +- `docs/k8s-observability.md` + +## High-level rebuild sequence + +1. 
prepare `scripts/hetzner/bootstrap-secrets.env` +2. source it so `*_PASS` mappings resolve through `pass` +3. optionally run `scripts/hetzner/destroy.sh` +4. run `scripts/hetzner/bootstrap.sh` +5. let bootstrap: + - provision/update Hetzner infra with Terraform + - configure DNS when provider credentials are present + - fetch the real kubeconfig from the node + - render `.state/hetzner/generated-overlay/` + - apply platform + project manifests + - bootstrap Forgejo admin, runner, repo, and Actions configuration + - seed the repo into Forgejo + - trigger the normal Forgejo Actions build/push/deploy path +6. verify public/operator surfaces: + - Forgejo + - registry + - Grafana + - Headlamp +7. verify workload health and CI success + +## Ownership boundaries + +### Terraform owns +- Hetzner VM +- network +- firewall +- cloud-init user data + +### Cloud-init owns +- OS package prep +- optional Tailscale join +- k3s installation +- a marker file at `/opt/unrip/bootstrap/README.txt` + +Cloud-init does **not** clone this repo or apply Kubernetes manifests. 
+ +### Bootstrap script owns +- `pass`-resolved secret loading +- DNS automation +- kubeconfig retrieval/rendering +- generated overlay rendering under `.state/hetzner/generated-overlay/` +- imperative registry auth secret creation +- Forgejo bootstrap API calls +- repo seeding +- Headlamp token export to `pass` + +### Kubernetes manifests own +- platform services +- project services +- ingress/TLS resources +- observability stack +- persistent volume claims and workload specs + +## Current default runtime model + +Platform services: +- Forgejo +- Forgejo runner +- registry +- cert-manager +- Grafana +- Loki +- Promtail +- Headlamp + +Project services: +- Redpanda +- `near-intents-ingest` +- `dummy-reactor` +- `dummy-executor` +- `dummy-consumer` + +Ingress/controller model: +- Traefik bundled with k3s +- no ingress-nginx in the active path + +## Rebuild verification checklist + +After bootstrap, verify: + +```bash +export KUBECONFIG=$PWD/.state/hetzner/kubeconfig.yaml +kubectl get nodes -o wide +kubectl get pods -A +kubectl -n observability get deploy,ds,pods,svc,ingress,secrets +kubectl -n forgejo get deploy,pods,svc,ingress +kubectl -n registry get deploy,pods,svc,ingress +kubectl -n unrip get deploy,pods +``` + +Public/operator surfaces should respond: +- `https://git.${PUBLIC_DOMAIN}/` +- `https://registry.${PUBLIC_DOMAIN}/v2/` +- `https://grafana.${PUBLIC_DOMAIN}/` +- `https://headlamp.${PUBLIC_DOMAIN}/` + +CI should show a successful deploy workflow in Forgejo Actions. + +## Current caveat + +The core Hetzner/k3s/Forgejo path has been rebuilt successfully before. +Headlamp was added afterward and validated live on the rebuilt cluster, but a brand-new destroy/rebuild rehearsal with Headlamp included has not yet been re-run from zero. + +So the rebuild story is repo-driven and operationally close to fully reproducible, with one remaining value-add validation step: a final clean-room rebuild after the latest Headlamp/docs cleanup. 
diff --git a/infra/terraform/hetzner/cloud-init.yaml.tftpl b/infra/terraform/hetzner/cloud-init.yaml.tftpl index 936fe3d..eb4dc06 100644 --- a/infra/terraform/hetzner/cloud-init.yaml.tftpl +++ b/infra/terraform/hetzner/cloud-init.yaml.tftpl @@ -19,20 +19,17 @@ write_files: #!/usr/bin/env bash set -euo pipefail - install -d -m 0755 /opt/unrip - if [ ! -d /opt/unrip/repo/.git ]; then - git clone --branch ${bootstrap_repo_branch} ${bootstrap_repo_url} /opt/unrip/repo - else - git -C /opt/unrip/repo fetch --all --prune - git -C /opt/unrip/repo checkout ${bootstrap_repo_branch} - git -C /opt/unrip/repo pull --ff-only origin ${bootstrap_repo_branch} - fi - install -d -m 0755 /opt/unrip/bootstrap cat >/opt/unrip/bootstrap/README.txt <<'EOF' This node was provisioned by Terraform + cloud-init. - Future Kubernetes bootstrap assets should live in: - /opt/unrip/repo/${bootstrap_repo_path} + This cloud-init step no longer clones a bootstrap repository. + The current Hetzner flow remains workstation-driven after Terraform: + - scripts/hetzner/bootstrap.sh fetches kubeconfig from the node + - scripts/hetzner/bootstrap.sh renders secrets/overlays locally + - scripts/hetzner/bootstrap.sh applies Kubernetes manifests from the operator workstation + + Reserved for future node-local bootstrap/GitOps assets: + /opt/unrip/bootstrap/${bootstrap_repo_path} EOF - path: /etc/rancher/k3s/config.yaml permissions: '0644' diff --git a/infra/terraform/hetzner/main.tf b/infra/terraform/hetzner/main.tf index 93f5b55..04a7397 100644 --- a/infra/terraform/hetzner/main.tf +++ b/infra/terraform/hetzner/main.tf @@ -38,8 +38,6 @@ resource "hcloud_server" "trading_system" { node_name = var.name private_ipv4_address = var.private_ipv4_address public_domain = var.public_domain - bootstrap_repo_url = var.bootstrap_repo_url - bootstrap_repo_branch = var.bootstrap_repo_branch bootstrap_repo_path = var.bootstrap_repo_path tailscale_enabled = var.tailscale_enabled tailscale_auth_key = var.tailscale_auth_key 
diff --git a/infra/terraform/hetzner/outputs.tf b/infra/terraform/hetzner/outputs.tf index 745b574..d3e7e4d 100644 --- a/infra/terraform/hetzner/outputs.tf +++ b/infra/terraform/hetzner/outputs.tf @@ -26,10 +26,6 @@ output "kubeconfig_strategy" { value = var.tailscale_enabled ? "Use Tailscale for private Kubernetes API access; avoid public SSH/Kubernetes exposure in the canonical flow." : "Use the public Kubernetes API endpoint with an operator-supplied bootstrap credential; avoid SSH/scp kubeconfig retrieval in the canonical flow." } -output "bootstrap_repo_checkout" { - value = "/opt/unrip/repo" -} - output "bootstrap_marker_file" { value = "/opt/unrip/bootstrap/README.txt" } diff --git a/infra/terraform/hetzner/variables.tf b/infra/terraform/hetzner/variables.tf index 64ec6e6..9c0e6fd 100644 --- a/infra/terraform/hetzner/variables.tf +++ b/infra/terraform/hetzner/variables.tf @@ -93,19 +93,8 @@ variable "public_domain" { type = string } -variable "bootstrap_repo_url" { - description = "Git repository URL cloned onto the node for GitOps/bootstrap assets" - type = string -} - -variable "bootstrap_repo_branch" { - description = "Branch checked out for the bootstrap repository" - type = string - default = "main" -} - variable "bootstrap_repo_path" { - description = "Repository subdirectory expected to contain future Kubernetes bootstrap manifests/scripts" + description = "Reserved repository subdirectory name for a future node-local bootstrap/GitOps flow; current provisioning still applies manifests from the operator workstation" type = string default = "deploy/k8s" } diff --git a/scripts/hetzner/bootstrap-secrets.env.example b/scripts/hetzner/bootstrap-secrets.env.example index f1d75cc..8f811f8 100644 --- a/scripts/hetzner/bootstrap-secrets.env.example +++ b/scripts/hetzner/bootstrap-secrets.env.example @@ -57,7 +57,6 @@ export FORGEJO_ROOT_URL="${FORGEJO_ROOT_URL:-https://${FORGEJO_DOMAIN}/}" export REGISTRY_DOMAIN="${REGISTRY_DOMAIN:-registry.${PUBLIC_DOMAIN}}" 
export GRAFANA_DOMAIN="${GRAFANA_DOMAIN:-grafana.${PUBLIC_DOMAIN}}" export GRAFANA_ROOT_URL="${GRAFANA_ROOT_URL:-https://${GRAFANA_DOMAIN}/}" -export HEADLAMP_DOMAIN="${HEADLAMP_DOMAIN:-headlamp.${PUBLIC_DOMAIN}}" export LETSENCRYPT_EMAIL="${LETSENCRYPT_EMAIL:-ops@example.com}" # Optional DNS automation: choose one provider @@ -85,10 +84,13 @@ export FORGEJO_ADMIN_PASSWORD_PASS="${FORGEJO_ADMIN_PASSWORD_PASS:-$(pass_ref fo export GRAFANA_ADMIN_USERNAME="${GRAFANA_ADMIN_USERNAME:-admin}" export GRAFANA_ADMIN_PASSWORD_PASS="${GRAFANA_ADMIN_PASSWORD_PASS:-$(pass_ref grafana/admin-password)}" -# Optional storage path for the generated Headlamp admin login token. -# Bootstrap writes the in-cluster token here after Headlamp is available. export HEADLAMP_ADMIN_TOKEN_PASS="${HEADLAMP_ADMIN_TOKEN_PASS:-$(pass_ref headlamp/admin-token)}" +# Headlamp bootstrap token handling: +# - bootstrap stores the generated token in HEADLAMP_ADMIN_TOKEN_PASS when set +# - the current default public hostname is HEADLAMP_DOMAIN +# - for a stricter posture, you can still keep Headlamp private behind Tailscale or another admin path + # Optional explicit overrides for CI/testing: # export HCLOUD_TOKEN="..." # export REGISTRY_PASSWORD="..." 
diff --git a/scripts/hetzner/bootstrap.sh b/scripts/hetzner/bootstrap.sh index e78931d..efc3f3a 100755 --- a/scripts/hetzner/bootstrap.sh +++ b/scripts/hetzner/bootstrap.sh @@ -395,7 +395,7 @@ for attempt in $(seq 1 60); do sleep 2 done if [[ -z "$HEADLAMP_ADMIN_TOKEN" ]]; then - echo "warning: headlamp admin token not available yet; rerun bootstrap or read secret headlamp-admin-token manually" >&2 + echo "warning: headlamp admin token not available yet; read secret headlamp-admin-token manually if needed" >&2 elif [[ -n "${HEADLAMP_ADMIN_TOKEN_PASS:-}" ]]; then store_secret_to_pass "$HEADLAMP_ADMIN_TOKEN_PASS" "$HEADLAMP_ADMIN_TOKEN" echo "stored headlamp admin token in pass: $HEADLAMP_ADMIN_TOKEN_PASS" diff --git a/scripts/hetzner/configure-cloudflare-dns.sh b/scripts/hetzner/configure-cloudflare-dns.sh index 2c8ee3b..9612ec3 100755 --- a/scripts/hetzner/configure-cloudflare-dns.sh +++ b/scripts/hetzner/configure-cloudflare-dns.sh @@ -62,22 +62,28 @@ records=( "headlamp.$PUBLIC_DOMAIN" ) +ROOT_RECORD="${records[0]}" +GIT_RECORD="${records[1]}" +REGISTRY_RECORD="${records[2]}" +GRAFANA_RECORD="${records[3]}" +HEADLAMP_RECORD="${records[4]}" + case "$DNS_MODE" in upsert) : "${SERVER_IP:?set SERVER_IP}" - upsert_record A "${records[0]}" "$SERVER_IP" false - upsert_record A "${records[1]}" "$SERVER_IP" false - upsert_record A "${records[2]}" "$SERVER_IP" false - upsert_record A "${records[3]}" "$SERVER_IP" false - upsert_record A "${records[4]}" "$SERVER_IP" false + upsert_record A "$ROOT_RECORD" "$SERVER_IP" false + upsert_record A "$GIT_RECORD" "$SERVER_IP" false + upsert_record A "$REGISTRY_RECORD" "$SERVER_IP" false + upsert_record A "$GRAFANA_RECORD" "$SERVER_IP" false + upsert_record A "$HEADLAMP_RECORD" "$SERVER_IP" false echo "cloudflare dns updated for ${records[*]}" ;; delete) - delete_record A "${records[0]}" - delete_record A "${records[1]}" - delete_record A "${records[2]}" - delete_record A "${records[3]}" - delete_record A "${records[4]}" + 
delete_record A "$ROOT_RECORD" + delete_record A "$GIT_RECORD" + delete_record A "$REGISTRY_RECORD" + delete_record A "$GRAFANA_RECORD" + delete_record A "$HEADLAMP_RECORD" echo "cloudflare dns cleanup finished for ${records[*]}" ;; *) diff --git a/scripts/hetzner/destroy.sh b/scripts/hetzner/destroy.sh index 9b580c4..f2c9b54 100755 --- a/scripts/hetzner/destroy.sh +++ b/scripts/hetzner/destroy.sh @@ -38,7 +38,6 @@ TF_VARS=( -var "hcloud_token=$HCLOUD_TOKEN" -var "ssh_public_key=$SSH_PUBLIC_KEY" -var "public_domain=$PUBLIC_DOMAIN" - -var "bootstrap_repo_url=local-bootstrap" -var "tailscale_auth_key=${TAILSCALE_AUTH_KEY:-}" -var "tailscale_control_plane_hostname=$TAILSCALE_CONTROL_PLANE_HOSTNAME" )