feat: add cluster log aggregation with grafana

This commit is contained in:
Philipp 2026-03-29 00:38:24 +01:00
parent 3c05c7f7e8
commit 61b973cccb
17 changed files with 840 additions and 44 deletions

View file

@ -12,6 +12,7 @@ This directory is the repo-driven deployment target for the single-node Hetzner+
Shared platform namespaces:
- `forgejo`
- `registry`
- `observability`
- `ingress-nginx`
- `cert-manager`
@ -37,6 +38,7 @@ The overlay intentionally references generated or pre-created Secrets instead of
- `unrip/unrip-secrets`
- `unrip/unrip-registry-creds`
- `forgejo/forgejo-secrets`
- `observability/observability-secrets`
- `registry/registry-secrets`
The bootstrap script creates them from local environment variables. By default it targets the `unrip` project, but its kubeconfig context name, bootstrap image tag, project secret env filename, project namespace, and project registry secret name are derived from `PROJECT_NAME`, `PROJECT_NAMESPACE`, and `CLUSTER_NAME` instead of hard-coding legacy `trading-system` values.

View file

@ -41,3 +41,25 @@ spec:
name: registry
port:
number: 5000
---
# Overlay patch for the Grafana Ingress: pins the public hostname for both the
# TLS certificate and the routing rule of the live environment.
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  namespace: observability
  name: grafana
spec:
  rules:
    - host: grafana.doran.133011.xyz
      http:
        paths:
          # Everything under the host is routed to the Grafana Service.
          - pathType: Prefix
            path: /
            backend:
              service:
                name: grafana
                port:
                  number: 3000
  tls:
    - secretName: grafana-tls
      hosts:
        - grafana.doran.133011.xyz

View file

@ -16,6 +16,10 @@ secretGenerator:
namespace: forgejo
envs:
- secrets/forgejo.env
- name: observability-secrets
namespace: observability
envs:
- secrets/observability.env
- name: registry-secrets
namespace: registry
files:

View file

@ -0,0 +1,3 @@
# Template for the values behind the `observability/observability-secrets` Secret.
# Copy this file to `observability.env` and replace the placeholders.
# Never commit the populated copy.
grafana_admin_user=admin
grafana_admin_password=replace-me
grafana_root_url=https://grafana.example.invalid/

View file

@ -29,3 +29,19 @@ metadata:
namespace: registry
spec:
storageClassName: local-path
---
# Overlay patch: bind the Loki data claim to the `local-path` storage class.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  namespace: observability
  name: loki-data
spec:
  storageClassName: local-path
---
# Overlay patch: bind the Grafana data claim to the `local-path` storage class.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  namespace: observability
  name: grafana-data
spec:
  storageClassName: local-path

View file

@ -47,3 +47,28 @@ spec:
name: registry
port:
number: 5000
---
# Base Ingress for Grafana. The host values are placeholders
# (`grafana.example.invalid`).
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  namespace: observability
  name: grafana
  annotations:
    # Ask cert-manager to issue the `grafana-tls` certificate from the
    # production Let's Encrypt ClusterIssuer.
    cert-manager.io/cluster-issuer: letsencrypt-production
spec:
  ingressClassName: traefik
  rules:
    - host: grafana.example.invalid
      http:
        paths:
          - pathType: Prefix
            path: /
            backend:
              service:
                name: grafana
                port:
                  number: 3000
  tls:
    - secretName: grafana-tls
      hosts:
        - grafana.example.invalid

View file

@ -3,6 +3,7 @@ kind: Kustomization
resources:
- namespace.yaml
- traefik-config.yaml
- observability.yaml
- forgejo.yaml
- forgejo-rbac.yaml
- forgejo-runner.yaml

View file

@ -33,3 +33,10 @@ metadata:
name: cert-manager
labels:
project.pi.io/type: platform
---
# Shared platform namespace that hosts Loki, Promtail and Grafana.
apiVersion: v1
kind: Namespace
metadata:
  labels:
    project.pi.io/type: platform
  name: observability

View file

@ -0,0 +1,451 @@
# Loki configuration: single-binary, filesystem-backed, no multi-tenancy.
# Everything lives under /var/loki, which the Deployment backs with the
# `loki-data` claim. Retention is 168h (7 days) and is enforced by the
# compactor (`retention_enabled: true`).
apiVersion: v1
kind: ConfigMap
metadata:
  namespace: observability
  name: loki-config
  labels:
    project.pi.io/type: platform
    app.kubernetes.io/part-of: observability
    app.kubernetes.io/name: loki
data:
  config.yaml: |
    auth_enabled: false
    server:
      http_listen_port: 3100
    common:
      path_prefix: /var/loki
      replication_factor: 1
      ring:
        kvstore:
          store: inmemory
      storage:
        filesystem:
          chunks_directory: /var/loki/chunks
          rules_directory: /var/loki/rules
    schema_config:
      configs:
        - from: 2024-01-01
          store: tsdb
          object_store: filesystem
          schema: v13
          index:
            prefix: index_
            period: 24h
    storage_config:
      filesystem:
        directory: /var/loki/chunks
    limits_config:
      allow_structured_metadata: false
      reject_old_samples: true
      reject_old_samples_max_age: 168h
      retention_period: 168h
    compactor:
      working_directory: /var/loki/compactor
      retention_enabled: true
      delete_request_store: filesystem
    analytics:
      reporting_enabled: false
---
# Persistent storage for Loki chunks, index and compactor state (/var/loki).
# No storageClassName is set here; the overlay patches it in.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  namespace: observability
  name: loki-data
  labels:
    project.pi.io/type: platform
    app.kubernetes.io/part-of: observability
    app.kubernetes.io/name: loki
spec:
  resources:
    requests:
      storage: 20Gi
  accessModes:
    - ReadWriteOnce
---
# Single-replica Loki that stores all state on the `loki-data` claim and reads
# its configuration from the `loki-config` ConfigMap.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: loki
  namespace: observability
  labels:
    app.kubernetes.io/name: loki
    app.kubernetes.io/part-of: observability
    project.pi.io/type: platform
spec:
  replicas: 1
  # The pod owns a ReadWriteOnce volume with filesystem-backed storage. The
  # default RollingUpdate strategy starts the replacement pod before the old
  # one exits, so two Loki processes would briefly write to the same
  # /var/loki directory (same node) or the new pod would hang waiting for the
  # volume (different node). Recreate stops the old pod first.
  strategy:
    type: Recreate
  selector:
    matchLabels:
      app.kubernetes.io/name: loki
  template:
    metadata:
      labels:
        app.kubernetes.io/name: loki
        app.kubernetes.io/part-of: observability
    spec:
      securityContext:
        # Makes the data volume group-writable for GID 10001.
        # NOTE(review): presumably the group the Loki image runs as; confirm.
        fsGroup: 10001
      containers:
        - name: loki
          image: grafana/loki:3.0.0
          args:
            - -config.file=/etc/loki/config.yaml
          ports:
            - name: http
              containerPort: 3100
          readinessProbe:
            httpGet:
              path: /ready
              port: http
            initialDelaySeconds: 10
            periodSeconds: 10
          livenessProbe:
            httpGet:
              path: /ready
              port: http
            initialDelaySeconds: 30
            periodSeconds: 15
          volumeMounts:
            - name: config
              mountPath: /etc/loki
              readOnly: true
            - name: data
              mountPath: /var/loki
      volumes:
        - name: config
          configMap:
            name: loki-config
        - name: data
          persistentVolumeClaim:
            claimName: loki-data
---
# In-cluster endpoint for Loki. Promtail pushes to it and Grafana queries it
# via loki.observability.svc.cluster.local:3100.
apiVersion: v1
kind: Service
metadata:
  namespace: observability
  name: loki
  labels:
    project.pi.io/type: platform
    app.kubernetes.io/part-of: observability
    app.kubernetes.io/name: loki
spec:
  ports:
    - port: 3100
      name: http
      # Resolved against the container port named `http` in the Deployment.
      targetPort: http
  selector:
    app.kubernetes.io/name: loki
---
# Identity the Promtail DaemonSet runs as; bound to the `promtail` ClusterRole.
apiVersion: v1
kind: ServiceAccount
metadata:
  namespace: observability
  name: promtail
  labels:
    project.pi.io/type: platform
    app.kubernetes.io/part-of: observability
    app.kubernetes.io/name: promtail
---
# Read-only discovery permissions for Promtail.
#
# The shipped promtail-config only uses `kubernetes_sd_configs` with
# `role: pod`, so the role is limited to what pod discovery and its metadata
# need. The previous grant also covered `nodes/proxy` (which exposes the
# kubelet API through the API server), `services`, `endpoints` and
# `endpointslices`; none of them are used by the current scrape config.
# Re-add a resource here only together with a scrape job that needs it.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: promtail
  labels:
    app.kubernetes.io/name: promtail
    app.kubernetes.io/part-of: observability
    project.pi.io/type: platform
rules:
  - apiGroups: [""]
    resources:
      - nodes
      - pods
      - namespaces
    verbs:
      - get
      - list
      - watch
---
# Grants the `promtail` ClusterRole to the Promtail ServiceAccount in the
# `observability` namespace.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: promtail
  labels:
    project.pi.io/type: platform
    app.kubernetes.io/part-of: observability
    app.kubernetes.io/name: promtail
subjects:
  - kind: ServiceAccount
    namespace: observability
    name: promtail
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: promtail
---
# Promtail configuration: discover every pod through the Kubernetes API, tail
# its CRI-formatted log files under /var/log/pods and push them to Loki.
#
# Relabeling summary (order matters and is preserved):
#   1. node name   -> __host__
#   2. every pod label is copied onto the stream (labelmap)
#   3. namespace / pod / container become explicit labels
#   4. job = "<namespace>/<pod>"
#   5. __path__ is derived from the pod UID and container name
#   6. for pods carrying the kubernetes.io/config.hash annotation, __path__ is
#      derived from that hash instead of the UID
apiVersion: v1
kind: ConfigMap
metadata:
  namespace: observability
  name: promtail-config
  labels:
    project.pi.io/type: platform
    app.kubernetes.io/part-of: observability
    app.kubernetes.io/name: promtail
data:
  config.yaml: |
    server:
      http_listen_port: 3101
      grpc_listen_port: 0
    positions:
      filename: /run/promtail/positions.yaml
    clients:
      - url: http://loki.observability.svc.cluster.local:3100/loki/api/v1/push
    scrape_configs:
      - job_name: kubernetes-pods
        kubernetes_sd_configs:
          - role: pod
        pipeline_stages:
          - cri: {}
        relabel_configs:
          - source_labels:
              - __meta_kubernetes_pod_node_name
            target_label: __host__
          - action: labelmap
            regex: __meta_kubernetes_pod_label_(.+)
          - action: replace
            source_labels:
              - __meta_kubernetes_namespace
            target_label: namespace
          - action: replace
            source_labels:
              - __meta_kubernetes_pod_name
            target_label: pod
          - action: replace
            source_labels:
              - __meta_kubernetes_pod_container_name
            target_label: container
          - action: replace
            source_labels:
              - __meta_kubernetes_namespace
              - __meta_kubernetes_pod_name
            separator: /
            replacement: $1
            target_label: job
          - action: replace
            source_labels:
              - __meta_kubernetes_pod_uid
              - __meta_kubernetes_pod_container_name
            separator: /
            replacement: /var/log/pods/*$1/*.log
            target_label: __path__
          - action: replace
            source_labels:
              - __meta_kubernetes_pod_annotationpresent_kubernetes_io_config_hash
              - __meta_kubernetes_pod_annotation_kubernetes_io_config_hash
              - __meta_kubernetes_pod_container_name
            regex: true/(.*)
            separator: /
            replacement: /var/log/pods/*$1/*.log
            target_label: __path__
---
# Runs one Promtail per node. Each instance tails the node's /var/log (read
# only) and ships pod logs to Loki as configured in `promtail-config`.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: promtail
  namespace: observability
  labels:
    app.kubernetes.io/name: promtail
    app.kubernetes.io/part-of: observability
    project.pi.io/type: platform
spec:
  selector:
    matchLabels:
      app.kubernetes.io/name: promtail
  template:
    metadata:
      labels:
        app.kubernetes.io/name: promtail
        app.kubernetes.io/part-of: observability
    spec:
      serviceAccountName: promtail
      containers:
        - name: promtail
          image: grafana/promtail:3.0.0
          args:
            - -config.file=/etc/promtail/config.yaml
          env:
            # Expose the node name as HOSTNAME; the scrape config writes the
            # pod's node name into __host__ so each instance can restrict
            # itself to pods scheduled on its own node.
            - name: HOSTNAME
              valueFrom:
                fieldRef:
                  fieldPath: spec.nodeName
          ports:
            - name: http
              containerPort: 3101
          securityContext:
            # Root is needed to read the host's container log files.
            runAsUser: 0
            runAsGroup: 0
          volumeMounts:
            - name: config
              mountPath: /etc/promtail
              readOnly: true
            - name: run
              mountPath: /run/promtail
            - name: varlog
              mountPath: /var/log
              readOnly: true
      volumes:
        - name: config
          configMap:
            name: promtail-config
        # positions.yaml records how far each log file has been read. Keeping
        # it on the node (instead of an emptyDir that dies with the pod) lets
        # a replacement pod resume where the previous one stopped rather than
        # re-shipping every log file from the beginning after each rollout.
        - name: run
          hostPath:
            path: /run/promtail
            type: DirectoryOrCreate
        - name: varlog
          hostPath:
            path: /var/log
            type: Directory
---
# Grafana datasource provisioning: registers the in-cluster Loki Service as the
# default, read-only (`editable: false`) datasource.
apiVersion: v1
kind: ConfigMap
metadata:
  namespace: observability
  name: grafana-datasources
  labels:
    project.pi.io/type: platform
    app.kubernetes.io/part-of: observability
    app.kubernetes.io/name: grafana
data:
  loki.yaml: |
    apiVersion: 1
    datasources:
      - name: Loki
        type: loki
        access: proxy
        url: http://loki.observability.svc.cluster.local:3100
        isDefault: true
        editable: false
---
# Persistent storage mounted at /var/lib/grafana by the Grafana Deployment.
# No storageClassName is set here; the overlay patches it in.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  namespace: observability
  name: grafana-data
  labels:
    project.pi.io/type: platform
    app.kubernetes.io/part-of: observability
    app.kubernetes.io/name: grafana
spec:
  resources:
    requests:
      storage: 5Gi
  accessModes:
    - ReadWriteOnce
---
# Single-replica Grafana with persistent state on the `grafana-data` claim and
# the Loki datasource provisioned from the `grafana-datasources` ConfigMap.
# Admin credentials and the public root URL come from `observability-secrets`.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: grafana
  namespace: observability
  labels:
    app.kubernetes.io/name: grafana
    app.kubernetes.io/part-of: observability
    project.pi.io/type: platform
spec:
  replicas: 1
  # Grafana keeps its state on a ReadWriteOnce volume. With the default
  # RollingUpdate strategy the new pod starts while the old one is still
  # running, so both would open the same /var/lib/grafana data directory (or
  # the new pod would hang waiting for the volume). Recreate stops the old pod
  # first.
  strategy:
    type: Recreate
  selector:
    matchLabels:
      app.kubernetes.io/name: grafana
  template:
    metadata:
      labels:
        app.kubernetes.io/name: grafana
        app.kubernetes.io/part-of: observability
    spec:
      securityContext:
        # Makes the data volume group-writable for GID 472.
        # NOTE(review): presumably the group the Grafana image runs as; confirm.
        fsGroup: 472
      containers:
        - name: grafana
          image: grafana/grafana:11.0.0
          env:
            # NOTE: Grafana applies the admin user/password only when it first
            # initialises its database; changing the Secret later does not
            # rotate the password of an existing installation.
            - name: GF_SECURITY_ADMIN_USER
              valueFrom:
                secretKeyRef:
                  name: observability-secrets
                  key: grafana_admin_user
            - name: GF_SECURITY_ADMIN_PASSWORD
              valueFrom:
                secretKeyRef:
                  name: observability-secrets
                  key: grafana_admin_password
            # The UI is publicly reachable, so anonymous access and
            # self-service sign-up stay disabled.
            - name: GF_AUTH_ANONYMOUS_ENABLED
              value: "false"
            - name: GF_USERS_ALLOW_SIGN_UP
              value: "false"
            - name: GF_EXPLORE_ENABLED
              value: "true"
            - name: GF_SERVER_ROOT_URL
              valueFrom:
                secretKeyRef:
                  name: observability-secrets
                  key: grafana_root_url
          ports:
            - name: http
              containerPort: 3000
          readinessProbe:
            httpGet:
              path: /api/health
              port: http
            initialDelaySeconds: 10
            periodSeconds: 10
          livenessProbe:
            httpGet:
              path: /api/health
              port: http
            initialDelaySeconds: 30
            periodSeconds: 15
          volumeMounts:
            - name: data
              mountPath: /var/lib/grafana
            - name: datasources
              mountPath: /etc/grafana/provisioning/datasources
              readOnly: true
      volumes:
        - name: data
          persistentVolumeClaim:
            claimName: grafana-data
        - name: datasources
          configMap:
            name: grafana-datasources
---
# In-cluster endpoint for Grafana; the `grafana` Ingress routes to port 3000.
apiVersion: v1
kind: Service
metadata:
  name: grafana
  namespace: observability
  labels:
    app.kubernetes.io/name: grafana
    app.kubernetes.io/part-of: observability
    project.pi.io/type: platform
spec:
  selector:
    app.kubernetes.io/name: grafana
  ports:
    - name: http
      port: 3000
      # Target the container port by name, as the Loki Service does, so the
      # Service keeps working if the container port number ever changes.
      targetPort: http

View file

@ -1,6 +1,6 @@
# Required Kubernetes secrets
Base manifests and the Hetzner single-node overlay both expect secrets to be supplied out-of-band. The Hetzner overlay generates `unrip/unrip-secrets`, `forgejo/forgejo-secrets`, and `registry/registry-secrets` from local files.
Base manifests and the Hetzner single-node overlay both expect secrets to be supplied out-of-band. The Hetzner overlay generates `unrip/unrip-secrets`, `forgejo/forgejo-secrets`, `observability/observability-secrets`, and `registry/registry-secrets` from local files.
## Required secrets
- `unrip/unrip-secrets`
@ -8,9 +8,12 @@ Base manifests and the Hetzner single-node overlay both expect secrets to be sup
- `forgejo/forgejo-secrets`
- `root_url`
- `domain`
- `runner_registration_token`
- `registry/registry-secrets`
- `htpasswd`
- `observability/observability-secrets`
- `grafana_admin_user`
- `grafana_admin_password`
- `grafana_root_url`
## Overlay-driven generation
The `deploy/k8s/overlays/hetzner-single-node` overlay can generate these from local files via `secretGenerator`.
@ -20,10 +23,13 @@ Example workflow:
```bash
cp deploy/k8s/overlays/hetzner-single-node/secrets/unrip.env.example deploy/k8s/overlays/hetzner-single-node/secrets/unrip.env
cp deploy/k8s/overlays/hetzner-single-node/secrets/forgejo.env.example deploy/k8s/overlays/hetzner-single-node/secrets/forgejo.env
cp deploy/k8s/overlays/hetzner-single-node/secrets/observability.env.example deploy/k8s/overlays/hetzner-single-node/secrets/observability.env
cp deploy/k8s/overlays/hetzner-single-node/secrets/registry.htpasswd.example deploy/k8s/overlays/hetzner-single-node/secrets/registry.htpasswd
kubectl apply -k deploy/k8s/overlays/hetzner-single-node
```
The Forgejo runner no longer expects a pre-seeded `runner_registration_token` secret; `scripts/hetzner/bootstrap.sh` generates a one-time token in-cluster, registers the runner, stores the resulting `/data/.runner` config on the `forgejo-runner-data` PVC, and then restarts the deployment.
For future projects, follow the same convention with project-specific secret names in project-specific namespaces.
Do not commit populated secret files.

View file

@ -11,6 +11,7 @@ Goal: provision and deploy everything from this repo to a single Hetzner machine
- trading system services
- private registry
- Forgejo
- Loki + Promtail + Grafana observability
- k3s-bundled Traefik ingress resources
- cert-manager
- ACME issuers
@ -19,7 +20,8 @@ Goal: provision and deploy everything from this repo to a single Hetzner machine
- optionally creates DNS records via Cloudflare or Porkbun
- fetches the real kubeconfig from the node
- writes overlay secrets/host patches from local env
- applies the Hetzner single-node k8s overlay from the operator workstation checkout
- renders `.state/hetzner/generated-overlay/` from the checked-in Hetzner overlay template plus `deploy/k8s/platform/base/kustomization.yaml`
- applies that generated overlay from the operator workstation checkout
- builds the current app image locally
- imports the bootstrap image into k3s for the first rollout
@ -67,11 +69,14 @@ The mapping file should contain non-secret config plus `pass` entry references f
When you run `scripts/hetzner/bootstrap.sh`, it uses this file to materialize local Kubernetes inputs before apply:
- overwrites `deploy/k8s/overlays/hetzner-single-node/secrets/unrip.env` with `NEAR_INTENTS_API_KEY`
- overwrites `deploy/k8s/overlays/hetzner-single-node/secrets/forgejo.env` with Forgejo `root_url` and `domain`
- renders generated ingress and issuer patch files under `.state/hetzner/generated-overlay/`
- overwrites `deploy/k8s/overlays/hetzner-single-node/secrets/observability.env` with Grafana bootstrap credentials and root URL
- renders `.state/hetzner/generated-overlay/` as the bootstrap-time source of truth
- copies the checked-in overlay patch behavior into that generated overlay
- imports platform resources from `deploy/k8s/platform/base/kustomization.yaml`, so newly added platform modules such as observability manifests are included automatically
- creates `registry-secrets` in namespace `registry` from `REGISTRY_USERNAME` and `REGISTRY_PASSWORD`
- creates the project docker-registry pull secret in `PROJECT_NAMESPACE` from the same registry credentials
This is different from running `kubectl apply -k deploy/k8s/overlays/hetzner-single-node` manually: plain Kustomize apply only consumes the checked-in overlay files and only generates `unrip-secrets` and `forgejo-secrets`. It does not create registry auth secrets and does not read `scripts/hetzner/bootstrap-secrets.env` on its own.
This is different from running `kubectl apply -k deploy/k8s/overlays/hetzner-single-node` manually: plain Kustomize apply only consumes the checked-in overlay files, while bootstrap applies the generated overlay copy. Manual apply still only reads the checked-in files and does not read `scripts/hetzner/bootstrap-secrets.env` or create the imperative registry auth secrets on its own.
Required values:
- `HCLOUD_TOKEN_PASS` or `HCLOUD_TOKEN`
@ -85,6 +90,8 @@ Required values:
- `FORGEJO_DOMAIN`
- `FORGEJO_ROOT_URL`
- `REGISTRY_DOMAIN`
- `GRAFANA_DOMAIN`
- `GRAFANA_ROOT_URL`
- `LETSENCRYPT_EMAIL`
- `REGISTRY_USERNAME`
- `REGISTRY_PASSWORD_PASS` or `REGISTRY_PASSWORD`
@ -92,6 +99,8 @@ Required values:
- `FORGEJO_ADMIN_USERNAME`
- `FORGEJO_ADMIN_EMAIL`
- `FORGEJO_ADMIN_PASSWORD_PASS` or `FORGEJO_ADMIN_PASSWORD`
- `GRAFANA_ADMIN_USERNAME` (defaults to `admin`)
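- `GRAFANA_ADMIN_PASSWORD_PASS` or `GRAFANA_ADMIN_PASSWORD` (optional; when neither is set, bootstrap generates a random Grafana admin password and writes it to `deploy/k8s/overlays/hetzner-single-node/secrets/observability.env`)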
- optional repo settings: `FORGEJO_REPO_OWNER`, `FORGEJO_REPO_NAME`, `FORGEJO_REPO_PRIVATE`
Optional for automatic DNS:
@ -115,7 +124,8 @@ Outputs:
- kubeconfig written to `.state/hetzner/kubeconfig.yaml`
- CI kubeconfig written to `.state/hetzner/kubeconfig.incluster.yaml`
- overlay secrets and ingress host patches rendered from local env / `pass`
- namespaces, Redpanda, app deployments, Forgejo, registry, Traefik-targeted ingress resources, cert-manager, and issuers applied
- `.state/hetzner/generated-overlay/` rendered and applied as the canonical bootstrap manifest set for that run
- namespaces, Redpanda, app deployments, Forgejo, registry, Traefik-targeted ingress resources, cert-manager, issuers, and any additional platform resources referenced by `deploy/k8s/platform/base/kustomization.yaml` applied
- Forgejo admin account created automatically if missing
- Forgejo runner registration is generated automatically from inside the Forgejo pod and the resulting `/data/.runner` config is stored under the shared `forgejo-data` persistent volume used by the runner deployment
- Forgejo repository created automatically in either the admin user's namespace or a pre-existing organization named by `FORGEJO_REPO_OWNER`
@ -133,15 +143,17 @@ Recommended mode:
## DNS and TLS
If DNS provider credentials are present, bootstrap updates:
- `${BASE_DOMAIN}`
- `git.${BASE_DOMAIN}`
- `registry.${BASE_DOMAIN}`
- `${PUBLIC_DOMAIN}`
- `git.${PUBLIC_DOMAIN}`
- `registry.${PUBLIC_DOMAIN}`
- `grafana.${PUBLIC_DOMAIN}`
Supported scripted providers:
- Cloudflare
- Porkbun
TLS is handled in-cluster by cert-manager using Let's Encrypt issuers and the rendered ingress hosts.
Grafana is the default observability UI wired into the public hostname model. Keep Grafana authenticated.
The platform base assumes the default k3s Traefik ingress controller is present; it does not install ingress-nginx.
For clean-cluster applies, the base kustomization now includes cert-manager before the `ClusterIssuer` resources so the issuer CRs can be created in the same bootstrap flow.
@ -151,6 +163,8 @@ KUBECONFIG=.state/hetzner/kubeconfig.yaml kubectl get pods -A
bash scripts/k8s/logs.sh
```
For the web log UI and observability stack, see `docs/k8s-observability.md`.
## Self-hosted CI/CD handoff
Default bootstrap now automates the Forgejo handoff:
1. create the Forgejo repo in the admin namespace or in a pre-existing organization named by `FORGEJO_REPO_OWNER`
@ -198,7 +212,7 @@ bash scripts/hetzner/destroy.sh
`destroy.sh` reads `HCLOUD_TOKEN`, optional `TAILSCALE_AUTH_KEY`, optional DNS provider credentials, and optional Forgejo admin credentials via the same `*_PASS` mapping mechanism as bootstrap.
It uses the same Terraform inputs as bootstrap for the infrastructure resources, then can optionally:
- delete the scripted DNS records for `${BASE_DOMAIN}`, `git.${BASE_DOMAIN}`, and `registry.${BASE_DOMAIN}`
- delete the scripted DNS records for `${BASE_DOMAIN}`, `git.${BASE_DOMAIN}`, `registry.${BASE_DOMAIN}`, and `grafana.${BASE_DOMAIN}`
- remove local bootstrap artifacts under `.state/hetzner/`, `deploy/k8s/overlays/hetzner-single-node/generated/`, and the local Terraform working/state files in `infra/terraform/hetzner/`
- delete the bootstrap-managed Forgejo repository via the Forgejo API

View file

@ -22,6 +22,7 @@ After that you should have:
- repository Actions secrets/variables populated for CI
- the current repo pushed to Forgejo automatically in default mode
- Registry reachable at `https://${REGISTRY_DOMAIN}`
- Grafana reachable at `https://${GRAFANA_DOMAIN}`
- private admin/control-plane access over Tailscale if configured
Bootstrap repo automation requires `FORGEJO_ADMIN_USERNAME`, `FORGEJO_ADMIN_PASSWORD`, Python `PyYAML` locally for kubeconfig rendering, and Python `PyNaCl` locally in the default `forgejo-actions` mode so the script can encrypt Forgejo Actions secrets before upload. Bootstrap now fails fast with an explicit preflight error if those Python modules are missing. The same bootstrap flow now also creates the initial Forgejo admin account and writes a durable `/data/.runner` config into the shared Forgejo PVC before the runner deployment is allowed to start.
@ -38,6 +39,7 @@ kubectl get nodes -o wide
kubectl get pods -A
kubectl -n forgejo get deploy,pods,svc,ingress
kubectl -n registry get deploy,pods,svc,ingress
kubectl -n observability get deploy,ds,pods,svc,ingress
kubectl -n unrip get deploy,pods
```
@ -130,6 +132,8 @@ Likewise, generated local kubeconfigs/manifests remain on disk unless you set `D
TLS is issued by cert-manager using the rendered Let's Encrypt email and ingress hosts.
For log inspection in the browser, use Grafana/Loki as documented in `docs/k8s-observability.md`.
## Current limitations
- the bootstrap path now creates the initial admin account and runner config automatically from inside the Forgejo pod, but it still depends on the operator supplying the intended admin credentials up front
- runner startup is now manifest-gated on a durable `/data/.runner` file stored under the shared `forgejo-data` PVC, so fresh applies no longer depend on a broken intermediate secret or a race against a crashing runner pod; deleting that Forgejo PVC still requires rerunning bootstrap to re-register the runner

137
docs/k8s-observability.md Normal file
View file

@ -0,0 +1,137 @@
# Kubernetes observability on the Hetzner single-node cluster
This cluster now includes a minimal reproducible log stack in the `observability` namespace:
- `loki` for log storage and querying
- `promtail` as a DaemonSet that ships pod stdout/stderr logs from every node
- `grafana` as the web UI
## What gets collected
Promtail tails Kubernetes container log files under `/var/log/pods` on each node.
That means any container writing logs to stdout/stderr automatically shows up in Loki/Grafana.
This fits the current app setup in this repo because the services already log to stdout/stderr.
What is **not** collected automatically:
- arbitrary log files written somewhere else inside a container filesystem
- logs from external services that are not running as Kubernetes pods on this cluster
## Access
Grafana is exposed through Traefik + cert-manager at:
- `https://${GRAFANA_DOMAIN}` when bootstrapped from `scripts/hetzner/bootstrap-secrets.env`
- in the current live environment: `https://grafana.doran.133011.xyz/`
Admin credentials come from:
- `GRAFANA_ADMIN_USERNAME`
- `GRAFANA_ADMIN_PASSWORD_PASS` or `GRAFANA_ADMIN_PASSWORD`
The recommended path is `pass`.
In the current live setup the password is stored at:
- `api/hetznerk3s/grafana-admin-password`
## Reproducible bootstrap path
The observability stack is part of the repo-managed platform layer:
- `deploy/k8s/platform/base/observability.yaml`
- `deploy/k8s/platform/base/kustomization.yaml`
- `deploy/k8s/platform/base/namespace.yaml`
- `deploy/k8s/overlays/hetzner-single-node/storage-class.patch.yaml`
- `deploy/k8s/overlays/hetzner-single-node/kustomization.yaml`
- `deploy/k8s/overlays/hetzner-single-node/ingress-hosts.patch.yaml`
- `deploy/k8s/overlays/hetzner-single-node/secrets/observability.env.example`
Bootstrap materializes the Grafana secret from local env / `pass`:
- writes `deploy/k8s/overlays/hetzner-single-node/secrets/observability.env`
- copies it into `.state/hetzner/generated-overlay/`
- applies the generated overlay
## Verify the stack
```bash
export KUBECONFIG=$PWD/.state/hetzner/kubeconfig.yaml
kubectl -n observability get pods
kubectl -n observability get pvc
kubectl -n observability get ingress
kubectl -n observability rollout status deployment/loki --timeout=300s
kubectl -n observability rollout status deployment/grafana --timeout=300s
kubectl -n observability rollout status daemonset/promtail --timeout=300s
```
## Verify logs are arriving
Generate some app logs, then query Loki directly:
```bash
export KUBECONFIG=$PWD/.state/hetzner/kubeconfig.yaml
kubectl -n observability port-forward svc/loki 3100:3100
```
In another shell:
```bash
curl -sS 'http://127.0.0.1:3100/loki/api/v1/labels' | jq
curl -G -sS 'http://127.0.0.1:3100/loki/api/v1/query' \
--data-urlencode 'query={namespace="unrip"}' | jq
```
If those queries return labels/streams, pod logs are reaching Loki.
## Use Grafana
After logging into Grafana:
1. open **Explore**
2. choose the default **Loki** datasource
3. run queries like:
- `{namespace="unrip"}`
- `{namespace="forgejo"}`
- `{namespace="registry"}`
- `{pod=~"near-intents-ingest.*"}`
- `{container="app"}`
Useful labels added by promtail:
- `namespace`
- `pod`
- `container`
- `app`
- every other pod label, copied with its name sanitized for Loki (for example `app.kubernetes.io/name` becomes `app_kubernetes_io_name`)
## Day-to-day ops
CLI remains useful for fast debugging:
```bash
kubectl get pods -A
kubectl -n unrip logs deploy/near-intents-ingest -f
kubectl -n forgejo logs deploy/forgejo -f
bash scripts/k8s/logs.sh
```
Use Grafana when you want:
- a browser UI
- historical log search
- multi-namespace filtering
- easier cross-pod inspection
## Security notes
Grafana is an admin/operator surface.
For this cluster it is publicly reachable behind Grafana login.
That is acceptable for this disposable single-node setup, but for a harder production posture prefer one of:
- Tailscale-only access
- ingress auth in front of Grafana
- SSO/OIDC
## Add a new app and have logs show up there
Nothing special is required as long as the new pod logs to stdout/stderr.
If you deploy a new app under Kubernetes and expose it through the usual manifests/Ingress flow, promtail will scrape its pod logs automatically.

View file

@ -11,6 +11,7 @@
# What bootstrap materializes from this file:
# - overwrites deploy/k8s/overlays/hetzner-single-node/secrets/unrip.env
# - overwrites deploy/k8s/overlays/hetzner-single-node/secrets/forgejo.env
# - overwrites deploy/k8s/overlays/hetzner-single-node/secrets/observability.env
# - renders generated ingress/issuer patches under .state/hetzner/generated-overlay/
# - creates registry-secrets and the project docker-registry pull secret imperatively
#
@ -51,9 +52,11 @@ export TF_ADMIN_CIDR_BLOCKS="${TF_ADMIN_CIDR_BLOCKS:-[]}"
# Public naming for ingress/TLS
export PUBLIC_DOMAIN="${PUBLIC_DOMAIN:-doran.133011.xyz}"
export BASE_DOMAIN="${BASE_DOMAIN:-133011.xyz}"
export FORGEJO_DOMAIN="${FORGEJO_DOMAIN:-git.${BASE_DOMAIN}}"
export FORGEJO_DOMAIN="${FORGEJO_DOMAIN:-git.${PUBLIC_DOMAIN}}"
export FORGEJO_ROOT_URL="${FORGEJO_ROOT_URL:-https://${FORGEJO_DOMAIN}/}"
export REGISTRY_DOMAIN="${REGISTRY_DOMAIN:-registry.${BASE_DOMAIN}}"
export REGISTRY_DOMAIN="${REGISTRY_DOMAIN:-registry.${PUBLIC_DOMAIN}}"
export GRAFANA_DOMAIN="${GRAFANA_DOMAIN:-grafana.${PUBLIC_DOMAIN}}"
export GRAFANA_ROOT_URL="${GRAFANA_ROOT_URL:-https://${GRAFANA_DOMAIN}/}"
export LETSENCRYPT_EMAIL="${LETSENCRYPT_EMAIL:-ops@example.com}"
# Optional DNS automation: choose one provider
@ -77,11 +80,16 @@ export FORGEJO_ADMIN_USERNAME="${FORGEJO_ADMIN_USERNAME:-forgejo-admin}"
export FORGEJO_ADMIN_EMAIL="${FORGEJO_ADMIN_EMAIL:-${FORGEJO_ADMIN_USERNAME}@${BASE_DOMAIN}}"
export FORGEJO_ADMIN_PASSWORD_PASS="${FORGEJO_ADMIN_PASSWORD_PASS:-$(pass_ref forgejo/admin-password)}"
# Grafana bootstrap auth for the public observability UI
export GRAFANA_ADMIN_USERNAME="${GRAFANA_ADMIN_USERNAME:-admin}"
export GRAFANA_ADMIN_PASSWORD_PASS="${GRAFANA_ADMIN_PASSWORD_PASS:-$(pass_ref grafana/admin-password)}"
# Optional explicit overrides for CI/testing:
# export HCLOUD_TOKEN="..."
# export REGISTRY_PASSWORD="..."
# export NEAR_INTENTS_API_KEY="..."
# export FORGEJO_ADMIN_PASSWORD="..."
# export GRAFANA_ADMIN_PASSWORD="..."
# export CLOUDFLARE_API_TOKEN="..."
# export CLOUDFLARE_ZONE_ID="..."
# export PORKBUN_API_KEY="..."

View file

@ -31,6 +31,7 @@ resolve_secret_var TAILSCALE_AUTH_KEY optional
resolve_secret_var NEAR_INTENTS_API_KEY required
resolve_secret_var REGISTRY_PASSWORD required
resolve_secret_var FORGEJO_ADMIN_PASSWORD required
resolve_secret_var GRAFANA_ADMIN_PASSWORD optional
resolve_secret_var CLOUDFLARE_API_TOKEN optional
resolve_secret_var CLOUDFLARE_ZONE_ID optional
resolve_secret_var PORKBUN_API_KEY optional
@ -40,10 +41,13 @@ resolve_secret_var PORKBUN_SECRET_API_KEY optional
: "${PUBLIC_DOMAIN:?set PUBLIC_DOMAIN}"
: "${LETSENCRYPT_EMAIL:?set LETSENCRYPT_EMAIL}"
: "${BASE_DOMAIN:?set BASE_DOMAIN}"
: "${FORGEJO_DOMAIN:=git.${BASE_DOMAIN}}"
: "${FORGEJO_DOMAIN:=git.${PUBLIC_DOMAIN}}"
: "${FORGEJO_ROOT_URL:=https://${FORGEJO_DOMAIN}/}"
: "${FORGEJO_INTERNAL_URL:=http://forgejo.forgejo.svc.cluster.local:3000/}"
: "${REGISTRY_DOMAIN:=registry.${BASE_DOMAIN}}"
: "${REGISTRY_DOMAIN:=registry.${PUBLIC_DOMAIN}}"
: "${GRAFANA_DOMAIN:=grafana.${PUBLIC_DOMAIN}}"
: "${GRAFANA_ROOT_URL:=https://${GRAFANA_DOMAIN}/}"
: "${GRAFANA_ADMIN_USERNAME:=admin}"
: "${REGISTRY_USERNAME:?set REGISTRY_USERNAME}"
: "${TAILSCALE_CONTROL_PLANE_HOSTNAME:=}"
: "${TF_ADMIN_CIDR_BLOCKS:=}"
@ -176,6 +180,15 @@ yaml.safe_dump(config, open(dst, 'w'), sort_keys=False)
PY
mkdir -p "$PROJECT_OVERLAY_DIR/secrets" "$GENERATED_OVERLAY_DIR"
# Env file that feeds the `observability-secrets` secretGenerator entry.
OBSERVABILITY_SECRET_ENV_PATH="${PROJECT_OVERLAY_DIR}/secrets/observability.env"

# The Grafana admin password is optional input: when the operator supplied
# none, mint a random URL-safe one so Grafana never starts with a known
# default. Only the fact that a password was generated is logged (to stderr),
# never the password itself.
if [[ -z "${GRAFANA_ADMIN_PASSWORD:-}" ]]; then
  GRAFANA_ADMIN_PASSWORD="$(python3 -c 'import secrets; print(secrets.token_urlsafe(24))')"
  echo "GRAFANA_ADMIN_PASSWORD not provided; generated a random bootstrap password for Grafana admin user '$GRAFANA_ADMIN_USERNAME'" >&2
fi
cat > "$PROJECT_SECRET_ENV_PATH" <<EOF
NEAR_INTENTS_API_KEY=$NEAR_INTENTS_API_KEY
EOF
@ -183,6 +196,11 @@ cat > "$PROJECT_OVERLAY_DIR/secrets/forgejo.env" <<EOF
root_url=$FORGEJO_ROOT_URL
domain=$FORGEJO_DOMAIN
EOF
# Write the Grafana admin user, admin password and root URL that the
# `observability-secrets` secretGenerator entry consumes. The file is
# overwritten on every run and contains the password in plain text, so the
# populated file must never be committed.
cat > "$OBSERVABILITY_SECRET_ENV_PATH" <<EOF
grafana_admin_user=$GRAFANA_ADMIN_USERNAME
grafana_admin_password=$GRAFANA_ADMIN_PASSWORD
grafana_root_url=$GRAFANA_ROOT_URL
EOF
python3 - <<PY
import os
from pathlib import Path
@ -197,26 +215,39 @@ platform_base = (root / "../../platform/base").resolve()
project_base = (root / project_kustomize_path).resolve() if project_kustomize_path else None
project_secret_env = (root / "secrets" / project_secret_env_basename).resolve()
forgejo_secret_env = (root / "secrets" / "forgejo.env").resolve()
platform_resources = [
platform_base / "namespace.yaml",
platform_base / "forgejo.yaml",
platform_base / "forgejo-rbac.yaml",
platform_base / "forgejo-runner.yaml",
platform_base / "registry.yaml",
platform_base / "ingress.yaml",
platform_base / "cluster-issuers.yaml",
platform_base / "coredns.yaml",
]
observability_secret_env = (root / "secrets" / "observability.env").resolve()
resources = [os.path.relpath(path, generated_root) for path in platform_resources]
resources = [os.path.relpath(platform_base, generated_root)]
if project_base:
resources.append(os.path.relpath(project_base, generated_root))
generated_root.mkdir(parents=True, exist_ok=True)
project_secret_env_rel = Path(project_secret_env.name)
forgejo_secret_env_rel = Path(forgejo_secret_env.name)
observability_secret_env_rel = Path(observability_secret_env.name)
(generated_root / project_secret_env_rel).write_text(project_secret_env.read_text())
(generated_root / forgejo_secret_env_rel).write_text(forgejo_secret_env.read_text())
if observability_secret_env.exists():
(generated_root / observability_secret_env_rel).write_text(
observability_secret_env.read_text()
)
secret_generator_entries = [
f" - name: {project_secret_name}\n"
f" namespace: {project_namespace}\n"
f" envs:\n"
f" - {project_secret_env_rel}\n",
" - name: forgejo-secrets\n"
" namespace: forgejo\n"
" envs:\n"
f" - {forgejo_secret_env_rel}\n",
]
if observability_secret_env.exists():
secret_generator_entries.append(
" - name: observability-secrets\n"
" namespace: observability\n"
" envs:\n"
f" - {observability_secret_env_rel}\n"
)
(generated_root / "kustomization.yaml").write_text(
"""apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
@ -228,26 +259,80 @@ resources:
- path: issuer-email.patch.yaml
- path: storage-class.patch.yaml
secretGenerator:
- name: {project_secret_name}
namespace: {project_namespace}
envs:
- {project_secret_env_rel}
- name: forgejo-secrets
namespace: forgejo
envs:
- {forgejo_secret_env_rel}
generatorOptions:
"""
+ "".join(secret_generator_entries)
+ """generatorOptions:
disableNameSuffixHash: true
""".format(
project_secret_name=project_secret_name,
project_namespace=project_namespace,
project_secret_env_rel=project_secret_env_rel,
forgejo_secret_env_rel=forgejo_secret_env_rel,
)
"""
)
(generated_root / "storage-class.patch.yaml").write_text((root / "storage-class.patch.yaml").read_text())
(generated_root / "issuer-email.patch.yaml").write_text(f'''apiVersion: cert-manager.io/v1\nkind: ClusterIssuer\nmetadata:\n name: letsencrypt-staging\nspec:\n acme:\n email: {"$LETSENCRYPT_EMAIL"}\n---\napiVersion: cert-manager.io/v1\nkind: ClusterIssuer\nmetadata:\n name: letsencrypt-production\nspec:\n acme:\n email: {"$LETSENCRYPT_EMAIL"}\n''')
(generated_root / "ingress-hosts.patch.yaml").write_text(f'''apiVersion: networking.k8s.io/v1\nkind: Ingress\nmetadata:\n name: forgejo\n namespace: forgejo\nspec:\n tls:\n - hosts:\n - {"$FORGEJO_DOMAIN"}\n secretName: forgejo-tls\n rules:\n - host: {"$FORGEJO_DOMAIN"}\n---\napiVersion: networking.k8s.io/v1\nkind: Ingress\nmetadata:\n name: registry\n namespace: registry\nspec:\n tls:\n - hosts:\n - {"$REGISTRY_DOMAIN"}\n secretName: registry-tls\n rules:\n - host: {"$REGISTRY_DOMAIN"}\n''')
(generated_root / "ingress-hosts.patch.yaml").write_text(f'''apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: forgejo
namespace: forgejo
spec:
tls:
- hosts:
- {"$FORGEJO_DOMAIN"}
secretName: forgejo-tls
rules:
- host: {"$FORGEJO_DOMAIN"}
http:
paths:
- path: /
pathType: Prefix
backend:
service:
name: forgejo
port:
number: 3000
---
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: registry
namespace: registry
spec:
tls:
- hosts:
- {"$REGISTRY_DOMAIN"}
secretName: registry-tls
rules:
- host: {"$REGISTRY_DOMAIN"}
http:
paths:
- path: /
pathType: Prefix
backend:
service:
name: registry
port:
number: 5000
---
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: grafana
namespace: observability
spec:
tls:
- hosts:
- {"$GRAFANA_DOMAIN"}
secretName: grafana-tls
rules:
- host: {"$GRAFANA_DOMAIN"}
http:
paths:
- path: /
pathType: Prefix
backend:
service:
name: grafana
port:
number: 3000
''')
PY
kubectl apply -f "$ROOT_DIR/deploy/k8s/platform/base/namespace.yaml"
@ -265,10 +350,13 @@ kubectl -n cert-manager delete deployment cert-manager cert-manager-webhook cert
# Install cert-manager, then block until its Certificate and ClusterIssuer
# CRDs report the Established condition (up to 180s each) before applying
# anything further.
kubectl apply -f "$ROOT_DIR/deploy/k8s/platform/base/cert-manager.yaml"
kubectl wait --for=condition=Established --timeout=180s crd/certificates.cert-manager.io
kubectl wait --for=condition=Established --timeout=180s crd/clusterissuers.cert-manager.io
# Apply the kustomize overlay directories (-k builds the kustomization first).
kubectl apply -k "$PROJECT_OVERLAY_DIR"
kubectl apply -k "$GENERATED_OVERLAY_DIR"
# Wait for every workload's rollout to finish, allowing 300s apiece:
# forgejo and registry, the observability stack (loki and grafana
# deployments plus the promtail daemonset), then redpanda in the
# project namespace.
kubectl -n forgejo rollout status deployment/forgejo --timeout=300s
kubectl -n registry rollout status deployment/registry --timeout=300s
kubectl -n observability rollout status deployment/loki --timeout=300s
kubectl -n observability rollout status deployment/grafana --timeout=300s
kubectl -n observability rollout status daemonset/promtail --timeout=300s
kubectl -n "$PROJECT_NAMESPACE" rollout status deployment/redpanda --timeout=300s
forgejo_admin_user_b64=$(printf '%s' "$FORGEJO_ADMIN_USERNAME" | base64 | tr -d '\n')
@ -424,4 +512,5 @@ echo "bootstrap_delivery_mode=$BOOTSTRAP_DELIVERY_MODE"
# Print the resulting endpoints as key=value lines.
echo "forgejo_url=$FORGEJO_ROOT_URL"
# ${FORGEJO_ROOT_URL%/} strips one trailing slash so the joined path has no "//".
echo "forgejo_repo=${FORGEJO_ROOT_URL%/}/$FORGEJO_REPO_OWNER/$FORGEJO_REPO_NAME"
echo "registry_url=https://$REGISTRY_DOMAIN"
echo "grafana_url=$GRAFANA_ROOT_URL"
# ${VAR:+word} expands to word only when VAR is set and non-empty, so this
# prints "cloudflare" or "porkbun" depending on which credential is present.
# NOTE(review): if both are set the output is "cloudflareporkbun"; if neither
# is set the value is empty.
echo "dns_provider=${CLOUDFLARE_API_TOKEN:+cloudflare}${PORKBUN_API_KEY:+porkbun}"

View file

@ -58,6 +58,7 @@ records=(
"$PUBLIC_DOMAIN"
"git.$PUBLIC_DOMAIN"
"registry.$PUBLIC_DOMAIN"
"grafana.$PUBLIC_DOMAIN"
)
case "$DNS_MODE" in
@ -66,12 +67,14 @@ case "$DNS_MODE" in
upsert_record A "${records[0]}" "$SERVER_IP" false
upsert_record A "${records[1]}" "$SERVER_IP" false
upsert_record A "${records[2]}" "$SERVER_IP" false
upsert_record A "${records[3]}" "$SERVER_IP" false
echo "cloudflare dns updated for ${records[*]}"
;;
delete)
delete_record A "${records[0]}"
delete_record A "${records[1]}"
delete_record A "${records[2]}"
delete_record A "${records[3]}"
echo "cloudflare dns cleanup finished for ${records[*]}"
;;
*)

View file

@ -27,9 +27,11 @@ fi
# Derive the record names for the git, registry and grafana hosts.
# Each name starts as its bare label; when root_name is non-empty,
# ".$root_name" is appended to it.
git_name="git"
registry_name="registry"
grafana_name="grafana"
if [[ -n "$root_name" ]]; then
  git_name+=".$root_name"
  registry_name+=".$root_name"
  grafana_name+=".$root_name"
fi
payload() {
@ -114,13 +116,15 @@ case "$DNS_MODE" in
upsert_a_record "$root_name"
upsert_a_record "$git_name"
upsert_a_record "$registry_name"
echo "porkbun dns updated for $PUBLIC_DOMAIN, git.$PUBLIC_DOMAIN, registry.$PUBLIC_DOMAIN"
upsert_a_record "$grafana_name"
echo "porkbun dns updated for $PUBLIC_DOMAIN, git.$PUBLIC_DOMAIN, registry.$PUBLIC_DOMAIN, grafana.$PUBLIC_DOMAIN"
;;
delete)
delete_a_record "$root_name"
delete_a_record "$git_name"
delete_a_record "$registry_name"
echo "porkbun dns cleanup finished for $PUBLIC_DOMAIN, git.$PUBLIC_DOMAIN, registry.$PUBLIC_DOMAIN"
delete_a_record "$grafana_name"
echo "porkbun dns cleanup finished for $PUBLIC_DOMAIN, git.$PUBLIC_DOMAIN, registry.$PUBLIC_DOMAIN, grafana.$PUBLIC_DOMAIN"
;;
*)
echo "unsupported DNS_MODE: $DNS_MODE" >&2