Compare commits

..

2 commits

Author SHA1 Message Date
philipp
82017dd301 Guard quote ingest against node OOM
Some checks failed
deploy / deploy (push) Failing after 47s
Proof: Live investigation showed doran-1 entered NodeNotReady with kubelet SystemOOM and TLS/control-plane timeouts while near-intents-ingest, history-writer, and operator-dashboard were the largest Node memory consumers. This commit adds websocket publish backpressure for the raw quote firehose and pod memory guardrails for the affected services.

Assumptions: Dropping quote frames while Kafka publishing is backpressured is safer than allowing unbounded in-flight publishes to take down the single-node cluster; retained Kafka/Postgres history remains best-effort under overload until the platform has enough capacity for full raw retention.

Still fake: This does not add durable queue spillover for skipped raw websocket frames, does not resize the node, and does not prove fee-complete trading PnL.
2026-05-13 18:25:54 +02:00
philipp
3cd88c682e Expose operator dashboard with basic auth
Proof: The rendered Kubernetes manifest now includes a Traefik TLS ingress for operator-dashboard and production basic auth, deploy workflow/bootstrap preserve the dashboard password as a secret, and static plus full node tests pass.

Assumptions: doran.133011.xyz is the intended public host because unrip.doran.133011.xyz and dashboard.doran.133011.xyz do not currently resolve.

Still fake: the public dashboard is not deployed or externally verified yet because the cluster host, Forgejo, and Kubernetes API timed out from this machine during this turn.
2026-05-13 18:08:27 +02:00
10 changed files with 274 additions and 38 deletions

View file

@ -80,6 +80,36 @@ jobs:
run: |
kubectl apply -f "$WORKSPACE_DIR/deploy/k8s/base/namespace.yaml"
- name: Upsert runtime secrets
env:
OPERATOR_DASHBOARD_AUTH_PASSWORD: ${{ secrets.OPERATOR_DASHBOARD_AUTH_PASSWORD }}
run: |
if [ -z "$OPERATOR_DASHBOARD_AUTH_PASSWORD" ]; then
echo "missing required repo action secret OPERATOR_DASHBOARD_AUTH_PASSWORD" >&2
exit 1
fi
patch_file="$(mktemp)"
cleanup() {
rm -f "$patch_file"
}
trap cleanup EXIT
python3 - "$OPERATOR_DASHBOARD_AUTH_PASSWORD" >"$patch_file" <<'PY'
import json
import sys
print(json.dumps({
"stringData": {
"OPERATOR_DASHBOARD_AUTH_PASSWORD": sys.argv[1],
},
}))
PY
kubectl -n "$PROJECT_NAMESPACE" patch secret "${PROJECT_NAME}-secrets" \
--type merge \
--patch-file "$patch_file"
- name: Build and push image in-cluster
env:
REPO_TOKEN: ${{ github.token }}

View file

@ -89,7 +89,8 @@ data:
OPS_SENTINEL_INVENTORY_STALE_MS: "30000"
OPS_SENTINEL_FUNDING_CREDIT_PENDING_MS: "300000"
OPS_SENTINEL_FUNDING_STUCK_MS: "3600000"
OPERATOR_DASHBOARD_AUTH_MODE: stub
OPERATOR_DASHBOARD_AUTH_MODE: basic
OPERATOR_DASHBOARD_AUTH_USERNAME: admin
OPERATOR_DASHBOARD_QUOTE_LIMIT: "10"
OPERATOR_DASHBOARD_TRADE_PAGE_SIZE: "20"
OPERATOR_DASHBOARD_UPSTREAM_TIMEOUT_MS: "3000"
@ -244,6 +245,31 @@ spec:
port: 8090
targetPort: 8090
---
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: operator-dashboard
namespace: unrip
annotations:
cert-manager.io/cluster-issuer: letsencrypt-production
spec:
ingressClassName: traefik
tls:
- hosts:
- doran.133011.xyz
secretName: operator-dashboard-tls
rules:
- host: doran.133011.xyz
http:
paths:
- path: /
pathType: Prefix
backend:
service:
name: operator-dashboard
port:
number: 8090
---
apiVersion: apps/v1
kind: Deployment
metadata:
@ -267,6 +293,9 @@ spec:
image: ghcr.io/example/unrip:bootstrap
imagePullPolicy: IfNotPresent
command: ["node", "src/apps/near-intents-ingest.mjs"]
env:
- name: NODE_OPTIONS
value: "--max-old-space-size=896"
ports:
- name: control-api
containerPort: 8081
@ -275,6 +304,11 @@ spec:
name: unrip-config
- secretRef:
name: unrip-secrets
resources:
requests:
memory: 256Mi
limits:
memory: 1280Mi
---
apiVersion: apps/v1
kind: Deployment
@ -402,6 +436,9 @@ spec:
image: ghcr.io/example/unrip:bootstrap
imagePullPolicy: IfNotPresent
command: ["node", "src/apps/history-writer.mjs"]
env:
- name: NODE_OPTIONS
value: "--max-old-space-size=896"
ports:
- name: control-api
containerPort: 8085
@ -410,6 +447,11 @@ spec:
name: unrip-config
- secretRef:
name: unrip-secrets
resources:
requests:
memory: 256Mi
limits:
memory: 1280Mi
---
apiVersion: apps/v1
kind: Deployment
@ -544,6 +586,9 @@ spec:
image: ghcr.io/example/unrip:bootstrap
imagePullPolicy: IfNotPresent
command: ["node", "src/apps/operator-dashboard.mjs"]
env:
- name: NODE_OPTIONS
value: "--max-old-space-size=896"
ports:
- name: http
containerPort: 8090
@ -552,3 +597,8 @@ spec:
name: unrip-config
- secretRef:
name: unrip-secrets
resources:
requests:
memory: 256Mi
limits:
memory: 1280Mi

View file

@ -136,6 +136,7 @@ fi
: "${POSTGRES_URL:=}"
: "${NEAR_INTENTS_SIGNER_PRIVATE_KEY:=}"
: "${NOTIFICATION_NTFY_TOKEN:=}"
: "${OPERATOR_DASHBOARD_AUTH_PASSWORD:=}"
secret_value() {
local key="$1"
@ -158,6 +159,10 @@ if [[ -z "$NOTIFICATION_NTFY_TOKEN" ]]; then
NOTIFICATION_NTFY_TOKEN="$(secret_value NOTIFICATION_NTFY_TOKEN)"
fi
if [[ -z "$OPERATOR_DASHBOARD_AUTH_PASSWORD" ]]; then
OPERATOR_DASHBOARD_AUTH_PASSWORD="$(secret_value OPERATOR_DASHBOARD_AUTH_PASSWORD)"
fi
if [[ -z "$POSTGRES_PASSWORD" ]]; then
POSTGRES_PASSWORD="$(python3 - <<'PY'
import secrets
@ -170,6 +175,8 @@ if [[ -z "$POSTGRES_URL" ]]; then
POSTGRES_URL="postgresql://unrip:${POSTGRES_PASSWORD}@postgres:5432/unrip"
fi
: "${OPERATOR_DASHBOARD_AUTH_PASSWORD:?set OPERATOR_DASHBOARD_AUTH_PASSWORD or pre-create OPERATOR_DASHBOARD_AUTH_PASSWORD in $APP_SECRET_NAME}"
echo "bootstrapping namespace $PROJECT_NAMESPACE"
kubectl apply -f "$ROOT_DIR/deploy/k8s/base/namespace.yaml"
@ -178,6 +185,7 @@ secret_args=(
--from-literal=NEAR_INTENTS_API_KEY="$NEAR_INTENTS_API_KEY"
--from-literal=POSTGRES_PASSWORD="$POSTGRES_PASSWORD"
--from-literal=POSTGRES_URL="$POSTGRES_URL"
--from-literal=OPERATOR_DASHBOARD_AUTH_PASSWORD="$OPERATOR_DASHBOARD_AUTH_PASSWORD"
)
if [[ -n "$NEAR_INTENTS_SIGNER_PRIVATE_KEY" ]]; then
secret_args+=(--from-literal=NEAR_INTENTS_SIGNER_PRIVATE_KEY="$NEAR_INTENTS_SIGNER_PRIVATE_KEY")
@ -224,6 +232,9 @@ fi
if [[ -n "${FORGEJO_ADMIN_PASSWORD:-}" ]]; then
forgejo_args+=(--admin-password "$FORGEJO_ADMIN_PASSWORD")
fi
if [[ -n "${OPERATOR_DASHBOARD_AUTH_PASSWORD:-}" ]]; then
forgejo_args+=(--operator-dashboard-auth-password "$OPERATOR_DASHBOARD_AUTH_PASSWORD")
fi
python3 "$ROOT_DIR/scripts/deploy/forgejo_repo_bootstrap.py" \
--forgejo-url "$FORGEJO_URL" \

View file

@ -107,6 +107,7 @@ def main():
parser.add_argument('--project-namespace', required=True)
parser.add_argument('--project-deployments', required=True)
parser.add_argument('--project-registry-secret-name', required=True)
parser.add_argument('--operator-dashboard-auth-password')
args = parser.parse_args()
client = ForgejoClient(args.forgejo_url, args.admin_username, args.admin_password, args.token)
@ -120,6 +121,14 @@ def main():
kubeconfig_b64 = base64.b64encode(Path(args.ci_kubeconfig).read_bytes()).decode()
client.upsert_secret(args.repo_owner, args.repo_name, 'KUBECONFIG_B64', kubeconfig_b64)
print('upserted repo action secret KUBECONFIG_B64')
if args.operator_dashboard_auth_password:
client.upsert_secret(
args.repo_owner,
args.repo_name,
'OPERATOR_DASHBOARD_AUTH_PASSWORD',
args.operator_dashboard_auth_password,
)
print('upserted repo action secret OPERATOR_DASHBOARD_AUTH_PASSWORD')
client.upsert_variable(args.repo_owner, args.repo_name, 'REGISTRY_HOST', args.registry_host)
client.upsert_variable(args.repo_owner, args.repo_name, 'PROJECT_NAME', args.project_name)

View file

@ -35,6 +35,7 @@ export async function startNearIntentsWs({
let framesReceived = 0;
let quoteFramesReceived = 0;
let filteredCount = 0;
let backpressureSkippedCount = 0;
let publishErrorCount = 0;
let invalidJsonCount = 0;
let lastMessageAt = null;
@ -101,44 +102,40 @@ export async function startNearIntentsWs({
if (quoteStatusSubscriptionId && subscription === quoteStatusSubscriptionId) return;
if (quoteSubscriptionId && subscription && subscription !== quoteSubscriptionId) return;
if (publishLocked) return;
const envelope = buildNearIntentsQuoteEnvelope(merged);
const rawEnvelope = buildNearIntentsRawEnvelope(merged);
try {
await producer.sendJson(rawTopic, rawEnvelope, { key: rawEnvelope.event_id });
rawPublishedCount += 1;
} catch (error) {
publishErrorCount += 1;
logger?.error('raw_publish_failed', {
namespace,
topic: rawTopic,
details: {
error: serializeError(error),
quote_id: rawEnvelope.payload?.message?.quote_id || rawEnvelope.payload?.message?.quote_hash || null,
},
});
}
if (!envelope) return;
assertNormalizedSwapDemand(envelope);
const assetIn = envelope.payload?.asset_in;
const assetOut = envelope.payload?.asset_out;
if (!assetIn || !assetOut) return;
const pairAllowed = matchesPair
? await matchesPair(assetIn, assetOut)
: matchesPairFilter(assetIn, assetOut, getPairFilter());
if (!pairAllowed) {
filteredCount += 1;
if (publishLocked) {
backpressureSkippedCount += 1;
return;
}
lastMatchingQuoteAt = new Date().toISOString();
publishLocked = true;
let envelope = null;
let rawEnvelope = null;
let assetIn = null;
let assetOut = null;
let publishTopic = rawTopic;
try {
envelope = buildNearIntentsQuoteEnvelope(merged);
rawEnvelope = buildNearIntentsRawEnvelope(merged);
await producer.sendJson(rawTopic, rawEnvelope, { key: rawEnvelope.event_id });
rawPublishedCount += 1;
if (!envelope) return;
assertNormalizedSwapDemand(envelope);
assetIn = envelope.payload?.asset_in;
assetOut = envelope.payload?.asset_out;
if (!assetIn || !assetOut) return;
const pairAllowed = matchesPair
? await matchesPair(assetIn, assetOut)
: matchesPairFilter(assetIn, assetOut, getPairFilter());
if (!pairAllowed) {
filteredCount += 1;
return;
}
lastMatchingQuoteAt = new Date().toISOString();
publishTopic = normalizedTopic;
await producer.sendJson(normalizedTopic, envelope, { key: envelope.payload.quote_id });
publishedCount += 1;
lastPublishedAt = new Date().toISOString();
@ -146,14 +143,17 @@ export async function startNearIntentsWs({
onPublish(envelope, publishedCount);
} catch (error) {
publishErrorCount += 1;
logger?.error('publish_failed', {
logger?.error(publishTopic === rawTopic ? 'raw_publish_failed' : 'publish_failed', {
namespace,
topic: normalizedTopic,
pair: `${assetIn}->${assetOut}`,
topic: publishTopic,
pair: assetIn && assetOut ? `${assetIn}->${assetOut}` : null,
details: {
raw_topic: rawTopic,
error: serializeError(error),
quote_id: envelope.payload?.quote_id,
quote_id: envelope?.payload?.quote_id
|| rawEnvelope?.payload?.message?.quote_id
|| rawEnvelope?.payload?.message?.quote_hash
|| null,
},
});
} finally {
@ -221,6 +221,7 @@ export async function startNearIntentsWs({
frames_received: framesReceived,
quote_frames_received: quoteFramesReceived,
filtered_count: filteredCount,
backpressure_skipped_count: backpressureSkippedCount,
raw_published_count: rawPublishedCount,
published_count: publishedCount,
publish_error_count: publishErrorCount,

View file

@ -13,6 +13,13 @@ class BootstrapScriptStaticTest(unittest.TestCase):
self.assertIn('ghcr.io/example/unrip:bootstrap', source)
self.assertIn('kubectl kustomize "$ROOT_DIR/deploy/k8s/base"', source)
def test_bootstrap_preserves_operator_dashboard_password_secret(self):
source = (ROOT / 'scripts/deploy/bootstrap.sh').read_text()
self.assertIn('OPERATOR_DASHBOARD_AUTH_PASSWORD="$(secret_value OPERATOR_DASHBOARD_AUTH_PASSWORD)"', source)
self.assertIn('OPERATOR_DASHBOARD_AUTH_PASSWORD:?set OPERATOR_DASHBOARD_AUTH_PASSWORD', source)
self.assertIn('--from-literal=OPERATOR_DASHBOARD_AUTH_PASSWORD="$OPERATOR_DASHBOARD_AUTH_PASSWORD"', source)
self.assertIn('--operator-dashboard-auth-password "$OPERATOR_DASHBOARD_AUTH_PASSWORD"', source)
if __name__ == '__main__':
unittest.main()

View file

@ -0,0 +1,20 @@
import test from 'node:test';
import assert from 'node:assert/strict';
import { readFileSync } from 'node:fs';
const workflow = readFileSync(new URL('../.forgejo/workflows/deploy.yml', import.meta.url), 'utf8');
const forgejoBootstrap = readFileSync(new URL('../scripts/deploy/forgejo_repo_bootstrap.py', import.meta.url), 'utf8');
// Static check of the deploy workflow text: the "Upsert runtime secrets" step must read the
// dashboard password from the repo action secret, fail with an explicit message when it is
// missing, and patch it into the "${PROJECT_NAME}-secrets" secret from a patch file.
// NOTE(review): these assertions only prove the snippets exist somewhere in the workflow; they
// do not check the step's position relative to the manifest apply step — confirm whether an
// explicit ordering assertion is wanted.
test('deploy workflow upserts dashboard password before applying public dashboard manifest', () => {
  const requiredSnippets = [
    /name: Upsert runtime secrets/,
    /OPERATOR_DASHBOARD_AUTH_PASSWORD: \$\{\{ secrets\.OPERATOR_DASHBOARD_AUTH_PASSWORD \}\}/,
    /missing required repo action secret OPERATOR_DASHBOARD_AUTH_PASSWORD/,
    /patch secret "\$\{PROJECT_NAME\}-secrets"/,
    /--patch-file "\$patch_file"/,
  ];
  // Checked in order so the first missing snippet is the one reported.
  for (const snippet of requiredSnippets) {
    assert.match(workflow, snippet);
  }
});
// Static check of the Forgejo bootstrap script text: it must accept the dashboard password
// CLI flag, reference the secret name, and print the "upserted" confirmation message.
test('Forgejo bootstrap can publish dashboard password as a repo action secret', () => {
  const requiredSnippets = [
    /--operator-dashboard-auth-password/,
    /OPERATOR_DASHBOARD_AUTH_PASSWORD/,
    /upserted repo action secret OPERATOR_DASHBOARD_AUTH_PASSWORD/,
  ];
  // Checked in order so the first missing snippet is the one reported.
  for (const snippet of requiredSnippets) {
    assert.match(forgejoBootstrap, snippet);
  }
});

View file

@ -143,3 +143,67 @@ test('near intents websocket close is reentrant-safe when close emits an error',
mock.restore();
}
});
// Verifies publish backpressure: while the first quote's raw publish is still pending, a second
// quote frame must be counted as skipped rather than published, and once the pending publish is
// released the first quote must go out on the raw topic and then the normalized topic.
test('near intents websocket skips quote frames while Kafka publish is backpressured', async () => {
const mock = installMockWebSocket();
// Resolver of the promise that holds the first raw publish open; assigned inside sendJson.
let releaseRawPublish;
// Every sendJson call is recorded here, in call order.
const sends = [];
// Producer stub: records each send and blocks only the first raw-topic send until the test
// calls releaseRawPublish(); later sends resolve immediately because the resolver is then set.
const producer = {
async sendJson(topic, event) {
sends.push({ topic, event });
if (topic === 'raw.near_intents.quote' && !releaseRawPublish) {
await new Promise((resolve) => {
releaseRawPublish = resolve;
});
}
},
};
const client = await startNearIntentsWs({
apiKey: 'api-key',
wsUrl: 'wss://relay.example/ws',
pairFilter: ['btc', 'eure'],
producer,
rawTopic: 'raw.near_intents.quote',
normalizedTopic: 'norm.swap_demand',
reconnectDelayMs: 1,
});
// Builds an event frame for a btc -> eure quote with the given quote id.
function quote(quoteId) {
return {
method: 'event',
params: {
data: {
quote_id: quoteId,
defuse_asset_identifier_in: 'btc',
defuse_asset_identifier_out: 'eure',
exact_amount_in: '100',
},
},
};
}
try {
mock.instances[0].open();
// Two frames back to back: the first starts a publish that stays pending, the second arrives
// while that publish is still in flight.
mock.instances[0].emit('message', { data: JSON.stringify(quote('quote-1')) });
mock.instances[0].emit('message', { data: JSON.stringify(quote('quote-2')) });
await delay(5);
// While the raw publish is held, only quote-1 has reached the producer and quote-2 is counted
// as skipped.
assert.equal(client.getState().backpressure_skipped_count, 1);
assert.deepEqual(sends.map((entry) => entry.event.payload?.quote_id || entry.event.payload?.message?.quote_id), [
'quote-1',
]);
// Let the held raw publish finish so the handler can continue to the normalized publish.
releaseRawPublish();
await delay(5);
assert.equal(client.getState().raw_published_count, 1);
assert.equal(client.getState().published_count, 1);
// Exactly one raw send followed by one normalized send; nothing was sent for quote-2.
assert.deepEqual(sends.map((entry) => entry.topic), [
'raw.near_intents.quote',
'norm.swap_demand',
]);
} finally {
// Always close the client and restore the mocked WebSocket, even when an assertion fails.
client.close();
mock.restore();
}
});

View file

@ -0,0 +1,21 @@
import test from 'node:test';
import assert from 'node:assert/strict';
import { readFileSync } from 'node:fs';
const manifest = readFileSync(new URL('../deploy/k8s/base/unrip.yaml', import.meta.url), 'utf8');
// Static check of the manifest text: the dashboard must run in basic-auth mode with the
// "admin" username, the manifest must not contain a password key, and a secretRef to
// "unrip-secrets" must be present.
test('operator dashboard production manifest uses basic auth with password from secret', () => {
  const authModePattern = /OPERATOR_DASHBOARD_AUTH_MODE:\s+basic/;
  const authUsernamePattern = /OPERATOR_DASHBOARD_AUTH_USERNAME:\s+admin/;
  const inlinePasswordPattern = /OPERATOR_DASHBOARD_AUTH_PASSWORD:/;
  const secretRefPattern = /secretRef:\s*\n\s+name: unrip-secrets/;

  assert.match(manifest, authModePattern);
  assert.match(manifest, authUsernamePattern);
  // The password key must never appear in the manifest text itself.
  assert.doesNotMatch(manifest, inlinePasswordPattern);
  assert.match(manifest, secretRefPattern);
});
// Static check of the manifest text: an Ingress named operator-dashboard must exist with the
// cert-manager issuer annotation, the traefik ingress class, the expected host and TLS secret
// name, and a backend pointing at the operator-dashboard service on port 8090.
test('operator dashboard has a public Traefik ingress with TLS', () => {
  const requiredSnippets = [
    /kind: Ingress\s*\nmetadata:\s*\n\s+name: operator-dashboard/,
    /cert-manager\.io\/cluster-issuer: letsencrypt-production/,
    /ingressClassName: traefik/,
    /host: doran\.133011\.xyz/,
    /secretName: operator-dashboard-tls/,
    /service:\s*\n\s+name: operator-dashboard\s*\n\s+port:\s*\n\s+number: 8090/,
  ];
  // Checked in order so the first missing snippet is the one reported.
  for (const snippet of requiredSnippets) {
    assert.match(manifest, snippet);
  }
});

View file

@ -0,0 +1,23 @@
import test from 'node:test';
import assert from 'node:assert/strict';
import { readFileSync } from 'node:fs';
const manifest = readFileSync(new URL('../deploy/k8s/base/unrip.yaml', import.meta.url), 'utf8');
/**
 * Extract the text of one Deployment document from the module-level `manifest` string.
 *
 * The match starts at the `kind: Deployment` line whose `metadata.name` equals `name` and runs
 * up to (not including) the next `---` / `apiVersion:` document separator, or to the end of the
 * manifest when no further document follows.
 *
 * @param {string} name - Deployment name to look up; it is matched literally.
 * @returns {string} The matched slice of the manifest.
 * @throws {AssertionError} When no Deployment with that name is found.
 */
function deploymentBlock(name) {
  // Escape regex metacharacters so a name such as "a.b" cannot match a different deployment
  // (for example "axb") through the "." wildcard.
  const escapedName = name.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  const pattern = new RegExp(
    `kind: Deployment\\nmetadata:\\n name: ${escapedName}\\n[\\s\\S]*?(?=\\n---\\napiVersion:|\\n?$)`,
  );
  const match = manifest.match(pattern);
  assert.ok(match, `expected deployment ${name}`);
  return match[0];
}
// Registers one test per deployment: each Deployment block must set
// NODE_OPTIONS to "--max-old-space-size=896" and declare a 256Mi memory request with a 1280Mi
// memory limit.
['near-intents-ingest', 'history-writer', 'operator-dashboard'].forEach((deploymentName) => {
  test(`${deploymentName} has memory guardrails for live quote pressure`, () => {
    const yamlBlock = deploymentBlock(deploymentName);
    const heapCapPattern = /name: NODE_OPTIONS\s+value: "--max-old-space-size=896"/;
    const memoryResourcesPattern = /resources:\s+requests:\s+memory: 256Mi\s+limits:\s+memory: 1280Mi/;
    assert.match(yamlBlock, heapCapPattern);
    assert.match(yamlBlock, memoryResourcesPattern);
  });
});