Fix Kubernetes smoke pod restart selection
All checks were successful
deploy / deploy (push) Successful in 16s

This commit is contained in:
philipp 2026-04-18 11:33:23 +02:00
parent e86de1b6a9
commit 38c7811252

View file

@ -107,20 +107,27 @@ def run_json(command, input_text=None, timeout=None):
def pod_ready(pod):
    """Return True when *pod* is usable: running, not terminating, all containers ready.

    Args:
        pod: A pod object as a dict, in the shape produced by
            ``kubectl get pods -o json`` (one entry of ``items``).

    Returns:
        bool: ``True`` only if the pod has no ``metadata.deletionTimestamp``,
        its ``status.phase`` is ``'Running'``, it reports at least one
        container status, and every container status has a truthy ``ready``.
    """
    # `or {}` tolerates a missing key as well as an explicit null value.
    metadata = pod.get('metadata') or {}
    # A pod that has been asked to terminate carries a deletionTimestamp;
    # reject it so a just-deleted pod is never selected as the live one.
    if metadata.get('deletionTimestamp'):
        return False
    status = pod.get('status') or {}
    if status.get('phase') != 'Running':
        return False
    statuses = status.get('containerStatuses') or []
    # An empty status list means readiness is unknown, so treat it as not ready.
    return bool(statuses) and all(entry.get('ready') for entry in statuses)
def get_collector_pod(): def get_collector_pod(exclude_names=None):
exclude_names = set(exclude_names or [])
selector = 'app.kubernetes.io/name=orderbooks,app.kubernetes.io/component=collector' selector = 'app.kubernetes.io/name=orderbooks,app.kubernetes.io/component=collector'
deadline = time.time() + wait_seconds deadline = time.time() + wait_seconds
last = None last = None
while time.time() <= deadline: while time.time() <= deadline:
pods = run_json([kubectl, '-n', namespace, 'get', 'pods', '-l', selector, '-o', 'json']) pods = run_json([kubectl, '-n', namespace, 'get', 'pods', '-l', selector, '-o', 'json'])
items = pods.get('items', []) items = pods.get('items', [])
ready = [pod for pod in items if pod_ready(pod)] ready = [
pod
for pod in items
if pod_ready(pod) and pod.get('metadata', {}).get('name') not in exclude_names
]
if ready: if ready:
ready.sort(key=lambda pod: pod.get('metadata', {}).get('creationTimestamp', '')) ready.sort(key=lambda pod: pod.get('metadata', {}).get('creationTimestamp', ''))
return ready[-1]['metadata']['name'], ready[-1] return ready[-1]['metadata']['name'], ready[-1]
@ -396,6 +403,7 @@ summary = {
'cronjob': cronjob, 'cronjob': cronjob,
'raw_dir': raw_dir, 'raw_dir': raw_dir,
'manifest_dir': manifest_dir, 'manifest_dir': manifest_dir,
'wait_seconds': wait_seconds,
'upload_min_age_seconds': upload_min_age_seconds, 'upload_min_age_seconds': upload_min_age_seconds,
'checks': checks, 'checks': checks,
'failures': failures, 'failures': failures,
@ -416,7 +424,7 @@ try:
rollout_after = run([kubectl, '-n', namespace, 'rollout', 'status', f'deployment/{deployment}', f'--timeout={wait_seconds}s']) rollout_after = run([kubectl, '-n', namespace, 'rollout', 'status', f'deployment/{deployment}', f'--timeout={wait_seconds}s'])
if rollout_after['exit_code'] != 0: if rollout_after['exit_code'] != 0:
raise RuntimeError('collector deployment did not recover after pod delete') raise RuntimeError('collector deployment did not recover after pod delete')
new_pod, new_pod_obj = get_collector_pod() new_pod, new_pod_obj = get_collector_pod(exclude_names={pod_name})
old_check = exec_python(new_pod, raw_check_code, [old_file['path'], old_file['actual_sha256'], str(old_file['rows_parsed'])]) old_check = exec_python(new_pod, raw_check_code, [old_file['path'], old_file['actual_sha256'], str(old_file['rows_parsed'])])
if not old_check.get('sha256_matches') or not old_check.get('row_count_matches'): if not old_check.get('sha256_matches') or not old_check.get('row_count_matches'):
raise RuntimeError('old raw file changed or stopped parsing after pod restart') raise RuntimeError('old raw file changed or stopped parsing after pod restart')