Skip to content

Commit 85fa2bf

Browse files
committed
fix: harden test suite for real K8s — poll_until, retry loops, SDK methods
1 parent 2c1387b commit 85fa2bf

13 files changed

Lines changed: 232 additions & 141 deletions

docs/NORTHSTAR.txt

Lines changed: 47 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -6688,6 +6688,53 @@ TEST ISOLATION + CLEANUP
66886688
isolation as unit tests.
66896689

66906690

6691+
REAL K8S TESTING RULES — HARD-WON, NON-NEGOTIABLE
6692+
──────────────────────────────────────────────────
6693+
These rules prevent flaky CI failures on real K8s (--backend real).
6694+
Every new test touching real K8s MUST follow these.
6695+
6696+
1. NEVER use raw k8s.update_deployment/service/statefulset() in tests (other than contract tests, see rule 5)
6697+
that run on real K8s. Deployments have controllers that race to
6698+
update the object and bump its resourceVersion → 409 Conflict. Use SDK methods (ps.scale,
6699+
deploy.deploy) which have built-in retry logic.
6700+
6701+
2. NEVER assert immediately after delete. K8s deletion is async.
6702+
Use poll_until(fn=_get_or_none, predicate=lambda r: r is None).
6703+
6704+
3. NEVER use time.sleep() for stabilization. Use poll_until() or
6705+
wait_for_port_open(). Sleeps waste CI time and still flake.
6706+
6707+
4. NEVER create Deployments/Services with raw k8s.create_*() in tests
6708+
that run on real K8s. Use factory.deploy.deploy() which sets
6709+
correct labels for cleanup.
6710+
6711+
5. Contract test updates MUST use read-modify-write + retry loop, and fail explicitly (for/else + pytest.fail) if every attempt conflicts:
6712+
for _attempt in range(5):
6713+
current = k8s.get_deployment(ns, name)
6714+
current["spec"]["replicas"] = 3
6715+
try:
6716+
k8s.update_deployment(ns, name, current)
6717+
break
6718+
except FakeConflictError:
6719+
continue
6720+
6721+
6. Namespace tests MUST clean up — use request.addfinalizer() and
6722+
k8ut- prefix so session cleanup catches strays.
6723+
6724+
7. _wipe_namespace_resources() MUST wait for ALL resource types
6725+
(Deployment, StatefulSet, Service, Ingress, NetworkPolicy, PVC,
6726+
ConfigMap, Secret) — not just a subset.
6727+
6728+
8. Know which fixtures route to real K8s:
6729+
- factory, k8s_client → real with --backend real
6730+
- app_factory, deployed_factory, fake_k8s → always FakeK8sClient
6731+
6732+
Shared E2E helpers (tests/e2e/helpers.py):
6733+
poll_until, wait_for_pods_ready, wait_for_port_open,
6734+
wait_for_sts_ready, wait_for_service_deleted,
6735+
wait_for_lb_address, wait_for_no_pods, find_running_pod
6736+
6737+
66916738
UNIT TEST EXAMPLES
66926739
──────────────────
66936740
def test_apps_create(fake_client):

tests/apps/test_destroy.py

Lines changed: 2 additions & 34 deletions
Original file line number | Diff line number | Diff line change
@@ -90,42 +90,10 @@ def test_destroy_cleans_up_deployments_and_services(
9090
) -> None:
9191
"""Destroy also deletes deployments and services with matching labels."""
9292
apps_service.create("myapi")
93+
# Use SDK deploy (creates Deployment + Service with correct labels)
94+
factory.deploy.deploy("myapi", image="nginx:1.27", wait=False)
9395
k8s = factory.k8s
9496
ns = factory.namespace
95-
# Simulate a deployment and service existing for this app
96-
app_labels = {
97-
f"{factory.domain}/managed-by": factory.prefix,
98-
f"{factory.domain}/app": "myapi",
99-
}
100-
k8s.create_deployment(
101-
ns,
102-
{
103-
"apiVersion": "apps/v1",
104-
"kind": "Deployment",
105-
"metadata": {"name": f"{factory.prefix}-myapi-web", "labels": dict(app_labels)},
106-
"spec": {
107-
"replicas": 1,
108-
"selector": {"matchLabels": {"app": "myapi-web"}},
109-
"template": {
110-
"metadata": {"labels": {"app": "myapi-web"}},
111-
"spec": {"containers": [{"name": "web", "image": "nginx"}]},
112-
},
113-
},
114-
},
115-
)
116-
k8s.create_service(
117-
ns,
118-
{
119-
"apiVersion": "v1",
120-
"kind": "Service",
121-
"metadata": {"name": f"{factory.prefix}-myapi-web", "labels": dict(app_labels)},
122-
"spec": {
123-
"type": "ClusterIP",
124-
"selector": {"app": "myapi-web"},
125-
"ports": [{"port": 80, "targetPort": 8080, "protocol": "TCP"}],
126-
},
127-
},
128-
)
12997
apps_service.destroy("myapi")
13098
# Verify everything is gone
13199
assert k8s.list_deployments(ns, labels={f"{factory.domain}/app": "myapi"}) == []

tests/apps/test_info.py

Lines changed: 4 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -2,7 +2,6 @@
22

33
from __future__ import annotations
44

5-
import json
65
from unittest.mock import patch
76

87
import pytest
@@ -38,21 +37,10 @@ def test_info_with_formation_data(
3837
) -> None:
3938
"""info() parses formation JSON data into ProcessFormation."""
4039
apps_service.create("myapi")
41-
# Write formation data directly
42-
k8s = factory.k8s
43-
ns = factory.namespace
44-
p = factory.prefix
45-
cm = k8s.get_configmap(ns, f"{p}-formation-myapi")
46-
cm["data"] = {
47-
"web": json.dumps(
48-
{
49-
"replicas": 3,
50-
"command": "gunicorn app:app",
51-
"command_source": "manual",
52-
}
53-
),
54-
}
55-
k8s.update_configmap(ns, f"{p}-formation-myapi", cm)
40+
factory.deploy.deploy("myapi", image="nginx:1.27", wait=False)
41+
# Use SDK methods (they handle 409 retries on real K8s)
42+
factory.ps.scale("myapi", {"web": 3})
43+
factory.ps.set_commands("myapi", {"web": "gunicorn app:app"}, no_restart=True)
5644
app = apps_service.info("myapi")
5745
assert "web" in app.process_types
5846
assert app.process_types["web"].replicas == 3

tests/apps/test_rename.py

Lines changed: 2 additions & 34 deletions
Original file line number | Diff line number | Diff line change
@@ -77,42 +77,10 @@ def test_rename_updates_deployments_and_services(
7777
) -> None:
7878
"""Rename updates labels on deployments and services too."""
7979
apps_service.create("oldapp")
80+
# Use SDK deploy (creates Deployment + Service with correct labels)
81+
factory.deploy.deploy("oldapp", image="nginx:1.27", wait=False)
8082
k8s = factory.k8s
8183
ns = factory.namespace
82-
# Simulate deployment and service
83-
app_labels = {
84-
f"{factory.domain}/managed-by": factory.prefix,
85-
f"{factory.domain}/app": "oldapp",
86-
}
87-
k8s.create_deployment(
88-
ns,
89-
{
90-
"apiVersion": "apps/v1",
91-
"kind": "Deployment",
92-
"metadata": {"name": f"{factory.prefix}-oldapp-web", "labels": dict(app_labels)},
93-
"spec": {
94-
"replicas": 1,
95-
"selector": {"matchLabels": {"app": "oldapp-web"}},
96-
"template": {
97-
"metadata": {"labels": {"app": "oldapp-web"}},
98-
"spec": {"containers": [{"name": "web", "image": "nginx"}]},
99-
},
100-
},
101-
},
102-
)
103-
k8s.create_service(
104-
ns,
105-
{
106-
"apiVersion": "v1",
107-
"kind": "Service",
108-
"metadata": {"name": f"{factory.prefix}-oldapp-web", "labels": dict(app_labels)},
109-
"spec": {
110-
"type": "ClusterIP",
111-
"selector": {"app": "oldapp-web"},
112-
"ports": [{"port": 80, "targetPort": 8080, "protocol": "TCP"}],
113-
},
114-
},
115-
)
11684
apps_service.rename("oldapp", "newapp")
11785
# Verify labels were updated
11886
deps = k8s.list_deployments(ns, labels={f"{factory.domain}/app": "newapp"})

tests/apps/test_status.py

Lines changed: 18 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -2,14 +2,13 @@
22

33
from __future__ import annotations
44

5-
import time
6-
75
import pytest
86
from click.testing import CliRunner
97

108
from kuberoku.cli.main import cli
119
from kuberoku.exceptions import AppNotFoundError
1210
from kuberoku.factory import KuberokuFactory
11+
from tests.e2e.helpers import poll_until
1312
from tests.services.conftest import _create_fake_pod
1413

1514

@@ -36,12 +35,14 @@ def test_status_with_deploy(
3635
if request.config.getoption("--backend") == "fake":
3736
_create_fake_pod(factory, "myapi", "web", "pod-a", image="nginx:1.27")
3837
else:
39-
# Real K8s: deployment creates its own pod — wait for it
40-
for _ in range(30):
41-
st = factory.apps.status("myapi")
42-
if st.dynos:
43-
break
44-
time.sleep(1)
38+
# Real K8s: deployment creates its own pod — poll until visible
39+
poll_until(
40+
fn=lambda: factory.apps.status("myapi"),
41+
predicate=lambda st: len(st.dynos) >= 1,
42+
timeout=60,
43+
interval=1,
44+
message="Pod never appeared in status after deploy",
45+
)
4546
st = factory.apps.status("myapi")
4647
assert len(st.dynos) >= 1
4748
assert st.dynos[0].process_type == "web"
@@ -105,6 +106,15 @@ def test_status_cli_with_release_and_processes(
105106
factory.deploy.deploy("myapi", image="nginx:1.27", wait=False)
106107
if request.config.getoption("--backend") == "fake":
107108
_create_fake_pod(factory, "myapi", "web", "pod-a", image="nginx:1.27")
109+
else:
110+
# Real K8s: wait for pod before checking CLI output
111+
poll_until(
112+
fn=lambda: factory.apps.status("myapi"),
113+
predicate=lambda st: len(st.dynos) >= 1,
114+
timeout=60,
115+
interval=1,
116+
message="Pod never appeared in status after deploy",
117+
)
108118
factory.config.set("myapi", {"DB": "pg://x"})
109119
factory.config.set("myapi", {"KEY": "secret"}, secret=True)
110120
factory.addons.create("myapi", "postgres")

tests/conftest.py

Lines changed: 19 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -226,23 +226,37 @@ def _wipe_namespace_resources(ns_name: str) -> None:
226226
remaining = 0
227227
with contextlib.suppress(Exception):
228228
remaining += len(
229-
net_v1.list_namespaced_network_policy(ns_name, label_selector=label_sel).items
229+
apps_v1.list_namespaced_deployment(ns_name, label_selector=label_sel).items
230230
)
231231
with contextlib.suppress(Exception):
232232
remaining += len(
233-
core.list_namespaced_config_map(ns_name, label_selector=label_sel).items
233+
apps_v1.list_namespaced_stateful_set(ns_name, label_selector=label_sel).items
234234
)
235235
with contextlib.suppress(Exception):
236236
remaining += len(
237-
core.list_namespaced_secret(ns_name, label_selector=label_sel).items
237+
core.list_namespaced_service(ns_name, label_selector=label_sel).items
238238
)
239239
with contextlib.suppress(Exception):
240240
remaining += len(
241-
core.list_namespaced_service(ns_name, label_selector=label_sel).items
241+
net_v1.list_namespaced_ingress(ns_name, label_selector=label_sel).items
242242
)
243243
with contextlib.suppress(Exception):
244244
remaining += len(
245-
apps_v1.list_namespaced_deployment(ns_name, label_selector=label_sel).items
245+
net_v1.list_namespaced_network_policy(ns_name, label_selector=label_sel).items
246+
)
247+
with contextlib.suppress(Exception):
248+
remaining += len(
249+
core.list_namespaced_persistent_volume_claim(
250+
ns_name, label_selector=label_sel
251+
).items
252+
)
253+
with contextlib.suppress(Exception):
254+
remaining += len(
255+
core.list_namespaced_config_map(ns_name, label_selector=label_sel).items
256+
)
257+
with contextlib.suppress(Exception):
258+
remaining += len(
259+
core.list_namespaced_secret(ns_name, label_selector=label_sel).items
246260
)
247261
if remaining == 0:
248262
return

tests/contract/test_deployment_crud.py

Lines changed: 15 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -11,6 +11,7 @@
1111
import pytest
1212

1313
from tests.backends.fake import FakeConflictError, FakeNotFoundError
14+
from tests.e2e.helpers import poll_until
1415

1516

1617
def _deployment_body(
@@ -86,8 +87,20 @@ def test_delete_then_get_raises(self, k8s_client, tmp_namespace):
8687
body = _deployment_body("dep-del")
8788
k8s_client.create_deployment(tmp_namespace, body)
8889
k8s_client.delete_deployment(tmp_namespace, "dep-del")
89-
with pytest.raises(FakeNotFoundError):
90-
k8s_client.get_deployment(tmp_namespace, "dep-del")
90+
91+
def _get_or_none() -> dict[str, Any] | None:
92+
try:
93+
return k8s_client.get_deployment(tmp_namespace, "dep-del")
94+
except FakeNotFoundError:
95+
return None
96+
97+
poll_until(
98+
fn=_get_or_none,
99+
predicate=lambda result: result is None,
100+
timeout=30,
101+
interval=0.5,
102+
message="Deployment dep-del still exists after deletion",
103+
)
91104

92105
def test_update_changes_labels(self, k8s_client, tmp_namespace):
93106
body = _deployment_body("dep-upd", extra_meta_labels={"version": "v1"})

tests/contract/test_ingress_crud.py

Lines changed: 26 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -8,7 +8,6 @@
88
from __future__ import annotations
99

1010
import contextlib
11-
import time
1211
from typing import Any
1312

1413
import pytest
@@ -74,13 +73,19 @@ def test_get_not_found_raises(self, k8s_client, tmp_namespace) -> None:
7473

7574

7675
class TestIngressUpdate:
77-
def test_update_changes_rules(self, k8s_client, tmp_namespace, request) -> None:
76+
def test_update_changes_rules(self, k8s_client, tmp_namespace) -> None:
7877
k8s_client.create_ingress(tmp_namespace, _ingress_body("my-ingress", "old.com"))
79-
if request.config.getoption("--backend") == "real":
80-
time.sleep(1) # K8s controller mutates ingress after create
81-
ingress = k8s_client.get_ingress(tmp_namespace, "my-ingress")
82-
ingress["spec"]["rules"][0]["host"] = "new.com"
83-
k8s_client.update_ingress(tmp_namespace, "my-ingress", ingress)
78+
# Read-modify-write with retry: the real K8s ingress controller may mutate the ingress concurrently
79+
for _attempt in range(5):
80+
ingress = k8s_client.get_ingress(tmp_namespace, "my-ingress")
81+
ingress["spec"]["rules"][0]["host"] = "new.com"
82+
try:
83+
k8s_client.update_ingress(tmp_namespace, "my-ingress", ingress)
84+
break
85+
except FakeConflictError:
86+
continue
87+
else:
88+
pytest.fail("Ingress update failed after 5 retries due to conflict")
8489
updated = k8s_client.get_ingress(tmp_namespace, "my-ingress")
8590
assert updated["spec"]["rules"][0]["host"] == "new.com"
8691

@@ -102,8 +107,20 @@ class TestIngressDelete:
102107
def test_delete_removes(self, k8s_client, tmp_namespace) -> None:
103108
k8s_client.create_ingress(tmp_namespace, _ingress_body("my-ingress"))
104109
k8s_client.delete_ingress(tmp_namespace, "my-ingress")
105-
with pytest.raises(FakeNotFoundError):
106-
k8s_client.get_ingress(tmp_namespace, "my-ingress")
110+
111+
def _get_or_none() -> dict[str, Any] | None:
112+
try:
113+
return k8s_client.get_ingress(tmp_namespace, "my-ingress")
114+
except FakeNotFoundError:
115+
return None
116+
117+
poll_until(
118+
fn=_get_or_none,
119+
predicate=lambda result: result is None,
120+
timeout=30,
121+
interval=0.5,
122+
message="Ingress my-ingress still exists after deletion",
123+
)
107124

108125
def test_delete_not_found_raises(self, k8s_client, tmp_namespace) -> None:
109126
with pytest.raises(FakeNotFoundError):

0 commit comments

Comments
 (0)