Skip to content

Commit 99db2dc

Browse files
tmshort authored and claude committed
feat(experimental): run catalogd and operator-controller with 2 replicas
The experimental e2e suite uses a 2-node kind cluster, making it a natural fit to validate HA behaviour. Set replicas=2 for both components in helm/experimental.yaml so the experimental and experimental-e2e manifests exercise the multi-replica path end-to-end. This is safe for operator-controller (no leader-only HTTP servers) and for catalogd now that the catalog server starts on all pods via NeedLeaderElection=false, preventing the rolling-update deadlock that would arise if the server were leader-only. Also adds a @CatalogdHA experimental e2e scenario that force-deletes the catalogd leader pod and verifies that a new leader is elected and the catalog resumes serving. The scenario is gated on a 2-node cluster (detected in BeforeSuite and reflected in the featureGates map), so it is automatically skipped in the standard 1-node e2e suite. The experimental e2e timeout is bumped from 20m to 25m to accommodate leader re-election time (~163s worst case). Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> Signed-off-by: Todd Short <tshort@redhat.com>
1 parent a375d74 commit 99db2dc

9 files changed

Lines changed: 115 additions & 14 deletions

File tree

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -316,7 +316,7 @@ test-experimental-e2e: COVERAGE_NAME := experimental-e2e
316316
test-experimental-e2e: export MANIFEST := $(EXPERIMENTAL_RELEASE_MANIFEST)
317317
test-experimental-e2e: export INSTALL_DEFAULT_CATALOGS := false
318318
test-experimental-e2e: PROMETHEUS_VALUES := helm/prom_experimental.yaml
319-
test-experimental-e2e: E2E_TIMEOUT := 20m
319+
test-experimental-e2e: E2E_TIMEOUT := 25m
320320
test-experimental-e2e: run-internal prometheus e2e e2e-coverage kind-clean #HELP Run experimental e2e test suite on local kind cluster
321321

322322
.PHONY: prometheus

helm/experimental.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@
77
# to pull in resources or additions
88
options:
99
operatorController:
10+
deployment:
11+
replicas: 2
1012
features:
1113
enabled:
1214
- SingleOwnNamespaceInstallSupport
@@ -20,6 +22,8 @@ options:
2022
# Use with {{- if has "FeatureGate" .Values.options.catalogd.features.enabled }}
2123
# to pull in resources or additions
2224
catalogd:
25+
deployment:
26+
replicas: 2
2327
features:
2428
enabled:
2529
- APIV1MetasHandler

internal/catalogd/serverutil/serverutil.go

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -30,9 +30,10 @@ type CatalogServerConfig struct {
3030
}
3131

3232
// AddCatalogServerToManager adds the catalog HTTP server to the manager and registers
33-
// a readiness check that passes only when this pod is the leader and actively serving.
34-
// The listener is created lazily inside Start() so non-leader pods never bind the port,
35-
// which ensures the readiness check correctly excludes them from Service endpoints.
33+
// a readiness check that passes once the server has started serving. Because
34+
// NeedLeaderElection returns false, Start() is called on every pod immediately, so all
35+
// replicas bind the catalog port and become ready. Non-leader pods serve requests but
36+
// return 404 (empty local cache); callers are expected to retry.
3637
func AddCatalogServerToManager(mgr ctrl.Manager, cfg CatalogServerConfig, cw *certwatcher.CertWatcher) error {
3738
shutdownTimeout := 30 * time.Second
3839
r := &catalogServerRunnable{
@@ -52,11 +53,10 @@ func AddCatalogServerToManager(mgr ctrl.Manager, cfg CatalogServerConfig, cw *ce
5253
return fmt.Errorf("error adding catalog server to manager: %w", err)
5354
}
5455

55-
// Register a readiness check that passes only once Start() has been called (i.e.
56-
// this pod holds the leader lease and the catalog server is actively serving).
57-
// Non-leader pods never reach Start(), so they remain not-ready and are excluded
58-
// from Service endpoints — preventing catalog traffic from hitting a pod that
59-
// isn't serving the catalog port.
56+
// Register a readiness check that passes once Start() has been called and the
57+
// server is actively serving. All pods reach Start() (NeedLeaderElection=false),
58+
// so all replicas become ready and receive traffic; non-leaders return 404 until
59+
// they win the leader lease and populate their local cache.
6060
if err := mgr.AddReadyzCheck("catalog-server", r.readyzCheck()); err != nil {
6161
return fmt.Errorf("error adding catalog server readiness check: %w", err)
6262
}
@@ -112,7 +112,8 @@ func (r *catalogServerRunnable) Start(ctx context.Context) error {
112112
defer cancel()
113113
}
114114
if err := r.server.Shutdown(shutdownCtx); err != nil {
115-
// Shutdown errors are logged by the manager; nothing actionable here.
115+
// Shutdown errors (e.g. context deadline exceeded) are not actionable;
116+
// the process is terminating regardless.
116117
_ = err
117118
}
118119
}()

manifests/experimental-e2e.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2621,7 +2621,7 @@ metadata:
26212621
namespace: olmv1-system
26222622
spec:
26232623
minReadySeconds: 5
2624-
replicas: 1
2624+
replicas: 2
26252625
strategy:
26262626
type: RollingUpdate
26272627
rollingUpdate:
@@ -2772,7 +2772,7 @@ metadata:
27722772
name: operator-controller-controller-manager
27732773
namespace: olmv1-system
27742774
spec:
2775-
replicas: 1
2775+
replicas: 2
27762776
strategy:
27772777
type: RollingUpdate
27782778
rollingUpdate:

manifests/experimental.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2541,7 +2541,7 @@ metadata:
25412541
namespace: olmv1-system
25422542
spec:
25432543
minReadySeconds: 5
2544-
replicas: 1
2544+
replicas: 2
25452545
strategy:
25462546
type: RollingUpdate
25472547
rollingUpdate:
@@ -2679,7 +2679,7 @@ metadata:
26792679
name: operator-controller-controller-manager
26802680
namespace: olmv1-system
26812681
spec:
2682-
replicas: 1
2682+
replicas: 2
26832683
strategy:
26842684
type: RollingUpdate
26852685
rollingUpdate:

test/e2e/features/ha.feature

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
Feature: HA failover for catalogd
2+
3+
When catalogd is deployed with multiple replicas, the remaining pods must
4+
elect a new leader and resume serving catalogs if the leader pod is lost.
5+
6+
Background:
7+
Given OLM is available
8+
And an image registry is available
9+
10+
@CatalogdHA
11+
Scenario: Catalogd resumes serving catalogs after leader pod failure
12+
Given a catalog "test" with packages:
13+
| package | version | channel | replaces | contents |
14+
| test | 1.0.0 | stable | | CRD, Deployment, ConfigMap |
15+
And catalogd is ready to reconcile resources
16+
And catalog "test" is reconciled
17+
When the catalogd leader pod is force-deleted
18+
Then a new catalogd leader is elected
19+
And catalog "test" reports Serving as True with Reason Available

test/e2e/steps/ha_steps.go

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
package steps
2+
3+
import (
4+
"context"
5+
"fmt"
6+
"strings"
7+
8+
"k8s.io/component-base/featuregate"
9+
)
10+
11+
// catalogdHAFeature gates scenarios that require a multi-node cluster.
12+
// It is set to true in BeforeSuite when the cluster has at least 2 nodes,
13+
// which is the case for the experimental e2e suite (kind-config-2node.yaml)
14+
// but not the standard suite.
15+
const catalogdHAFeature featuregate.Feature = "CatalogdHA"
16+
17+
// CatalogdLeaderPodIsForceDeleted force-deletes the catalogd leader pod to simulate leader loss.
18+
// The pod is identified from sc.leaderPods["catalogd"] (populated by a prior
19+
// "catalogd is ready to reconcile resources" step). Force-deletion is equivalent to
20+
// an abrupt process crash: the lease is no longer renewed and the surviving pod
21+
// acquires leadership after the lease expires.
22+
//
23+
// Note: stopping the kind node container is not used here because both nodes in the
24+
// experimental 2-node cluster are control-plane nodes that run etcd — stopping either
25+
// would break etcd quorum and make the API server unreachable for the rest of the test.
26+
func CatalogdLeaderPodIsForceDeleted(ctx context.Context) error {
27+
sc := scenarioCtx(ctx)
28+
leaderPod := sc.leaderPods["catalogd"]
29+
if leaderPod == "" {
30+
return fmt.Errorf("catalogd leader pod not found in scenario context; run 'catalogd is ready to reconcile resources' first")
31+
}
32+
33+
logger.Info("Force-deleting catalogd leader pod", "pod", leaderPod)
34+
if _, err := k8sClient("delete", "pod", leaderPod, "-n", olmNamespace,
35+
"--force", "--grace-period=0"); err != nil {
36+
return fmt.Errorf("failed to force-delete catalogd leader pod %q: %w", leaderPod, err)
37+
}
38+
return nil
39+
}
40+
41+
// NewCatalogdLeaderIsElected polls the catalogd leader election lease until the holder
42+
// identity changes to a pod other than the deleted leader. It updates
43+
// sc.leaderPods["catalogd"] with the new leader pod name.
44+
func NewCatalogdLeaderIsElected(ctx context.Context) error {
45+
sc := scenarioCtx(ctx)
46+
oldLeader := sc.leaderPods["catalogd"]
47+
48+
waitFor(ctx, func() bool {
49+
holder, err := k8sClient("get", "lease", leaseNames["catalogd"], "-n", olmNamespace,
50+
"-o", "jsonpath={.spec.holderIdentity}")
51+
if err != nil || holder == "" {
52+
return false
53+
}
54+
newPod := strings.Split(strings.TrimSpace(holder), "_")[0]
55+
if newPod == oldLeader {
56+
return false
57+
}
58+
sc.leaderPods["catalogd"] = newPod
59+
logger.Info("New catalogd leader elected", "pod", newPod)
60+
return true
61+
})
62+
return nil
63+
}

test/e2e/steps/hooks.go

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ import (
88
"os/exec"
99
"regexp"
1010
"strconv"
11+
"strings"
1112
"sync"
1213

1314
"github.com/cucumber/godog"
@@ -90,6 +91,7 @@ var (
9091
features.HelmChartSupport: false,
9192
features.BoxcutterRuntime: false,
9293
features.DeploymentConfig: false,
94+
catalogdHAFeature: false,
9395
}
9496
logger logr.Logger
9597
)
@@ -131,6 +133,14 @@ func BeforeSuite() {
131133
logger = textlogger.NewLogger(textlogger.NewConfig())
132134
}
133135

136+
// Enable HA scenarios when the cluster has at least 2 nodes. This runs
137+
// unconditionally so that upgrade scenarios (which install OLM in a Background
138+
// step and return early below) still get the gate set correctly.
139+
if out, err := k8sClient("get", "nodes", "--no-headers", "-o", "name"); err == nil &&
140+
len(strings.Fields(strings.TrimSpace(out))) >= 2 {
141+
featureGates[catalogdHAFeature] = true
142+
}
143+
134144
olm, err := detectOLMDeployment()
135145
if err != nil {
136146
logger.Info("OLM deployments not found; skipping feature gate detection (upgrade scenarios will install OLM in Background)")
@@ -152,6 +162,7 @@ func BeforeSuite() {
152162
}
153163
}
154164
}
165+
155166
logger.Info(fmt.Sprintf("Enabled feature gates: %v", featureGates))
156167
}
157168

test/e2e/steps/steps.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -194,6 +194,9 @@ func RegisterSteps(sc *godog.ScenarioContext) {
194194
sc.Step(`^(?i)the "([^"]+)" component is configured with HTTPS_PROXY "([^"]+)"$`, ConfigureDeploymentWithHTTPSProxy)
195195
sc.Step(`^(?i)the "([^"]+)" component is configured with HTTPS_PROXY pointing to a recording proxy$`, StartRecordingProxyAndConfigureDeployment)
196196
sc.Step(`^(?i)the recording proxy received a CONNECT request for the catalogd service$`, RecordingProxyReceivedCONNECTForCatalogd)
197+
198+
sc.Step(`^(?i)the catalogd leader pod is force-deleted$`, CatalogdLeaderPodIsForceDeleted)
199+
sc.Step(`^(?i)a new catalogd leader is elected$`, NewCatalogdLeaderIsElected)
197200
}
198201

199202
func init() {

0 commit comments

Comments (0)