
Commit 741afe1

Add test verifying that OLMv1 does not revert user-initiated changes to deployed resources
Generated-by: Cursor/Claude
1 parent 10c2841 commit 741afe1

2 files changed: 339 additions, 0 deletions
Lines changed: 101 additions & 0 deletions
@@ -0,0 +1,101 @@
Feature: Rollout Restart User Changes

  # This test verifies that OLMv1 does not revert user-initiated changes to deployed resources,
  # specifically testing the scenario where a user runs `kubectl rollout restart deployment`.
  #
  # Background:
  #   - In OLMv0, running `kubectl rollout restart deployment` would add a restart annotation
  #     to the deployment, but OLM would revert this change because it actively reconciles
  #     the deployment based on CSV contents.
  #   - In OLMv1, we use Server-Side Apply, which should only manage the fields that the controller
  #     explicitly owns, allowing user-initiated changes to other fields to persist.
  #
  # This test ensures that OLMv1 handles this correctly and does not exhibit the OLMv0 behavior.
  # See: https://github.com/operator-framework/operator-lifecycle-manager/issues/3392

  Background:
    Given OLM is available
    And ClusterCatalog "test" serves bundles
    And ServiceAccount "olm-sa" with needed permissions is available in ${TEST_NAMESPACE}

  # KNOWN BUG: Boxcutter currently reverts user-initiated deployment changes
  #
  # SYMPTOM (as described in upstream issue #3392):
  #   When a user runs `kubectl rollout restart deployment`, a new ReplicaSet is created
  #   but it gets immediately scaled down (1 -> 0) and the old ReplicaSet takes over.
  #   The deployment status shows:
  #     - OldReplicaSets: deployment-xyz (0/0 replicas)
  #     - NewReplicaSet:  deployment-abc (1/1 replicas) <- This gets scaled down
  #   Expected behavior: the new RS should stay up with running pods, and the old RS scaled to 0.
  #
  # ROOT CAUSE:
  #   When Boxcutter (pkg.package-operator.run/boxcutter v0.10.0) reconciles a
  #   ClusterExtensionRevision, it applies the manifest stored in the revision spec.
  #   This manifest snapshot was taken when the bundle was unpacked and does NOT
  #   include user-added annotations like the restart timestamp from `kubectl rollout restart`.
  #
  #   Because Boxcutter applies this full manifest (including the annotations field),
  #   it takes ownership of the entire field via Server-Side Apply, overwriting any
  #   annotations added by other field managers (like kubectl). This causes:
  #     1. kubectl adds restart annotation -> deployment controller creates new RS
  #     2. Boxcutter reconciles -> removes restart annotation
  #     3. Deployment controller sees no restart needed -> scales down new RS
  #     4. Old RS takes over, rollout is reverted
  #
  # EXPECTED BEHAVIOR (not yet implemented):
  #   The controller should only manage the fields it explicitly cares about, allowing
  #   user-managed fields (like restart annotations) to persist across reconciliation.
  #
  # This is the same issue that existed in OLMv0 (see the upstream issue below) and needs
  # to be fixed in the OLMv1 Boxcutter implementation.
  #
  # UPSTREAM ISSUE: https://github.com/operator-framework/operator-lifecycle-manager/issues/3392
  #
  # TODO: Fix the Boxcutter implementation to properly use Server-Side Apply field
  # ownership so user changes persist, then remove the @skip tag.
  @skip
  @BoxcutterRuntime
  Scenario: User-initiated deployment changes persist after OLM reconciliation
    When ClusterExtension is applied
      """
      apiVersion: olm.operatorframework.io/v1
      kind: ClusterExtension
      metadata:
        name: ${NAME}
      spec:
        namespace: ${TEST_NAMESPACE}
        serviceAccount:
          name: olm-sa
        source:
          sourceType: Catalog
          catalog:
            packageName: test
            selector:
              matchLabels:
                "olm.operatorframework.io/metadata.name": test-catalog
      """
    Then ClusterExtension is rolled out
    And ClusterExtension is available
    And resource "deployment/test-operator" is installed
    And deployment "test-operator" is ready

    # Simulate user running "kubectl rollout restart deployment/test-operator".
    # This adds a restart annotation to trigger a rolling restart.
    # In OLMv0, the controller would revert this annotation, causing the rollout to fail.
    # In OLMv1 with SSA, the annotation should persist because kubectl owns this field.
    When user performs rollout restart on "deployment/test-operator"

    # Wait for the rollout to complete - new ReplicaSet created, pods rolled out.
    Then deployment "test-operator" rollout completes successfully
    And resource "deployment/test-operator" has restart annotation

    # Wait for OLM to reconcile (controller requeues every 10s).
    # This is the critical test: does OLM revert the user's changes?
    And I wait for "30" seconds

    # After reconciliation, verify the rollout is STILL successful.
    # In OLMv0, this would fail because OLM reverts the annotation,
    # causing the new RS to scale down and the old RS to take over.
    Then deployment "test-operator" rollout is still successful
    And resource "deployment/test-operator" has restart annotation
    And deployment "test-operator" has expected number of ready replicas
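The ROOT CAUSE and EXPECTED BEHAVIOR notes above hinge on Server-Side Apply field ownership. The sketch below is not part of this commit; it only illustrates the expected pattern with client-go's typed apply configurations: the controller applies just the fields listed in its apply configuration under its own field manager, so a pod-template annotation owned by another manager (such as kubectl's restartedAt timestamp) is left alone on reconcile. The deployment name, namespace, image, and the "olm-controller" field manager are placeholders, not values taken from OLMv1.

// Minimal sketch of Server-Side Apply with an explicit, narrow field set.
package main

import (
	"context"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	appsv1ac "k8s.io/client-go/applyconfigurations/apps/v1"
	corev1ac "k8s.io/client-go/applyconfigurations/core/v1"
	metav1ac "k8s.io/client-go/applyconfigurations/meta/v1"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/tools/clientcmd"
)

func main() {
	cfg, err := clientcmd.BuildConfigFromFlags("", clientcmd.RecommendedHomeFile)
	if err != nil {
		panic(err)
	}
	cs := kubernetes.NewForConfigOrDie(cfg)

	// The apply configuration declares only the fields this manager owns; pod-template
	// annotations are deliberately absent, so SSA does not touch entries owned by kubectl.
	deploy := appsv1ac.Deployment("test-operator", "test-namespace").
		WithSpec(appsv1ac.DeploymentSpec().
			WithReplicas(1).
			WithSelector(metav1ac.LabelSelector().
				WithMatchLabels(map[string]string{"app": "test-operator"})).
			WithTemplate(corev1ac.PodTemplateSpec().
				WithLabels(map[string]string{"app": "test-operator"}).
				WithSpec(corev1ac.PodSpec().
					WithContainers(corev1ac.Container().
						WithName("manager").
						WithImage("example.com/test-operator:latest")))))

	// Apply under a dedicated field manager; fields not listed above keep their current owners.
	if _, err := cs.AppsV1().Deployments("test-namespace").Apply(
		context.Background(), deploy,
		metav1.ApplyOptions{FieldManager: "olm-controller", Force: true},
	); err != nil {
		panic(err)
	}
}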

test/e2e/steps/steps.go

Lines changed: 238 additions & 0 deletions
@@ -14,6 +14,7 @@ import (
	"os/exec"
	"path/filepath"
	"reflect"
	"strconv"
	"strings"
	"time"

@@ -87,6 +88,13 @@ func RegisterSteps(sc *godog.ScenarioContext) {
	sc.Step(`^(?i)resource apply fails with error msg containing "([^"]+)"$`, ResourceApplyFails)
	sc.Step(`^(?i)resource "([^"]+)" is eventually restored$`, ResourceRestored)
	sc.Step(`^(?i)resource "([^"]+)" matches$`, ResourceMatches)
	sc.Step(`^(?i)user performs rollout restart on "([^"]+)"$`, UserPerformsRolloutRestart)
	sc.Step(`^(?i)resource "([^"]+)" has restart annotation$`, ResourceHasRestartAnnotation)
	sc.Step(`^(?i)deployment "([^"]+)" is ready$`, DeploymentIsReady)
	sc.Step(`^(?i)deployment "([^"]+)" rollout completes successfully$`, DeploymentRolloutCompletesSuccessfully)
	sc.Step(`^(?i)I wait for "([^"]+)" seconds$`, WaitForSeconds)
	sc.Step(`^(?i)deployment "([^"]+)" rollout is still successful$`, DeploymentRolloutIsStillSuccessful)
	sc.Step(`^(?i)deployment "([^"]+)" has expected number of ready replicas$`, DeploymentHasExpectedReadyReplicas)

	sc.Step(`^(?i)ServiceAccount "([^"]*)" with needed permissions is available in test namespace$`, ServiceAccountWithNeededPermissionsIsAvailableInNamespace)
	sc.Step(`^(?i)ServiceAccount "([^"]*)" with needed permissions is available in \${TEST_NAMESPACE}$`, ServiceAccountWithNeededPermissionsIsAvailableInNamespace)
@@ -1168,3 +1176,233 @@ func latestActiveRevisionForExtension(extName string) (*ocv1.ClusterExtensionRev

	return latest, nil
}

// UserPerformsRolloutRestart simulates a user running `kubectl rollout restart deployment/<name>`.
// This adds a restart annotation to the deployment's pod template to trigger a rolling restart.
// In OLMv0, this annotation would be reverted by the controller. In OLMv1 with Server-Side Apply,
// it should persist because the user (kubectl) manages this field, not the controller.
// See: https://github.com/operator-framework/operator-lifecycle-manager/issues/3392
func UserPerformsRolloutRestart(ctx context.Context, resourceName string) error {
	sc := scenarioCtx(ctx)
	resourceName = substituteScenarioVars(resourceName, sc)

	kind, deploymentName, ok := strings.Cut(resourceName, "/")
	if !ok {
		return fmt.Errorf("invalid resource name format: %s (expected kind/name)", resourceName)
	}

	if kind != "deployment" {
		return fmt.Errorf("only deployment resources are supported for restart annotation, got: %s", kind)
	}

	// Capture current generation before restart
	beforeGen, err := k8sClient("get", "deployment", deploymentName, "-n", sc.namespace,
		"-o", "jsonpath={.metadata.generation}")
	if err != nil {
		return fmt.Errorf("failed to get deployment generation before restart: %w", err)
	}

	logger.V(1).Info("Deployment state before rollout restart", "deployment", deploymentName, "generation", beforeGen)

	// Use kubectl rollout restart to add the restart annotation.
	// This is the actual command users would run, ensuring we test real-world behavior.
	out, err := k8sClient("rollout", "restart", resourceName, "-n", sc.namespace)
	if err != nil {
		return fmt.Errorf("failed to rollout restart %s: %w", resourceName, err)
	}

	logger.V(1).Info("Rollout restart initiated", "deployment", deploymentName, "output", out)

	return nil
}
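
The step above shells out to kubectl; what `kubectl rollout restart` effectively does is stamp the pod template with a `kubectl.kubernetes.io/restartedAt` annotation, which makes the Deployment controller create a new ReplicaSet. The hedged sketch below shows an equivalent in-process patch with client-go; it mirrors rather than reproduces kubectl's implementation, and the "e2e-rollout-restart" field manager, clientset wiring, and names are invented for illustration.

// Illustrative sketch only, not part of this commit.
package sketch

import (
	"context"
	"fmt"
	"time"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/client-go/kubernetes"
)

// restartDeployment merge-patches only the restartedAt pod-template annotation; all other
// fields stay untouched, so Server-Side Apply ownership of the rest of the Deployment is unaffected.
func restartDeployment(ctx context.Context, cs kubernetes.Interface, namespace, name string) error {
	patch := fmt.Sprintf(
		`{"spec":{"template":{"metadata":{"annotations":{"kubectl.kubernetes.io/restartedAt":%q}}}}}`,
		time.Now().UTC().Format(time.RFC3339),
	)
	_, err := cs.AppsV1().Deployments(namespace).Patch(
		ctx, name, types.MergePatchType, []byte(patch),
		metav1.PatchOptions{FieldManager: "e2e-rollout-restart"},
	)
	return err
}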

// ResourceHasRestartAnnotation verifies that a deployment has a restart annotation.
// This confirms that user-initiated changes persist after OLM reconciliation.
func ResourceHasRestartAnnotation(ctx context.Context, resourceName string) error {
	sc := scenarioCtx(ctx)
	resourceName = substituteScenarioVars(resourceName, sc)

	kind, deploymentName, ok := strings.Cut(resourceName, "/")
	if !ok {
		return fmt.Errorf("invalid resource name format: %s (expected kind/name)", resourceName)
	}

	if kind != "deployment" {
		return fmt.Errorf("only deployment resources are supported for restart annotation check, got: %s", kind)
	}

	// Check for the restart annotation added by kubectl rollout restart.
	restartAnnotationKey := "kubectl.kubernetes.io/restartedAt"

	// Get the restart annotation from the deployment's pod template.
	out, err := k8sClient("get", "deployment", deploymentName, "-n", sc.namespace,
		"-o", fmt.Sprintf("jsonpath={.spec.template.metadata.annotations['%s']}", restartAnnotationKey))
	if err != nil {
		return fmt.Errorf("failed to get restart annotation: %w", err)
	}

	// If the annotation exists and has a value, it persisted.
	if out == "" {
		return fmt.Errorf("restart annotation not found on deployment %s", deploymentName)
	}

	logger.V(1).Info("Restart annotation found", "deployment", deploymentName, "restartedAt", out)
	return nil
}
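
The check above only asserts that the annotation value is present. A stricter assertion, in the spirit of the Server-Side Apply discussion in the feature file, would confirm which field manager owns the annotation by inspecting `metadata.managedFields`. The sketch below is an illustrative client-go helper with invented names, not part of the commit.

// Illustrative sketch only, not part of this commit.
package sketch

import (
	"context"
	"encoding/json"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
)

// restartAnnotationOwners returns the field managers that claim ownership of the
// kubectl.kubernetes.io/restartedAt pod-template annotation in metadata.managedFields.
// If only kubectl-style managers appear here (and not the OLM controller), the annotation is user-owned.
func restartAnnotationOwners(ctx context.Context, cs kubernetes.Interface, namespace, name string) ([]string, error) {
	dep, err := cs.AppsV1().Deployments(namespace).Get(ctx, name, metav1.GetOptions{})
	if err != nil {
		return nil, err
	}
	var owners []string
	for _, mf := range dep.ManagedFields {
		if mf.FieldsV1 == nil {
			continue
		}
		var fields map[string]any
		if err := json.Unmarshal(mf.FieldsV1.Raw, &fields); err != nil {
			return nil, err
		}
		// managedFields uses "f:" prefixes for field names; walk down to the pod-template annotations.
		ann := dig(fields, "f:spec", "f:template", "f:metadata", "f:annotations")
		if ann == nil {
			continue
		}
		if _, ok := ann["f:kubectl.kubernetes.io/restartedAt"]; ok {
			owners = append(owners, mf.Manager)
		}
	}
	return owners, nil
}

// dig walks nested map[string]any values by key, returning nil if any step is missing.
func dig(m map[string]any, keys ...string) map[string]any {
	cur := m
	for _, k := range keys {
		next, ok := cur[k].(map[string]any)
		if !ok {
			return nil
		}
		cur = next
	}
	return cur
}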

// DeploymentIsReady verifies that a deployment is ready with all replicas available.
func DeploymentIsReady(ctx context.Context, deploymentName string) error {
	sc := scenarioCtx(ctx)
	deploymentName = substituteScenarioVars(deploymentName, sc)

	waitFor(ctx, func() bool {
		// Check the deployment's Available condition (minimum replicas are ready).
		out, err := k8sClient("get", "deployment", deploymentName, "-n", sc.namespace,
			"-o", "jsonpath={.status.conditions[?(@.type=='Available')].status}")
		if err != nil {
			return false
		}
		return out == "True"
	})

	logger.V(1).Info("Deployment is ready", "deployment", deploymentName)
	return nil
}

// DeploymentRolloutCompletesSuccessfully waits for the deployment rollout to complete.
// This verifies that a new ReplicaSet was created and pods are running.
func DeploymentRolloutCompletesSuccessfully(ctx context.Context, deploymentName string) error {
	sc := scenarioCtx(ctx)
	deploymentName = substituteScenarioVars(deploymentName, sc)

	// Use kubectl rollout status to wait for completion.
	// This ensures the new ReplicaSet is created and scaled up.
	out, err := k8sClient("rollout", "status", "deployment/"+deploymentName, "-n", sc.namespace, "--timeout=5m")
	if err != nil {
		return fmt.Errorf("deployment rollout failed: %w, output: %s", err, out)
	}

	logger.V(1).Info("Deployment rollout completed", "deployment", deploymentName, "status", out)

	// Verify deployment conditions
	available, err := k8sClient("get", "deployment", deploymentName, "-n", sc.namespace,
		"-o", "jsonpath={.status.conditions[?(@.type=='Available')].status}")
	if err != nil {
		return fmt.Errorf("failed to check deployment availability: %w", err)
	}
	if available != "True" {
		return fmt.Errorf("deployment %s is not available", deploymentName)
	}

	progressing, err := k8sClient("get", "deployment", deploymentName, "-n", sc.namespace,
		"-o", "jsonpath={.status.conditions[?(@.type=='Progressing')].status}")
	if err != nil {
		return fmt.Errorf("failed to check deployment progressing: %w", err)
	}
	if progressing != "True" {
		return fmt.Errorf("deployment %s is not progressing correctly", deploymentName)
	}

	return nil
}

// WaitForSeconds waits for the specified number of seconds.
// This is used to allow time for OLM reconciliation between steps.
func WaitForSeconds(ctx context.Context, seconds string) error {
	sec, err := strconv.Atoi(seconds)
	if err != nil {
		return fmt.Errorf("invalid seconds value %s: %w", seconds, err)
	}

	logger.V(1).Info("Waiting for reconciliation", "seconds", sec)
	time.Sleep(time.Duration(sec) * time.Second)
	logger.V(1).Info("Wait complete")

	return nil
}
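
A design note on the fixed sleep above: because the interesting failure is a revert that can happen at any point during the reconcile window, an alternative is to keep re-checking the assertion for the whole window and fail as soon as it stops holding, rather than sleeping blindly and checking once afterwards. The sketch below uses only the standard library and assumes the caller supplies the check to repeat; the helper name and signature are invented for illustration.

// Illustrative sketch only, not part of this commit.
package sketch

import (
	"context"
	"fmt"
	"time"
)

// assertHoldsFor re-runs check at the given interval for the whole observation window and
// fails as soon as the check stops holding (for example, when the controller reverts the
// restart annotation).
func assertHoldsFor(ctx context.Context, window, interval time.Duration, check func(context.Context) error) error {
	deadline := time.Now().Add(window)
	ticker := time.NewTicker(interval)
	defer ticker.Stop()
	for time.Now().Before(deadline) {
		if err := check(ctx); err != nil {
			return fmt.Errorf("assertion stopped holding during the observation window: %w", err)
		}
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-ticker.C:
		}
	}
	return nil
}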

// DeploymentRolloutIsStillSuccessful verifies that the deployment rollout remains successful.
// This checks that OLM reconciliation hasn't reverted the user's rollout restart.
// Specifically, it verifies that the new ReplicaSet is still active with running pods.
func DeploymentRolloutIsStillSuccessful(ctx context.Context, deploymentName string) error {
	sc := scenarioCtx(ctx)
	deploymentName = substituteScenarioVars(deploymentName, sc)

	// Check deployment status conditions
	available, err := k8sClient("get", "deployment", deploymentName, "-n", sc.namespace,
		"-o", "jsonpath={.status.conditions[?(@.type=='Available')].status}")
	if err != nil {
		return fmt.Errorf("failed to check deployment availability: %w", err)
	}
	if available != "True" {
		return fmt.Errorf("deployment %s is no longer available - rollout was reverted", deploymentName)
	}

	// Verify the deployment is still progressing correctly
	progressing, err := k8sClient("get", "deployment", deploymentName, "-n", sc.namespace,
		"-o", "jsonpath={.status.conditions[?(@.type=='Progressing')].status}")
	if err != nil {
		return fmt.Errorf("failed to check deployment progressing: %w", err)
	}
	if progressing != "True" {
		return fmt.Errorf("deployment %s is no longer progressing - rollout may have been reverted", deploymentName)
	}

	// Verify ready replicas match desired replicas (rollout completed and wasn't scaled down)
	readyReplicas, err := k8sClient("get", "deployment", deploymentName, "-n", sc.namespace,
		"-o", "jsonpath={.status.readyReplicas}")
	if err != nil {
		return fmt.Errorf("failed to get ready replicas: %w", err)
	}

	replicas, err := k8sClient("get", "deployment", deploymentName, "-n", sc.namespace,
		"-o", "jsonpath={.spec.replicas}")
	if err != nil {
		return fmt.Errorf("failed to get desired replicas: %w", err)
	}

	if readyReplicas != replicas {
		return fmt.Errorf("deployment %s has %s ready replicas but expected %s - rollout may have been reverted",
			deploymentName, readyReplicas, replicas)
	}

	logger.V(1).Info("Deployment rollout is still successful", "deployment", deploymentName,
		"readyReplicas", readyReplicas, "desiredReplicas", replicas)

	return nil
}
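
The symptom described in issue #3392 shows up at the ReplicaSet level: the ReplicaSet created by the restart is scaled back down and an older one takes over. As a complement to the condition- and replica-count assertions above, the sketch below inspects the ReplicaSets owned by the deployment directly. It is an illustrative client-go helper with invented names and is not part of this commit.

// Illustrative sketch only, not part of this commit.
package sketch

import (
	"context"
	"fmt"
	"strconv"

	appsv1 "k8s.io/api/apps/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
)

// newestReplicaSetHoldsReplicas asserts that the ReplicaSet from the most recent rollout
// (highest deployment.kubernetes.io/revision) still has ready pods and that older
// ReplicaSets are scaled to zero - the opposite of the revert symptom in issue #3392.
func newestReplicaSetHoldsReplicas(ctx context.Context, cs kubernetes.Interface, namespace, name string) error {
	dep, err := cs.AppsV1().Deployments(namespace).Get(ctx, name, metav1.GetOptions{})
	if err != nil {
		return err
	}
	selector, err := metav1.LabelSelectorAsSelector(dep.Spec.Selector)
	if err != nil {
		return err
	}
	rsList, err := cs.AppsV1().ReplicaSets(namespace).List(ctx, metav1.ListOptions{LabelSelector: selector.String()})
	if err != nil {
		return err
	}

	// The deployment controller stamps each ReplicaSet with a revision annotation;
	// the highest revision belongs to the most recent rollout (e.g. the restart).
	var newest *appsv1.ReplicaSet
	newestRev := -1
	for i := range rsList.Items {
		rs := &rsList.Items[i]
		if !metav1.IsControlledBy(rs, dep) {
			continue // same labels, different owner
		}
		rev, err := strconv.Atoi(rs.Annotations["deployment.kubernetes.io/revision"])
		if err != nil {
			continue
		}
		if rev > newestRev {
			newestRev, newest = rev, rs
		}
	}
	if newest == nil {
		return fmt.Errorf("no ReplicaSets found for deployment %s", name)
	}
	if newest.Status.ReadyReplicas == 0 {
		return fmt.Errorf("newest ReplicaSet %s has no ready replicas - the restart appears to have been reverted", newest.Name)
	}
	for i := range rsList.Items {
		rs := &rsList.Items[i]
		if metav1.IsControlledBy(rs, dep) && rs.Name != newest.Name && rs.Status.Replicas != 0 {
			return fmt.Errorf("old ReplicaSet %s still has %d replicas", rs.Name, rs.Status.Replicas)
		}
	}
	return nil
}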

// DeploymentHasExpectedReadyReplicas verifies that the deployment has the expected number of ready replicas.
// This ensures the rollout completed successfully and pods are running.
func DeploymentHasExpectedReadyReplicas(ctx context.Context, deploymentName string) error {
	sc := scenarioCtx(ctx)
	deploymentName = substituteScenarioVars(deploymentName, sc)

	readyReplicas, err := k8sClient("get", "deployment", deploymentName, "-n", sc.namespace,
		"-o", "jsonpath={.status.readyReplicas}")
	if err != nil {
		return fmt.Errorf("failed to get ready replicas: %w", err)
	}

	replicas, err := k8sClient("get", "deployment", deploymentName, "-n", sc.namespace,
		"-o", "jsonpath={.spec.replicas}")
	if err != nil {
		return fmt.Errorf("failed to get desired replicas: %w", err)
	}

	if readyReplicas != replicas {
		return fmt.Errorf("deployment %s has %s ready replicas but expected %s",
			deploymentName, readyReplicas, replicas)
	}

	unavailableReplicas, err := k8sClient("get", "deployment", deploymentName, "-n", sc.namespace,
		"-o", "jsonpath={.status.unavailableReplicas}")
	if err == nil && unavailableReplicas != "" && unavailableReplicas != "0" {
		return fmt.Errorf("deployment %s has %s unavailable replicas", deploymentName, unavailableReplicas)
	}

	logger.V(1).Info("Deployment has expected ready replicas", "deployment", deploymentName,
		"readyReplicas", readyReplicas, "desiredReplicas", replicas)

	return nil
}
