
Commit 91438bf

Add test verifying that OLMv1 does not revert user-initiated changes to deployed resources
Generated-by: Cursor/Claude
1 parent 10c2841 commit 91438bf

3 files changed, +350 −0 lines changed

internal/operator-controller/applier/boxcutter.go

Lines changed: 22 additions & 0 deletions
@@ -155,6 +155,11 @@ func (r *SimpleRevisionGenerator) GenerateRevision(
 
 // sanitizedUnstructured takes an unstructured obj, removes status if present, and returns a sanitized copy containing only the allowed metadata entries set below.
 // If any unallowed entries are removed, a warning will be logged.
+//
+// For Deployment objects, this function also removes spec.template.metadata.annotations to prevent
+// Server-Side Apply from taking ownership of user-managed annotations (e.g., kubectl.kubernetes.io/restartedAt).
+// This fixes the issue where kubectl rollout restart would be reverted by OLM reconciliation.
+// See: https://github.com/operator-framework/operator-lifecycle-manager/issues/3392
 func sanitizedUnstructured(ctx context.Context, unstr *unstructured.Unstructured) {
     l := log.FromContext(ctx)
     obj := unstr.Object
@@ -193,6 +198,23 @@ func sanitizedUnstructured(ctx context.Context, unstr *unstructured.Unstructured
         l.Info("warning: extraneous values removed from manifest metadata", "allowed metadata", allowedMetadata)
     }
     obj["metadata"] = metadataSanitized
+
+    // For Deployment objects, remove spec.template.metadata.annotations to avoid SSA ownership conflicts
+    // This allows users to add annotations (like kubectl rollout restart) without OLM reverting them
+    if unstr.GetKind() == "Deployment" && unstr.GroupVersionKind().Group == "apps" {
+        if spec, ok := obj["spec"].(map[string]any); ok {
+            if template, ok := spec["template"].(map[string]any); ok {
+                if templateMeta, ok := template["metadata"].(map[string]any); ok {
+                    // Keep labels but remove annotations from pod template
+                    if _, hasAnnotations := templateMeta["annotations"]; hasAnnotations {
+                        delete(templateMeta, "annotations")
+                        l.V(1).Info("removed pod template annotations from Deployment to preserve user-managed fields",
+                            "deployment", unstr.GetName())
+                    }
+                }
+            }
+        }
+    }
 }
 
 func (r *SimpleRevisionGenerator) buildClusterExtensionRevision(
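Reviewer note: the pruning added above only touches spec.template.metadata.annotations on apps/v1 Deployments and leaves pod-template labels intact. Below is a minimal, standalone Go sketch of that same nested-map deletion applied to a sample manifest, shown purely for illustration; the sample Deployment and its values are assumptions for this sketch, not code or data from this commit.

package main

import (
    "encoding/json"
    "fmt"
)

func main() {
    // A sample Deployment, reduced to the fields relevant here (hypothetical data).
    manifest := map[string]any{
        "apiVersion": "apps/v1",
        "kind":       "Deployment",
        "metadata":   map[string]any{"name": "test-operator"},
        "spec": map[string]any{
            "template": map[string]any{
                "metadata": map[string]any{
                    "labels":      map[string]any{"app": "test-operator"},
                    "annotations": map[string]any{"kubectl.kubernetes.io/restartedAt": "2024-01-01T00:00:00Z"},
                },
            },
        },
    }

    // Same shape as the new code in sanitizedUnstructured: walk spec.template.metadata
    // and drop the annotations map so Server-Side Apply never claims that field.
    if spec, ok := manifest["spec"].(map[string]any); ok {
        if template, ok := spec["template"].(map[string]any); ok {
            if templateMeta, ok := template["metadata"].(map[string]any); ok {
                delete(templateMeta, "annotations")
            }
        }
    }

    out, _ := json.MarshalIndent(manifest, "", "  ")
    fmt.Println(string(out)) // labels remain; pod-template annotations are gone
}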
Lines changed: 82 additions & 0 deletions
@@ -0,0 +1,82 @@
+Feature: Rollout Restart User Changes
+
+  # This test verifies that OLMv1 does not revert user-initiated changes to deployed resources,
+  # specifically testing the scenario where a user runs `kubectl rollout restart deployment`.
+  #
+  # Background:
+  # - In OLMv0, running `kubectl rollout restart deployment` would add a restart annotation
+  #   to the deployment, but OLM would revert this change because it actively reconciles
+  #   the deployment based on CSV contents.
+  # - In OLMv1, we use Server-Side Apply which should only manage the fields that the controller
+  #   explicitly owns, allowing user-initiated changes to other fields to persist.
+  #
+  # This test ensures that OLMv1 handles this correctly and does not exhibit the OLMv0 behavior.
+  # See: https://github.com/operator-framework/operator-lifecycle-manager/issues/3392
+
+  Background:
+    Given OLM is available
+    And ClusterCatalog "test" serves bundles
+    And ServiceAccount "olm-sa" with needed permissions is available in ${TEST_NAMESPACE}
+
+  # HISTORICAL CONTEXT:
+  # This test validates the fix for upstream issue #3392, which affected both OLMv0 and early OLMv1.
+  #
+  # PROBLEM (now fixed):
+  # When users ran `kubectl rollout restart deployment`, Boxcutter would revert the changes:
+  #   1. kubectl adds restart annotation -> deployment controller creates new RS
+  #   2. Boxcutter reconciles -> removes restart annotation (claimed SSA ownership)
+  #   3. Deployment controller sees no restart needed -> scales down new RS
+  #   4. Old RS takes over, rollout is reverted
+  #
+  # THE FIX:
+  # Modified sanitizedUnstructured() in applier/boxcutter.go to remove spec.template.metadata.annotations
+  # from Deployment objects before storing them in ClusterExtensionRevision. This prevents Server-Side
+  # Apply from claiming ownership of pod template annotations, allowing user-managed annotations
+  # (like kubectl.kubernetes.io/restartedAt) to persist across OLM reconciliation cycles.
+  #
+  # UPSTREAM ISSUE: https://github.com/operator-framework/operator-lifecycle-manager/issues/3392
+  @BoxcutterRuntime
+  Scenario: User-initiated deployment changes persist after OLM reconciliation
+    When ClusterExtension is applied
+      """
+      apiVersion: olm.operatorframework.io/v1
+      kind: ClusterExtension
+      metadata:
+        name: ${NAME}
+      spec:
+        namespace: ${TEST_NAMESPACE}
+        serviceAccount:
+          name: olm-sa
+        source:
+          sourceType: Catalog
+          catalog:
+            packageName: test
+            selector:
+              matchLabels:
+                "olm.operatorframework.io/metadata.name": test-catalog
+      """
+    Then ClusterExtension is rolled out
+    And ClusterExtension is available
+    And resource "deployment/test-operator" is installed
+    And deployment "test-operator" is ready
+
+    # Simulate user running "kubectl rollout restart deployment/test-operator"
+    # This adds a restart annotation to trigger a rolling restart
+    # In OLMv0, the controller would revert this annotation causing the rollout to fail
+    # In OLMv1 with SSA, the annotation should persist because kubectl owns this field
+    When user performs rollout restart on "deployment/test-operator"
+
+    # Wait for the rollout to complete - new ReplicaSet created, pods rolled out
+    Then deployment "test-operator" rollout completes successfully
+    And resource "deployment/test-operator" has restart annotation
+
+    # Wait for OLM to reconcile (controller requeues every 10s)
+    # This is the critical test: does OLM revert the user's changes?
+    And I wait for "30" seconds
+
+    # After reconciliation, verify the rollout is STILL successful
+    # In OLMv0, this would fail because OLM reverts the annotation
+    # causing the new RS to scale down and old RS to take over
+    Then deployment "test-operator" rollout is still successful
+    And resource "deployment/test-operator" has restart annotation
+    And deployment "test-operator" has expected number of ready replicas
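For context on the user action this scenario exercises: `kubectl rollout restart` stamps a kubectl.kubernetes.io/restartedAt timestamp into the Deployment's pod template annotations, which is exactly the field the applier change stops claiming via Server-Side Apply. The sketch below approximates that patch with client-go; the kubeconfig lookup, namespace, deployment name, and field-manager string are assumptions for illustration, not part of this commit.

package main

import (
    "context"
    "fmt"
    "os"
    "time"

    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    "k8s.io/apimachinery/pkg/types"
    "k8s.io/client-go/kubernetes"
    "k8s.io/client-go/tools/clientcmd"
)

func main() {
    // Build a client from the local kubeconfig (reading KUBECONFIG is an assumption for this sketch).
    cfg, err := clientcmd.BuildConfigFromFlags("", os.Getenv("KUBECONFIG"))
    if err != nil {
        panic(err)
    }
    client := kubernetes.NewForConfigOrDie(cfg)

    // `kubectl rollout restart` effectively sends a patch like this one: it writes a
    // restartedAt timestamp into the pod template, which makes the Deployment
    // controller roll out a new ReplicaSet.
    patch := fmt.Sprintf(
        `{"spec":{"template":{"metadata":{"annotations":{"kubectl.kubernetes.io/restartedAt":%q}}}}}`,
        time.Now().Format(time.RFC3339),
    )

    // Namespace and name mirror the scenario above; adjust for a real cluster.
    _, err = client.AppsV1().Deployments("test-namespace").Patch(
        context.Background(),
        "test-operator",
        types.StrategicMergePatchType,
        []byte(patch),
        metav1.PatchOptions{FieldManager: "kubectl-rollout"}, // field manager chosen for this sketch
    )
    if err != nil {
        panic(err)
    }
    fmt.Println("restart annotation applied; with the applier fix, OLM no longer reverts it")
}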

test/e2e/steps/steps.go

Lines changed: 246 additions & 0 deletions
@@ -14,6 +14,7 @@ import (
     "os/exec"
     "path/filepath"
     "reflect"
+    "strconv"
     "strings"
     "time"
 
@@ -87,6 +88,13 @@ func RegisterSteps(sc *godog.ScenarioContext) {
     sc.Step(`^(?i)resource apply fails with error msg containing "([^"]+)"$`, ResourceApplyFails)
     sc.Step(`^(?i)resource "([^"]+)" is eventually restored$`, ResourceRestored)
     sc.Step(`^(?i)resource "([^"]+)" matches$`, ResourceMatches)
+    sc.Step(`^(?i)user performs rollout restart on "([^"]+)"$`, UserPerformsRolloutRestart)
+    sc.Step(`^(?i)resource "([^"]+)" has restart annotation$`, ResourceHasRestartAnnotation)
+    sc.Step(`^(?i)deployment "([^"]+)" is ready$`, DeploymentIsReady)
+    sc.Step(`^(?i)deployment "([^"]+)" rollout completes successfully$`, DeploymentRolloutCompletesSuccessfully)
+    sc.Step(`^(?i)I wait for "([^"]+)" seconds$`, WaitForSeconds)
+    sc.Step(`^(?i)deployment "([^"]+)" rollout is still successful$`, DeploymentRolloutIsStillSuccessful)
+    sc.Step(`^(?i)deployment "([^"]+)" has expected number of ready replicas$`, DeploymentHasExpectedReadyReplicas)
 
     sc.Step(`^(?i)ServiceAccount "([^"]*)" with needed permissions is available in test namespace$`, ServiceAccountWithNeededPermissionsIsAvailableInNamespace)
     sc.Step(`^(?i)ServiceAccount "([^"]*)" with needed permissions is available in \${TEST_NAMESPACE}$`, ServiceAccountWithNeededPermissionsIsAvailableInNamespace)
@@ -1168,3 +1176,241 @@ func latestActiveRevisionForExtension(extName string) (*ocv1.ClusterExtensionRev
 
     return latest, nil
 }
+
+// UserPerformsRolloutRestart simulates a user running `kubectl rollout restart deployment/<name>`.
+// This adds a restart annotation to the deployment's pod template to trigger a rolling restart.
+// In OLMv0, this annotation would be reverted by the controller. In OLMv1 with Server-Side Apply,
+// it should persist because the user (kubectl) manages this field, not the controller.
+// See: https://github.com/operator-framework/operator-lifecycle-manager/issues/3392
+func UserPerformsRolloutRestart(ctx context.Context, resourceName string) error {
+    sc := scenarioCtx(ctx)
+    resourceName = substituteScenarioVars(resourceName, sc)
+
+    kind, deploymentName, ok := strings.Cut(resourceName, "/")
+    if !ok {
+        return fmt.Errorf("invalid resource name format: %s (expected kind/name)", resourceName)
+    }
+
+    if kind != "deployment" {
+        return fmt.Errorf("only deployment resources are supported for restart annotation, got: %s", kind)
+    }
+
+    // Use kubectl rollout restart to add the restart annotation
+    // This is the actual command users would run, ensuring we test real-world behavior
+    out, err := k8sClient("rollout", "restart", resourceName, "-n", sc.namespace)
+    if err != nil {
+        return fmt.Errorf("failed to rollout restart %s: %w", resourceName, err)
+    }
+
+    logger.V(1).Info("Rollout restart initiated", "deployment", deploymentName, "output", out)
+
+    return nil
+}
+
+// ResourceHasRestartAnnotation verifies that a deployment has a restart annotation.
+// This confirms that user-initiated changes persist after OLM reconciliation.
+func ResourceHasRestartAnnotation(ctx context.Context, resourceName string) error {
+    sc := scenarioCtx(ctx)
+    resourceName = substituteScenarioVars(resourceName, sc)
+
+    kind, deploymentName, ok := strings.Cut(resourceName, "/")
+    if !ok {
+        return fmt.Errorf("invalid resource name format: %s (expected kind/name)", resourceName)
+    }
+
+    if kind != "deployment" {
+        return fmt.Errorf("only deployment resources are supported for restart annotation check, got: %s", kind)
+    }
+
+    // Check for the restart annotation added by kubectl rollout restart
+    restartAnnotationKey := "kubectl.kubernetes.io/restartedAt"
+
+    // Poll for the restart annotation from the deployment's pod template
+    // Use waitFor for eventual consistency rather than immediate check
+    var annotationValue string
+    waitFor(ctx, func() bool {
+        out, err := k8sClient("get", "deployment", deploymentName, "-n", sc.namespace,
+            "-o", fmt.Sprintf("jsonpath={.spec.template.metadata.annotations['%s']}", restartAnnotationKey))
+        if err != nil {
+            return false
+        }
+        // If the annotation exists and has a value, it persisted
+        if out == "" {
+            return false
+        }
+        annotationValue = out
+        return true
+    })
+
+    logger.V(1).Info("Restart annotation found", "deployment", deploymentName, "restartedAt", annotationValue)
+    return nil
+}
+
+// DeploymentIsReady verifies that a deployment is ready with all replicas available.
+func DeploymentIsReady(ctx context.Context, deploymentName string) error {
+    sc := scenarioCtx(ctx)
+    deploymentName = substituteScenarioVars(deploymentName, sc)
+
+    waitFor(ctx, func() bool {
+        // Check if deployment has ready replicas
+        out, err := k8sClient("get", "deployment", deploymentName, "-n", sc.namespace,
+            "-o", "jsonpath={.status.conditions[?(@.type=='Available')].status}")
+        if err != nil {
+            return false
+        }
+        return out == "True"
+    })
+
+    logger.V(1).Info("Deployment is ready", "deployment", deploymentName)
+    return nil
+}
+
+// DeploymentRolloutCompletesSuccessfully waits for the deployment rollout to complete.
+// This verifies that a new ReplicaSet was created and pods are running.
+func DeploymentRolloutCompletesSuccessfully(ctx context.Context, deploymentName string) error {
+    sc := scenarioCtx(ctx)
+    deploymentName = substituteScenarioVars(deploymentName, sc)
+
+    // Use kubectl rollout status to wait for completion
+    // This ensures the new ReplicaSet is created and scaled up
+    out, err := k8sClient("rollout", "status", "deployment/"+deploymentName, "-n", sc.namespace, "--timeout=5m")
+    if err != nil {
+        return fmt.Errorf("deployment rollout failed: %w, output: %s", err, out)
+    }
+
+    logger.V(1).Info("Deployment rollout completed", "deployment", deploymentName, "status", out)
+
+    // Verify deployment conditions
+    available, err := k8sClient("get", "deployment", deploymentName, "-n", sc.namespace,
+        "-o", "jsonpath={.status.conditions[?(@.type=='Available')].status}")
+    if err != nil {
+        return fmt.Errorf("failed to check deployment availability: %w", err)
+    }
+    if available != "True" {
+        return fmt.Errorf("deployment %s is not available", deploymentName)
+    }
+
+    progressing, err := k8sClient("get", "deployment", deploymentName, "-n", sc.namespace,
+        "-o", "jsonpath={.status.conditions[?(@.type=='Progressing')].status}")
+    if err != nil {
+        return fmt.Errorf("failed to check deployment progressing: %w", err)
+    }
+    if progressing != "True" {
+        return fmt.Errorf("deployment %s is not progressing correctly", deploymentName)
+    }
+
+    return nil
+}
+
+// WaitForSeconds waits for the specified number of seconds.
+// This is used to allow time for OLM reconciliation between steps.
+//
+// Note: This uses a deliberate time-based wait rather than polling because we need to ensure
+// that OLM has had time to reconcile (controller requeues every 10s). The test validates that
+// user changes persist AFTER reconciliation has had a chance to occur. A polling-based approach
+// would not guarantee that reconciliation actually happened.
+func WaitForSeconds(ctx context.Context, seconds string) error {
+    sec, err := strconv.Atoi(seconds)
+    if err != nil {
+        return fmt.Errorf("invalid seconds value %s: %w", seconds, err)
+    }
+
+    logger.V(1).Info("Waiting for reconciliation", "seconds", sec)
+
+    // Use select with context to make the wait cancellable
+    dur := time.Duration(sec) * time.Second
+    select {
+    case <-time.After(dur):
+        logger.V(1).Info("Wait complete")
+        return nil
+    case <-ctx.Done():
+        return fmt.Errorf("wait for reconciliation canceled: %w", ctx.Err())
+    }
+}
+
+// verifyDeploymentReplicaStatus is a helper function that checks if a deployment has the expected
+// number of ready replicas matching the desired replicas.
+func verifyDeploymentReplicaStatus(deploymentName, namespace string) (readyReplicas, replicas string, err error) {
+    readyReplicas, err = k8sClient("get", "deployment", deploymentName, "-n", namespace,
+        "-o", "jsonpath={.status.readyReplicas}")
+    if err != nil {
+        return "", "", fmt.Errorf("failed to get ready replicas: %w", err)
+    }
+
+    replicas, err = k8sClient("get", "deployment", deploymentName, "-n", namespace,
+        "-o", "jsonpath={.spec.replicas}")
+    if err != nil {
+        return "", "", fmt.Errorf("failed to get desired replicas: %w", err)
+    }
+
+    if readyReplicas != replicas {
+        return readyReplicas, replicas, fmt.Errorf("deployment %s has %s ready replicas but expected %s",
+            deploymentName, readyReplicas, replicas)
+    }
+
+    return readyReplicas, replicas, nil
+}
+
+// DeploymentRolloutIsStillSuccessful verifies that the deployment rollout remains successful.
+// This checks that OLM reconciliation hasn't reverted the user's rollout restart.
+// Specifically, it verifies that the new ReplicaSet is still active with running pods.
+func DeploymentRolloutIsStillSuccessful(ctx context.Context, deploymentName string) error {
+    sc := scenarioCtx(ctx)
+    deploymentName = substituteScenarioVars(deploymentName, sc)
+
+    // Check deployment status conditions
+    available, err := k8sClient("get", "deployment", deploymentName, "-n", sc.namespace,
+        "-o", "jsonpath={.status.conditions[?(@.type=='Available')].status}")
+    if err != nil {
+        return fmt.Errorf("failed to check deployment availability: %w", err)
+    }
+    if available != "True" {
+        return fmt.Errorf("deployment %s is no longer available - rollout was reverted", deploymentName)
+    }
+
+    // Verify the deployment is still progressing correctly
+    progressing, err := k8sClient("get", "deployment", deploymentName, "-n", sc.namespace,
+        "-o", "jsonpath={.status.conditions[?(@.type=='Progressing')].status}")
+    if err != nil {
+        return fmt.Errorf("failed to check deployment progressing: %w", err)
+    }
+    if progressing != "True" {
+        return fmt.Errorf("deployment %s is no longer progressing - rollout may have been reverted", deploymentName)
+    }
+
+    // Verify ready replicas match desired replicas (rollout completed and wasn't scaled down)
+    readyReplicas, replicas, err := verifyDeploymentReplicaStatus(deploymentName, sc.namespace)
+    if err != nil {
+        return fmt.Errorf("%w - rollout may have been reverted", err)
+    }
+
+    logger.V(1).Info("Deployment rollout is still successful", "deployment", deploymentName,
+        "readyReplicas", readyReplicas, "desiredReplicas", replicas)
+
+    return nil
+}
+
+// DeploymentHasExpectedReadyReplicas verifies that the deployment has the expected number of ready replicas.
+// This ensures the rollout completed successfully and pods are running.
+func DeploymentHasExpectedReadyReplicas(ctx context.Context, deploymentName string) error {
+    sc := scenarioCtx(ctx)
+    deploymentName = substituteScenarioVars(deploymentName, sc)
+
+    // Verify ready replicas match desired replicas
+    readyReplicas, replicas, err := verifyDeploymentReplicaStatus(deploymentName, sc.namespace)
+    if err != nil {
+        return err
+    }
+
+    // Additionally check for unavailable replicas
+    unavailableReplicas, err := k8sClient("get", "deployment", deploymentName, "-n", sc.namespace,
+        "-o", "jsonpath={.status.unavailableReplicas}")
+    if err == nil && unavailableReplicas != "" && unavailableReplicas != "0" {
+        return fmt.Errorf("deployment %s has %s unavailable replicas", deploymentName, unavailableReplicas)
+    }
+
+    logger.V(1).Info("Deployment has expected ready replicas", "deployment", deploymentName,
+        "readyReplicas", readyReplicas, "desiredReplicas", replicas)
+
+    return nil
+}
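The new step definitions above are wired in through RegisterSteps and exercised by the suite's godog runner. As a rough sketch of how the @BoxcutterRuntime scenario would be driven (the repository's actual runner, package name, feature paths, and options may differ, so treat all of these as assumptions), a test in the same package as RegisterSteps could look like this:

package steps // assumed package name for test/e2e/steps

import (
    "testing"

    "github.com/cucumber/godog"
)

func TestRolloutRestartFeatures(t *testing.T) {
    suite := godog.TestSuite{
        // RegisterSteps is the function extended by this commit; it registers the new
        // rollout-restart steps alongside the existing ones.
        ScenarioInitializer: func(sc *godog.ScenarioContext) {
            RegisterSteps(sc)
        },
        Options: &godog.Options{
            Format:   "pretty",
            Paths:    []string{"../features"},   // assumed location of the .feature files
            Tags:     "@BoxcutterRuntime",        // run only the Boxcutter runtime scenarios
            TestingT: t,
        },
    }
    if suite.Run() != 0 {
        t.Fatal("rollout restart feature tests failed")
    }
}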
