
Commit 91438bf

Add test verifying that OLMv1 does not revert user-initiated changes to deployed resources
Generated-by: Cursor/Claude
1 parent 10c2841 commit 91438bf

3 files changed, +350 −0 lines changed

internal/operator-controller/applier/boxcutter.go

Lines changed: 22 additions & 0 deletions
@@ -155,6 +155,11 @@ func (r *SimpleRevisionGenerator) GenerateRevision(
 
 // sanitizedUnstructured takes an unstructured obj, removes status if present, and returns a sanitized copy containing only the allowed metadata entries set below.
 // If any unallowed entries are removed, a warning will be logged.
+//
+// For Deployment objects, this function also removes spec.template.metadata.annotations to prevent
+// Server-Side Apply from taking ownership of user-managed annotations (e.g., kubectl.kubernetes.io/restartedAt).
+// This fixes the issue where kubectl rollout restart would be reverted by OLM reconciliation.
+// See: https://github.com/operator-framework/operator-lifecycle-manager/issues/3392
 func sanitizedUnstructured(ctx context.Context, unstr *unstructured.Unstructured) {
     l := log.FromContext(ctx)
     obj := unstr.Object
@@ -193,6 +198,23 @@ func sanitizedUnstructured(ctx context.Context, unstr *unstructured.Unstructured
         l.Info("warning: extraneous values removed from manifest metadata", "allowed metadata", allowedMetadata)
     }
     obj["metadata"] = metadataSanitized
+
+    // For Deployment objects, remove spec.template.metadata.annotations to avoid SSA ownership conflicts
+    // This allows users to add annotations (like kubectl rollout restart) without OLM reverting them
+    if unstr.GetKind() == "Deployment" && unstr.GroupVersionKind().Group == "apps" {
+        if spec, ok := obj["spec"].(map[string]any); ok {
+            if template, ok := spec["template"].(map[string]any); ok {
+                if templateMeta, ok := template["metadata"].(map[string]any); ok {
+                    // Keep labels but remove annotations from pod template
+                    if _, hasAnnotations := templateMeta["annotations"]; hasAnnotations {
+                        delete(templateMeta, "annotations")
+                        l.V(1).Info("removed pod template annotations from Deployment to preserve user-managed fields",
+                            "deployment", unstr.GetName())
+                    }
+                }
+            }
+        }
+    }
 }
 
 func (r *SimpleRevisionGenerator) buildClusterExtensionRevision(
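Reviewer note: the pruning added above only touches spec.template.metadata.annotations on apps/v1 Deployments and leaves pod-template labels intact. Below is a minimal, standalone Go sketch of that same nested-map deletion applied to a sample manifest, shown purely for illustration; the sample Deployment and its values are assumptions for this sketch, not code or data from this commit.

package main

import (
    "encoding/json"
    "fmt"
)

func main() {
    // A sample Deployment, reduced to the fields relevant here (hypothetical data).
    manifest := map[string]any{
        "apiVersion": "apps/v1",
        "kind":       "Deployment",
        "metadata":   map[string]any{"name": "test-operator"},
        "spec": map[string]any{
            "template": map[string]any{
                "metadata": map[string]any{
                    "labels":      map[string]any{"app": "test-operator"},
                    "annotations": map[string]any{"kubectl.kubernetes.io/restartedAt": "2024-01-01T00:00:00Z"},
                },
            },
        },
    }

    // Same shape as the new code in sanitizedUnstructured: walk spec.template.metadata
    // and drop the annotations map so Server-Side Apply never claims that field.
    if spec, ok := manifest["spec"].(map[string]any); ok {
        if template, ok := spec["template"].(map[string]any); ok {
            if templateMeta, ok := template["metadata"].(map[string]any); ok {
                delete(templateMeta, "annotations")
            }
        }
    }

    out, _ := json.MarshalIndent(manifest, "", "  ")
    fmt.Println(string(out)) // labels remain; pod-template annotations are gone
}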
Lines changed: 82 additions & 0 deletions
@@ -0,0 +1,82 @@
+Feature: Rollout Restart User Changes
+
+  # This test verifies that OLMv1 does not revert user-initiated changes to deployed resources,
+  # specifically testing the scenario where a user runs `kubectl rollout restart deployment`.
+  #
+  # Background:
+  # - In OLMv0, running `kubectl rollout restart deployment` would add a restart annotation
+  #   to the deployment, but OLM would revert this change because it actively reconciles
+  #   the deployment based on CSV contents.
+  # - In OLMv1, we use Server-Side Apply which should only manage the fields that the controller
+  #   explicitly owns, allowing user-initiated changes to other fields to persist.
+  #
+  # This test ensures that OLMv1 handles this correctly and does not exhibit the OLMv0 behavior.
+  # See: https://github.com/operator-framework/operator-lifecycle-manager/issues/3392
+
+  Background:
+    Given OLM is available
+    And ClusterCatalog "test" serves bundles
+    And ServiceAccount "olm-sa" with needed permissions is available in ${TEST_NAMESPACE}
+
+  # HISTORICAL CONTEXT:
+  # This test validates the fix for upstream issue #3392, which affected both OLMv0 and early OLMv1.
+  #
+  # PROBLEM (now fixed):
+  # When users ran `kubectl rollout restart deployment`, Boxcutter would revert the changes:
+  #   1. kubectl adds restart annotation -> deployment controller creates new RS
+  #   2. Boxcutter reconciles -> removes restart annotation (claimed SSA ownership)
+  #   3. Deployment controller sees no restart needed -> scales down new RS
+  #   4. Old RS takes over, rollout is reverted
+  #
+  # THE FIX:
+  # Modified sanitizedUnstructured() in applier/boxcutter.go to remove spec.template.metadata.annotations
+  # from Deployment objects before storing them in ClusterExtensionRevision. This prevents Server-Side
+  # Apply from claiming ownership of pod template annotations, allowing user-managed annotations
+  # (like kubectl.kubernetes.io/restartedAt) to persist across OLM reconciliation cycles.
+  #
+  # UPSTREAM ISSUE: https://github.com/operator-framework/operator-lifecycle-manager/issues/3392
+  @BoxcutterRuntime
+  Scenario: User-initiated deployment changes persist after OLM reconciliation
+    When ClusterExtension is applied
+      """
+      apiVersion: olm.operatorframework.io/v1
+      kind: ClusterExtension
+      metadata:
+        name: ${NAME}
+      spec:
+        namespace: ${TEST_NAMESPACE}
+        serviceAccount:
+          name: olm-sa
+        source:
+          sourceType: Catalog
+          catalog:
+            packageName: test
+            selector:
+              matchLabels:
+                "olm.operatorframework.io/metadata.name": test-catalog
+      """
+    Then ClusterExtension is rolled out
+    And ClusterExtension is available
+    And resource "deployment/test-operator" is installed
+    And deployment "test-operator" is ready
+
+    # Simulate user running "kubectl rollout restart deployment/test-operator"
+    # This adds a restart annotation to trigger a rolling restart
+    # In OLMv0, the controller would revert this annotation causing the rollout to fail
+    # In OLMv1 with SSA, the annotation should persist because kubectl owns this field
+    When user performs rollout restart on "deployment/test-operator"
+
+    # Wait for the rollout to complete - new ReplicaSet created, pods rolled out
+    Then deployment "test-operator" rollout completes successfully
+    And resource "deployment/test-operator" has restart annotation
+
+    # Wait for OLM to reconcile (controller requeues every 10s)
+    # This is the critical test: does OLM revert the user's changes?
+    And I wait for "30" seconds
+
+    # After reconciliation, verify the rollout is STILL successful
+    # In OLMv0, this would fail because OLM reverts the annotation
+    # causing the new RS to scale down and old RS to take over
+    Then deployment "test-operator" rollout is still successful
+    And resource "deployment/test-operator" has restart annotation
+    And deployment "test-operator" has expected number of ready replicas
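For context on the user action this scenario exercises: `kubectl rollout restart` stamps a kubectl.kubernetes.io/restartedAt timestamp into the Deployment's pod template annotations, which is exactly the field the applier change stops claiming via Server-Side Apply. The sketch below approximates that patch with client-go; the kubeconfig lookup, namespace, deployment name, and field-manager string are assumptions for illustration, not part of this commit.

package main

import (
    "context"
    "fmt"
    "os"
    "time"

    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    "k8s.io/apimachinery/pkg/types"
    "k8s.io/client-go/kubernetes"
    "k8s.io/client-go/tools/clientcmd"
)

func main() {
    // Build a client from the local kubeconfig (reading KUBECONFIG is an assumption for this sketch).
    cfg, err := clientcmd.BuildConfigFromFlags("", os.Getenv("KUBECONFIG"))
    if err != nil {
        panic(err)
    }
    client := kubernetes.NewForConfigOrDie(cfg)

    // `kubectl rollout restart` effectively sends a patch like this one: it writes a
    // restartedAt timestamp into the pod template, which makes the Deployment
    // controller roll out a new ReplicaSet.
    patch := fmt.Sprintf(
        `{"spec":{"template":{"metadata":{"annotations":{"kubectl.kubernetes.io/restartedAt":%q}}}}}`,
        time.Now().Format(time.RFC3339),
    )

    // Namespace and name mirror the scenario above; adjust for a real cluster.
    _, err = client.AppsV1().Deployments("test-namespace").Patch(
        context.Background(),
        "test-operator",
        types.StrategicMergePatchType,
        []byte(patch),
        metav1.PatchOptions{FieldManager: "kubectl-rollout"}, // field manager chosen for this sketch
    )
    if err != nil {
        panic(err)
    }
    fmt.Println("restart annotation applied; with the applier fix, OLM no longer reverts it")
}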

test/e2e/steps/steps.go

Lines changed: 246 additions & 0 deletions
@@ -14,6 +14,7 @@ import (
     "os/exec"
     "path/filepath"
     "reflect"
+    "strconv"
     "strings"
     "time"
 
@@ -87,6 +88,13 @@ func RegisterSteps(sc *godog.ScenarioContext) {
     sc.Step(`^(?i)resource apply fails with error msg containing "([^"]+)"$`, ResourceApplyFails)
     sc.Step(`^(?i)resource "([^"]+)" is eventually restored$`, ResourceRestored)
     sc.Step(`^(?i)resource "([^"]+)" matches$`, ResourceMatches)
+    sc.Step(`^(?i)user performs rollout restart on "([^"]+)"$`, UserPerformsRolloutRestart)
+    sc.Step(`^(?i)resource "([^"]+)" has restart annotation$`, ResourceHasRestartAnnotation)
+    sc.Step(`^(?i)deployment "([^"]+)" is ready$`, DeploymentIsReady)
+    sc.Step(`^(?i)deployment "([^"]+)" rollout completes successfully$`, DeploymentRolloutCompletesSuccessfully)
+    sc.Step(`^(?i)I wait for "([^"]+)" seconds$`, WaitForSeconds)
+    sc.Step(`^(?i)deployment "([^"]+)" rollout is still successful$`, DeploymentRolloutIsStillSuccessful)
+    sc.Step(`^(?i)deployment "([^"]+)" has expected number of ready replicas$`, DeploymentHasExpectedReadyReplicas)
 
     sc.Step(`^(?i)ServiceAccount "([^"]*)" with needed permissions is available in test namespace$`, ServiceAccountWithNeededPermissionsIsAvailableInNamespace)
     sc.Step(`^(?i)ServiceAccount "([^"]*)" with needed permissions is available in \${TEST_NAMESPACE}$`, ServiceAccountWithNeededPermissionsIsAvailableInNamespace)
@@ -1168,3 +1176,241 @@ func latestActiveRevisionForExtension(extName string) (*ocv1.ClusterExtensionRev
 
     return latest, nil
 }
+
+// UserPerformsRolloutRestart simulates a user running `kubectl rollout restart deployment/<name>`.
+// This adds a restart annotation to the deployment's pod template to trigger a rolling restart.
+// In OLMv0, this annotation would be reverted by the controller. In OLMv1 with Server-Side Apply,
+// it should persist because the user (kubectl) manages this field, not the controller.
+// See: https://github.com/operator-framework/operator-lifecycle-manager/issues/3392
+func UserPerformsRolloutRestart(ctx context.Context, resourceName string) error {
+    sc := scenarioCtx(ctx)
+    resourceName = substituteScenarioVars(resourceName, sc)
+
+    kind, deploymentName, ok := strings.Cut(resourceName, "/")
+    if !ok {
+        return fmt.Errorf("invalid resource name format: %s (expected kind/name)", resourceName)
+    }
+
+    if kind != "deployment" {
+        return fmt.Errorf("only deployment resources are supported for restart annotation, got: %s", kind)
+    }
+
+    // Use kubectl rollout restart to add the restart annotation
+    // This is the actual command users would run, ensuring we test real-world behavior
+    out, err := k8sClient("rollout", "restart", resourceName, "-n", sc.namespace)
+    if err != nil {
+        return fmt.Errorf("failed to rollout restart %s: %w", resourceName, err)
+    }
+
+    logger.V(1).Info("Rollout restart initiated", "deployment", deploymentName, "output", out)
+
+    return nil
+}
+
+// ResourceHasRestartAnnotation verifies that a deployment has a restart annotation.
+// This confirms that user-initiated changes persist after OLM reconciliation.
+func ResourceHasRestartAnnotation(ctx context.Context, resourceName string) error {
+    sc := scenarioCtx(ctx)
+    resourceName = substituteScenarioVars(resourceName, sc)
+
+    kind, deploymentName, ok := strings.Cut(resourceName, "/")
+    if !ok {
+        return fmt.Errorf("invalid resource name format: %s (expected kind/name)", resourceName)
+    }
+
+    if kind != "deployment" {
+        return fmt.Errorf("only deployment resources are supported for restart annotation check, got: %s", kind)
+    }
+
+    // Check for the restart annotation added by kubectl rollout restart
+    restartAnnotationKey := "kubectl.kubernetes.io/restartedAt"
+
+    // Poll for the restart annotation from the deployment's pod template
+    // Use waitFor for eventual consistency rather than immediate check
+    var annotationValue string
+    waitFor(ctx, func() bool {
+        out, err := k8sClient("get", "deployment", deploymentName, "-n", sc.namespace,
+            "-o", fmt.Sprintf("jsonpath={.spec.template.metadata.annotations['%s']}", restartAnnotationKey))
+        if err != nil {
+            return false
+        }
+        // If the annotation exists and has a value, it persisted
+        if out == "" {
+            return false
+        }
+        annotationValue = out
+        return true
+    })
+
+    logger.V(1).Info("Restart annotation found", "deployment", deploymentName, "restartedAt", annotationValue)
+    return nil
+}
+
+// DeploymentIsReady verifies that a deployment is ready with all replicas available.
+func DeploymentIsReady(ctx context.Context, deploymentName string) error {
+    sc := scenarioCtx(ctx)
+    deploymentName = substituteScenarioVars(deploymentName, sc)
+
+    waitFor(ctx, func() bool {
+        // Check if deployment has ready replicas
+        out, err := k8sClient("get", "deployment", deploymentName, "-n", sc.namespace,
+            "-o", "jsonpath={.status.conditions[?(@.type=='Available')].status}")
+        if err != nil {
+            return false
+        }
+        return out == "True"
+    })
+
+    logger.V(1).Info("Deployment is ready", "deployment", deploymentName)
+    return nil
+}
+
+// DeploymentRolloutCompletesSuccessfully waits for the deployment rollout to complete.
+// This verifies that a new ReplicaSet was created and pods are running.
+func DeploymentRolloutCompletesSuccessfully(ctx context.Context, deploymentName string) error {
+    sc := scenarioCtx(ctx)
+    deploymentName = substituteScenarioVars(deploymentName, sc)
+
+    // Use kubectl rollout status to wait for completion
+    // This ensures the new ReplicaSet is created and scaled up
+    out, err := k8sClient("rollout", "status", "deployment/"+deploymentName, "-n", sc.namespace, "--timeout=5m")
+    if err != nil {
+        return fmt.Errorf("deployment rollout failed: %w, output: %s", err, out)
+    }
+
+    logger.V(1).Info("Deployment rollout completed", "deployment", deploymentName, "status", out)
+
+    // Verify deployment conditions
+    available, err := k8sClient("get", "deployment", deploymentName, "-n", sc.namespace,
+        "-o", "jsonpath={.status.conditions[?(@.type=='Available')].status}")
+    if err != nil {
+        return fmt.Errorf("failed to check deployment availability: %w", err)
+    }
+    if available != "True" {
+        return fmt.Errorf("deployment %s is not available", deploymentName)
+    }
+
+    progressing, err := k8sClient("get", "deployment", deploymentName, "-n", sc.namespace,
+        "-o", "jsonpath={.status.conditions[?(@.type=='Progressing')].status}")
+    if err != nil {
+        return fmt.Errorf("failed to check deployment progressing: %w", err)
+    }
+    if progressing != "True" {
+        return fmt.Errorf("deployment %s is not progressing correctly", deploymentName)
+    }
+
+    return nil
+}
+
+// WaitForSeconds waits for the specified number of seconds.
+// This is used to allow time for OLM reconciliation between steps.
+//
+// Note: This uses a deliberate time-based wait rather than polling because we need to ensure
+// that OLM has had time to reconcile (controller requeues every 10s). The test validates that
+// user changes persist AFTER reconciliation has had a chance to occur. A polling-based approach
+// would not guarantee that reconciliation actually happened.
+func WaitForSeconds(ctx context.Context, seconds string) error {
+    sec, err := strconv.Atoi(seconds)
+    if err != nil {
+        return fmt.Errorf("invalid seconds value %s: %w", seconds, err)
+    }
+
+    logger.V(1).Info("Waiting for reconciliation", "seconds", sec)
+
+    // Use select with context to make the wait cancellable
+    dur := time.Duration(sec) * time.Second
+    select {
+    case <-time.After(dur):
+        logger.V(1).Info("Wait complete")
+        return nil
+    case <-ctx.Done():
+        return fmt.Errorf("wait for reconciliation canceled: %w", ctx.Err())
+    }
+}
+
+// verifyDeploymentReplicaStatus is a helper function that checks if a deployment has the expected
+// number of ready replicas matching the desired replicas.
+func verifyDeploymentReplicaStatus(deploymentName, namespace string) (readyReplicas, replicas string, err error) {
+    readyReplicas, err = k8sClient("get", "deployment", deploymentName, "-n", namespace,
+        "-o", "jsonpath={.status.readyReplicas}")
+    if err != nil {
+        return "", "", fmt.Errorf("failed to get ready replicas: %w", err)
+    }
+
+    replicas, err = k8sClient("get", "deployment", deploymentName, "-n", namespace,
+        "-o", "jsonpath={.spec.replicas}")
+    if err != nil {
+        return "", "", fmt.Errorf("failed to get desired replicas: %w", err)
+    }
+
+    if readyReplicas != replicas {
+        return readyReplicas, replicas, fmt.Errorf("deployment %s has %s ready replicas but expected %s",
+            deploymentName, readyReplicas, replicas)
+    }
+
+    return readyReplicas, replicas, nil
+}
+
+// DeploymentRolloutIsStillSuccessful verifies that the deployment rollout remains successful.
+// This checks that OLM reconciliation hasn't reverted the user's rollout restart.
+// Specifically, it verifies that the new ReplicaSet is still active with running pods.
+func DeploymentRolloutIsStillSuccessful(ctx context.Context, deploymentName string) error {
+    sc := scenarioCtx(ctx)
+    deploymentName = substituteScenarioVars(deploymentName, sc)
+
+    // Check deployment status conditions
+    available, err := k8sClient("get", "deployment", deploymentName, "-n", sc.namespace,
+        "-o", "jsonpath={.status.conditions[?(@.type=='Available')].status}")
+    if err != nil {
+        return fmt.Errorf("failed to check deployment availability: %w", err)
+    }
+    if available != "True" {
+        return fmt.Errorf("deployment %s is no longer available - rollout was reverted", deploymentName)
+    }
+
+    // Verify the deployment is still progressing correctly
+    progressing, err := k8sClient("get", "deployment", deploymentName, "-n", sc.namespace,
+        "-o", "jsonpath={.status.conditions[?(@.type=='Progressing')].status}")
+    if err != nil {
+        return fmt.Errorf("failed to check deployment progressing: %w", err)
+    }
+    if progressing != "True" {
+        return fmt.Errorf("deployment %s is no longer progressing - rollout may have been reverted", deploymentName)
+    }
+
+    // Verify ready replicas match desired replicas (rollout completed and wasn't scaled down)
+    readyReplicas, replicas, err := verifyDeploymentReplicaStatus(deploymentName, sc.namespace)
+    if err != nil {
+        return fmt.Errorf("%w - rollout may have been reverted", err)
+    }
+
+    logger.V(1).Info("Deployment rollout is still successful", "deployment", deploymentName,
+        "readyReplicas", readyReplicas, "desiredReplicas", replicas)
+
+    return nil
+}
+
+// DeploymentHasExpectedReadyReplicas verifies that the deployment has the expected number of ready replicas.
+// This ensures the rollout completed successfully and pods are running.
+func DeploymentHasExpectedReadyReplicas(ctx context.Context, deploymentName string) error {
+    sc := scenarioCtx(ctx)
+    deploymentName = substituteScenarioVars(deploymentName, sc)
+
+    // Verify ready replicas match desired replicas
+    readyReplicas, replicas, err := verifyDeploymentReplicaStatus(deploymentName, sc.namespace)
+    if err != nil {
+        return err
+    }
+
+    // Additionally check for unavailable replicas
+    unavailableReplicas, err := k8sClient("get", "deployment", deploymentName, "-n", sc.namespace,
+        "-o", "jsonpath={.status.unavailableReplicas}")
+    if err == nil && unavailableReplicas != "" && unavailableReplicas != "0" {
+        return fmt.Errorf("deployment %s has %s unavailable replicas", deploymentName, unavailableReplicas)
+    }
+
+    logger.V(1).Info("Deployment has expected ready replicas", "deployment", deploymentName,
+        "readyReplicas", readyReplicas, "desiredReplicas", replicas)
+
+    return nil
+}
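The new step definitions above are wired in through RegisterSteps and exercised by the suite's godog runner. As a rough sketch of how the @BoxcutterRuntime scenario would be driven (the repository's actual runner, package name, feature paths, and options may differ, so treat all of these as assumptions), a test in the same package as RegisterSteps could look like this:

package steps // assumed package name for test/e2e/steps

import (
    "testing"

    "github.com/cucumber/godog"
)

func TestRolloutRestartFeatures(t *testing.T) {
    suite := godog.TestSuite{
        // RegisterSteps is the function extended by this commit; it registers the new
        // rollout-restart steps alongside the existing ones.
        ScenarioInitializer: func(sc *godog.ScenarioContext) {
            RegisterSteps(sc)
        },
        Options: &godog.Options{
            Format:   "pretty",
            Paths:    []string{"../features"},   // assumed location of the .feature files
            Tags:     "@BoxcutterRuntime",        // run only the Boxcutter runtime scenarios
            TestingT: t,
        },
    }
    if suite.Run() != 0 {
        t.Fatal("rollout restart feature tests failed")
    }
}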
