
Commit 741afe1

Add test verifying that OLMv1 does not revert user-initiated changes to deployed resources
Generated-by: Cursor/Claude
1 parent 10c2841 commit 741afe1

2 files changed: 339 additions, 0 deletions
Lines changed: 101 additions & 0 deletions
@@ -0,0 +1,101 @@
Feature: Rollout Restart User Changes

  # This test verifies that OLMv1 does not revert user-initiated changes to deployed resources,
  # specifically testing the scenario where a user runs `kubectl rollout restart deployment`.
  #
  # Background:
  #   - In OLMv0, running `kubectl rollout restart deployment` would add a restart annotation
  #     to the deployment, but OLM would revert this change because it actively reconciles
  #     the deployment based on CSV contents.
  #   - In OLMv1, we use Server-Side Apply, which should only manage the fields that the controller
  #     explicitly owns, allowing user-initiated changes to other fields to persist.
  #
  # This test ensures that OLMv1 handles this correctly and does not exhibit the OLMv0 behavior.
  # See: https://github.com/operator-framework/operator-lifecycle-manager/issues/3392

  Background:
    Given OLM is available
    And ClusterCatalog "test" serves bundles
    And ServiceAccount "olm-sa" with needed permissions is available in ${TEST_NAMESPACE}

  # KNOWN BUG: Boxcutter currently reverts user-initiated deployment changes
  #
  # SYMPTOM (as described in upstream issue #3392):
  #   When a user runs `kubectl rollout restart deployment`, a new ReplicaSet is created
  #   but it gets immediately scaled down (1 -> 0) and the old ReplicaSet takes over.
  #   The deployment status shows:
  #     - OldReplicaSets: deployment-xyz (0/0 replicas)
  #     - NewReplicaSet:  deployment-abc (1/1 replicas) <- This gets scaled down
  #   Expected behavior: the new RS should stay up with running pods, and the old RS scaled to 0.
  #
  # ROOT CAUSE:
  #   When Boxcutter (pkg.package-operator.run/boxcutter v0.10.0) reconciles a
  #   ClusterExtensionRevision, it applies the manifest stored in the revision spec.
  #   This manifest snapshot was taken when the bundle was unpacked and does NOT
  #   include user-added annotations like the restart timestamp from `kubectl rollout restart`.
  #
  #   Because Boxcutter applies this full manifest (including the annotations field),
  #   it takes ownership of the entire field via Server-Side Apply, overwriting any
  #   annotations added by other field managers (like kubectl). This causes:
  #     1. kubectl adds restart annotation -> deployment controller creates new RS
  #     2. Boxcutter reconciles -> removes restart annotation
  #     3. Deployment controller sees no restart needed -> scales down new RS
  #     4. Old RS takes over, rollout is reverted
  #
  # EXPECTED BEHAVIOR (not yet implemented):
  #   The controller should only manage the fields it explicitly cares about, allowing
  #   user-managed fields (like restart annotations) to persist across reconciliation.
  #
  # This is the same issue that existed in OLMv0 (see the upstream issue below) and needs
  # to be fixed in the OLMv1 Boxcutter implementation.
  #
  # UPSTREAM ISSUE: https://github.com/operator-framework/operator-lifecycle-manager/issues/3392
  #
  # TODO: Fix the Boxcutter implementation to properly use Server-Side Apply field
  # ownership so user changes persist, then remove the @skip tag.
  @skip
  @BoxcutterRuntime
  Scenario: User-initiated deployment changes persist after OLM reconciliation
    When ClusterExtension is applied
      """
      apiVersion: olm.operatorframework.io/v1
      kind: ClusterExtension
      metadata:
        name: ${NAME}
      spec:
        namespace: ${TEST_NAMESPACE}
        serviceAccount:
          name: olm-sa
        source:
          sourceType: Catalog
          catalog:
            packageName: test
            selector:
              matchLabels:
                "olm.operatorframework.io/metadata.name": test-catalog
      """
    Then ClusterExtension is rolled out
    And ClusterExtension is available
    And resource "deployment/test-operator" is installed
    And deployment "test-operator" is ready

    # Simulate user running "kubectl rollout restart deployment/test-operator".
    # This adds a restart annotation to trigger a rolling restart.
    # In OLMv0, the controller would revert this annotation, causing the rollout to fail.
    # In OLMv1 with SSA, the annotation should persist because kubectl owns this field.
    When user performs rollout restart on "deployment/test-operator"

    # Wait for the rollout to complete - new ReplicaSet created, pods rolled out.
    Then deployment "test-operator" rollout completes successfully
    And resource "deployment/test-operator" has restart annotation

    # Wait for OLM to reconcile (controller requeues every 10s).
    # This is the critical test: does OLM revert the user's changes?
    And I wait for "30" seconds

    # After reconciliation, verify the rollout is STILL successful.
    # In OLMv0, this would fail because OLM reverts the annotation,
    # causing the new RS to scale down and the old RS to take over.
    Then deployment "test-operator" rollout is still successful
    And resource "deployment/test-operator" has restart annotation
    And deployment "test-operator" has expected number of ready replicas
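The ROOT CAUSE and EXPECTED BEHAVIOR notes above hinge on Server-Side Apply field ownership. The sketch below is not part of this commit; it only illustrates the expected pattern with client-go's typed apply configurations: the controller applies just the fields listed in its apply configuration under its own field manager, so a pod-template annotation owned by another manager (such as kubectl's restartedAt timestamp) is left alone on reconcile. The deployment name, namespace, image, and the "olm-controller" field manager are placeholders, not values taken from OLMv1.

// Minimal sketch of Server-Side Apply with an explicit, narrow field set.
package main

import (
	"context"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	appsv1ac "k8s.io/client-go/applyconfigurations/apps/v1"
	corev1ac "k8s.io/client-go/applyconfigurations/core/v1"
	metav1ac "k8s.io/client-go/applyconfigurations/meta/v1"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/tools/clientcmd"
)

func main() {
	cfg, err := clientcmd.BuildConfigFromFlags("", clientcmd.RecommendedHomeFile)
	if err != nil {
		panic(err)
	}
	cs := kubernetes.NewForConfigOrDie(cfg)

	// The apply configuration declares only the fields this manager owns; pod-template
	// annotations are deliberately absent, so SSA does not touch entries owned by kubectl.
	deploy := appsv1ac.Deployment("test-operator", "test-namespace").
		WithSpec(appsv1ac.DeploymentSpec().
			WithReplicas(1).
			WithSelector(metav1ac.LabelSelector().
				WithMatchLabels(map[string]string{"app": "test-operator"})).
			WithTemplate(corev1ac.PodTemplateSpec().
				WithLabels(map[string]string{"app": "test-operator"}).
				WithSpec(corev1ac.PodSpec().
					WithContainers(corev1ac.Container().
						WithName("manager").
						WithImage("example.com/test-operator:latest")))))

	// Apply under a dedicated field manager; fields not listed above keep their current owners.
	if _, err := cs.AppsV1().Deployments("test-namespace").Apply(
		context.Background(), deploy,
		metav1.ApplyOptions{FieldManager: "olm-controller", Force: true},
	); err != nil {
		panic(err)
	}
}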

test/e2e/steps/steps.go

Lines changed: 238 additions & 0 deletions
@@ -14,6 +14,7 @@ import (
	"os/exec"
	"path/filepath"
	"reflect"
	"strconv"
	"strings"
	"time"

@@ -87,6 +88,13 @@ func RegisterSteps(sc *godog.ScenarioContext) {
	sc.Step(`^(?i)resource apply fails with error msg containing "([^"]+)"$`, ResourceApplyFails)
	sc.Step(`^(?i)resource "([^"]+)" is eventually restored$`, ResourceRestored)
	sc.Step(`^(?i)resource "([^"]+)" matches$`, ResourceMatches)
	sc.Step(`^(?i)user performs rollout restart on "([^"]+)"$`, UserPerformsRolloutRestart)
	sc.Step(`^(?i)resource "([^"]+)" has restart annotation$`, ResourceHasRestartAnnotation)
	sc.Step(`^(?i)deployment "([^"]+)" is ready$`, DeploymentIsReady)
	sc.Step(`^(?i)deployment "([^"]+)" rollout completes successfully$`, DeploymentRolloutCompletesSuccessfully)
	sc.Step(`^(?i)I wait for "([^"]+)" seconds$`, WaitForSeconds)
	sc.Step(`^(?i)deployment "([^"]+)" rollout is still successful$`, DeploymentRolloutIsStillSuccessful)
	sc.Step(`^(?i)deployment "([^"]+)" has expected number of ready replicas$`, DeploymentHasExpectedReadyReplicas)

	sc.Step(`^(?i)ServiceAccount "([^"]*)" with needed permissions is available in test namespace$`, ServiceAccountWithNeededPermissionsIsAvailableInNamespace)
	sc.Step(`^(?i)ServiceAccount "([^"]*)" with needed permissions is available in \${TEST_NAMESPACE}$`, ServiceAccountWithNeededPermissionsIsAvailableInNamespace)
@@ -1168,3 +1176,233 @@ func latestActiveRevisionForExtension(extName string) (*ocv1.ClusterExtensionRev

	return latest, nil
}

// UserPerformsRolloutRestart simulates a user running `kubectl rollout restart deployment/<name>`.
// This adds a restart annotation to the deployment's pod template to trigger a rolling restart.
// In OLMv0, this annotation would be reverted by the controller. In OLMv1 with Server-Side Apply,
// it should persist because the user (kubectl) manages this field, not the controller.
// See: https://github.com/operator-framework/operator-lifecycle-manager/issues/3392
func UserPerformsRolloutRestart(ctx context.Context, resourceName string) error {
	sc := scenarioCtx(ctx)
	resourceName = substituteScenarioVars(resourceName, sc)

	kind, deploymentName, ok := strings.Cut(resourceName, "/")
	if !ok {
		return fmt.Errorf("invalid resource name format: %s (expected kind/name)", resourceName)
	}

	if kind != "deployment" {
		return fmt.Errorf("only deployment resources are supported for restart annotation, got: %s", kind)
	}

	// Capture current generation before restart
	beforeGen, err := k8sClient("get", "deployment", deploymentName, "-n", sc.namespace,
		"-o", "jsonpath={.metadata.generation}")
	if err != nil {
		return fmt.Errorf("failed to get deployment generation before restart: %w", err)
	}

	logger.V(1).Info("Deployment state before rollout restart", "deployment", deploymentName, "generation", beforeGen)

	// Use kubectl rollout restart to add the restart annotation.
	// This is the actual command users would run, ensuring we test real-world behavior.
	out, err := k8sClient("rollout", "restart", resourceName, "-n", sc.namespace)
	if err != nil {
		return fmt.Errorf("failed to rollout restart %s: %w", resourceName, err)
	}

	logger.V(1).Info("Rollout restart initiated", "deployment", deploymentName, "output", out)

	return nil
}
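
The step above shells out to kubectl; what `kubectl rollout restart` effectively does is stamp the pod template with a `kubectl.kubernetes.io/restartedAt` annotation, which makes the Deployment controller create a new ReplicaSet. The hedged sketch below shows an equivalent in-process patch with client-go; it mirrors rather than reproduces kubectl's implementation, and the "e2e-rollout-restart" field manager, clientset wiring, and names are invented for illustration.

// Illustrative sketch only, not part of this commit.
package sketch

import (
	"context"
	"fmt"
	"time"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/client-go/kubernetes"
)

// restartDeployment merge-patches only the restartedAt pod-template annotation; all other
// fields stay untouched, so Server-Side Apply ownership of the rest of the Deployment is unaffected.
func restartDeployment(ctx context.Context, cs kubernetes.Interface, namespace, name string) error {
	patch := fmt.Sprintf(
		`{"spec":{"template":{"metadata":{"annotations":{"kubectl.kubernetes.io/restartedAt":%q}}}}}`,
		time.Now().UTC().Format(time.RFC3339),
	)
	_, err := cs.AppsV1().Deployments(namespace).Patch(
		ctx, name, types.MergePatchType, []byte(patch),
		metav1.PatchOptions{FieldManager: "e2e-rollout-restart"},
	)
	return err
}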

// ResourceHasRestartAnnotation verifies that a deployment has a restart annotation.
// This confirms that user-initiated changes persist after OLM reconciliation.
func ResourceHasRestartAnnotation(ctx context.Context, resourceName string) error {
	sc := scenarioCtx(ctx)
	resourceName = substituteScenarioVars(resourceName, sc)

	kind, deploymentName, ok := strings.Cut(resourceName, "/")
	if !ok {
		return fmt.Errorf("invalid resource name format: %s (expected kind/name)", resourceName)
	}

	if kind != "deployment" {
		return fmt.Errorf("only deployment resources are supported for restart annotation check, got: %s", kind)
	}

	// Check for the restart annotation added by kubectl rollout restart.
	restartAnnotationKey := "kubectl.kubernetes.io/restartedAt"

	// Get the restart annotation from the deployment's pod template.
	out, err := k8sClient("get", "deployment", deploymentName, "-n", sc.namespace,
		"-o", fmt.Sprintf("jsonpath={.spec.template.metadata.annotations['%s']}", restartAnnotationKey))
	if err != nil {
		return fmt.Errorf("failed to get restart annotation: %w", err)
	}

	// If the annotation exists and has a value, it persisted.
	if out == "" {
		return fmt.Errorf("restart annotation not found on deployment %s", deploymentName)
	}

	logger.V(1).Info("Restart annotation found", "deployment", deploymentName, "restartedAt", out)
	return nil
}
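
The check above only asserts that the annotation value is present. A stricter assertion, in the spirit of the Server-Side Apply discussion in the feature file, would confirm which field manager owns the annotation by inspecting `metadata.managedFields`. The sketch below is an illustrative client-go helper with invented names, not part of the commit.

// Illustrative sketch only, not part of this commit.
package sketch

import (
	"context"
	"encoding/json"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
)

// restartAnnotationOwners returns the field managers that claim ownership of the
// kubectl.kubernetes.io/restartedAt pod-template annotation in metadata.managedFields.
// If only kubectl-style managers appear here (and not the OLM controller), the annotation is user-owned.
func restartAnnotationOwners(ctx context.Context, cs kubernetes.Interface, namespace, name string) ([]string, error) {
	dep, err := cs.AppsV1().Deployments(namespace).Get(ctx, name, metav1.GetOptions{})
	if err != nil {
		return nil, err
	}
	var owners []string
	for _, mf := range dep.ManagedFields {
		if mf.FieldsV1 == nil {
			continue
		}
		var fields map[string]any
		if err := json.Unmarshal(mf.FieldsV1.Raw, &fields); err != nil {
			return nil, err
		}
		// managedFields uses "f:" prefixes for field names; walk down to the pod-template annotations.
		ann := dig(fields, "f:spec", "f:template", "f:metadata", "f:annotations")
		if ann == nil {
			continue
		}
		if _, ok := ann["f:kubectl.kubernetes.io/restartedAt"]; ok {
			owners = append(owners, mf.Manager)
		}
	}
	return owners, nil
}

// dig walks nested map[string]any values by key, returning nil if any step is missing.
func dig(m map[string]any, keys ...string) map[string]any {
	cur := m
	for _, k := range keys {
		next, ok := cur[k].(map[string]any)
		if !ok {
			return nil
		}
		cur = next
	}
	return cur
}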

// DeploymentIsReady verifies that a deployment is ready with all replicas available.
func DeploymentIsReady(ctx context.Context, deploymentName string) error {
	sc := scenarioCtx(ctx)
	deploymentName = substituteScenarioVars(deploymentName, sc)

	waitFor(ctx, func() bool {
		// Check the deployment's Available condition (minimum replicas are ready).
		out, err := k8sClient("get", "deployment", deploymentName, "-n", sc.namespace,
			"-o", "jsonpath={.status.conditions[?(@.type=='Available')].status}")
		if err != nil {
			return false
		}
		return out == "True"
	})

	logger.V(1).Info("Deployment is ready", "deployment", deploymentName)
	return nil
}

// DeploymentRolloutCompletesSuccessfully waits for the deployment rollout to complete.
// This verifies that a new ReplicaSet was created and pods are running.
func DeploymentRolloutCompletesSuccessfully(ctx context.Context, deploymentName string) error {
	sc := scenarioCtx(ctx)
	deploymentName = substituteScenarioVars(deploymentName, sc)

	// Use kubectl rollout status to wait for completion.
	// This ensures the new ReplicaSet is created and scaled up.
	out, err := k8sClient("rollout", "status", "deployment/"+deploymentName, "-n", sc.namespace, "--timeout=5m")
	if err != nil {
		return fmt.Errorf("deployment rollout failed: %w, output: %s", err, out)
	}

	logger.V(1).Info("Deployment rollout completed", "deployment", deploymentName, "status", out)

	// Verify deployment conditions
	available, err := k8sClient("get", "deployment", deploymentName, "-n", sc.namespace,
		"-o", "jsonpath={.status.conditions[?(@.type=='Available')].status}")
	if err != nil {
		return fmt.Errorf("failed to check deployment availability: %w", err)
	}
	if available != "True" {
		return fmt.Errorf("deployment %s is not available", deploymentName)
	}

	progressing, err := k8sClient("get", "deployment", deploymentName, "-n", sc.namespace,
		"-o", "jsonpath={.status.conditions[?(@.type=='Progressing')].status}")
	if err != nil {
		return fmt.Errorf("failed to check deployment progressing: %w", err)
	}
	if progressing != "True" {
		return fmt.Errorf("deployment %s is not progressing correctly", deploymentName)
	}

	return nil
}

// WaitForSeconds waits for the specified number of seconds.
// This is used to allow time for OLM reconciliation between steps.
func WaitForSeconds(ctx context.Context, seconds string) error {
	sec, err := strconv.Atoi(seconds)
	if err != nil {
		return fmt.Errorf("invalid seconds value %s: %w", seconds, err)
	}

	logger.V(1).Info("Waiting for reconciliation", "seconds", sec)
	time.Sleep(time.Duration(sec) * time.Second)
	logger.V(1).Info("Wait complete")

	return nil
}
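
A design note on the fixed sleep above: because the interesting failure is a revert that can happen at any point during the reconcile window, an alternative is to keep re-checking the assertion for the whole window and fail as soon as it stops holding, rather than sleeping blindly and checking once afterwards. The sketch below uses only the standard library and assumes the caller supplies the check to repeat; the helper name and signature are invented for illustration.

// Illustrative sketch only, not part of this commit.
package sketch

import (
	"context"
	"fmt"
	"time"
)

// assertHoldsFor re-runs check at the given interval for the whole observation window and
// fails as soon as the check stops holding (for example, when the controller reverts the
// restart annotation).
func assertHoldsFor(ctx context.Context, window, interval time.Duration, check func(context.Context) error) error {
	deadline := time.Now().Add(window)
	ticker := time.NewTicker(interval)
	defer ticker.Stop()
	for time.Now().Before(deadline) {
		if err := check(ctx); err != nil {
			return fmt.Errorf("assertion stopped holding during the observation window: %w", err)
		}
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-ticker.C:
		}
	}
	return nil
}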

// DeploymentRolloutIsStillSuccessful verifies that the deployment rollout remains successful.
// This checks that OLM reconciliation hasn't reverted the user's rollout restart.
// Specifically, it verifies that the new ReplicaSet is still active with running pods.
func DeploymentRolloutIsStillSuccessful(ctx context.Context, deploymentName string) error {
	sc := scenarioCtx(ctx)
	deploymentName = substituteScenarioVars(deploymentName, sc)

	// Check deployment status conditions
	available, err := k8sClient("get", "deployment", deploymentName, "-n", sc.namespace,
		"-o", "jsonpath={.status.conditions[?(@.type=='Available')].status}")
	if err != nil {
		return fmt.Errorf("failed to check deployment availability: %w", err)
	}
	if available != "True" {
		return fmt.Errorf("deployment %s is no longer available - rollout was reverted", deploymentName)
	}

	// Verify the deployment is still progressing correctly
	progressing, err := k8sClient("get", "deployment", deploymentName, "-n", sc.namespace,
		"-o", "jsonpath={.status.conditions[?(@.type=='Progressing')].status}")
	if err != nil {
		return fmt.Errorf("failed to check deployment progressing: %w", err)
	}
	if progressing != "True" {
		return fmt.Errorf("deployment %s is no longer progressing - rollout may have been reverted", deploymentName)
	}

	// Verify ready replicas match desired replicas (rollout completed and wasn't scaled down)
	readyReplicas, err := k8sClient("get", "deployment", deploymentName, "-n", sc.namespace,
		"-o", "jsonpath={.status.readyReplicas}")
	if err != nil {
		return fmt.Errorf("failed to get ready replicas: %w", err)
	}

	replicas, err := k8sClient("get", "deployment", deploymentName, "-n", sc.namespace,
		"-o", "jsonpath={.spec.replicas}")
	if err != nil {
		return fmt.Errorf("failed to get desired replicas: %w", err)
	}

	if readyReplicas != replicas {
		return fmt.Errorf("deployment %s has %s ready replicas but expected %s - rollout may have been reverted",
			deploymentName, readyReplicas, replicas)
	}

	logger.V(1).Info("Deployment rollout is still successful", "deployment", deploymentName,
		"readyReplicas", readyReplicas, "desiredReplicas", replicas)

	return nil
}
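
The symptom described in issue #3392 shows up at the ReplicaSet level: the ReplicaSet created by the restart is scaled back down and an older one takes over. As a complement to the condition- and replica-count assertions above, the sketch below inspects the ReplicaSets owned by the deployment directly. It is an illustrative client-go helper with invented names and is not part of this commit.

// Illustrative sketch only, not part of this commit.
package sketch

import (
	"context"
	"fmt"
	"strconv"

	appsv1 "k8s.io/api/apps/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
)

// newestReplicaSetHoldsReplicas asserts that the ReplicaSet from the most recent rollout
// (highest deployment.kubernetes.io/revision) still has ready pods and that older
// ReplicaSets are scaled to zero - the opposite of the revert symptom in issue #3392.
func newestReplicaSetHoldsReplicas(ctx context.Context, cs kubernetes.Interface, namespace, name string) error {
	dep, err := cs.AppsV1().Deployments(namespace).Get(ctx, name, metav1.GetOptions{})
	if err != nil {
		return err
	}
	selector, err := metav1.LabelSelectorAsSelector(dep.Spec.Selector)
	if err != nil {
		return err
	}
	rsList, err := cs.AppsV1().ReplicaSets(namespace).List(ctx, metav1.ListOptions{LabelSelector: selector.String()})
	if err != nil {
		return err
	}

	// The deployment controller stamps each ReplicaSet with a revision annotation;
	// the highest revision belongs to the most recent rollout (e.g. the restart).
	var newest *appsv1.ReplicaSet
	newestRev := -1
	for i := range rsList.Items {
		rs := &rsList.Items[i]
		if !metav1.IsControlledBy(rs, dep) {
			continue // same labels, different owner
		}
		rev, err := strconv.Atoi(rs.Annotations["deployment.kubernetes.io/revision"])
		if err != nil {
			continue
		}
		if rev > newestRev {
			newestRev, newest = rev, rs
		}
	}
	if newest == nil {
		return fmt.Errorf("no ReplicaSets found for deployment %s", name)
	}
	if newest.Status.ReadyReplicas == 0 {
		return fmt.Errorf("newest ReplicaSet %s has no ready replicas - the restart appears to have been reverted", newest.Name)
	}
	for i := range rsList.Items {
		rs := &rsList.Items[i]
		if metav1.IsControlledBy(rs, dep) && rs.Name != newest.Name && rs.Status.Replicas != 0 {
			return fmt.Errorf("old ReplicaSet %s still has %d replicas", rs.Name, rs.Status.Replicas)
		}
	}
	return nil
}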

// DeploymentHasExpectedReadyReplicas verifies that the deployment has the expected number of ready replicas.
// This ensures the rollout completed successfully and pods are running.
func DeploymentHasExpectedReadyReplicas(ctx context.Context, deploymentName string) error {
	sc := scenarioCtx(ctx)
	deploymentName = substituteScenarioVars(deploymentName, sc)

	readyReplicas, err := k8sClient("get", "deployment", deploymentName, "-n", sc.namespace,
		"-o", "jsonpath={.status.readyReplicas}")
	if err != nil {
		return fmt.Errorf("failed to get ready replicas: %w", err)
	}

	replicas, err := k8sClient("get", "deployment", deploymentName, "-n", sc.namespace,
		"-o", "jsonpath={.spec.replicas}")
	if err != nil {
		return fmt.Errorf("failed to get desired replicas: %w", err)
	}

	if readyReplicas != replicas {
		return fmt.Errorf("deployment %s has %s ready replicas but expected %s",
			deploymentName, readyReplicas, replicas)
	}

	unavailableReplicas, err := k8sClient("get", "deployment", deploymentName, "-n", sc.namespace,
		"-o", "jsonpath={.status.unavailableReplicas}")
	if err == nil && unavailableReplicas != "" && unavailableReplicas != "0" {
		return fmt.Errorf("deployment %s has %s unavailable replicas", deploymentName, unavailableReplicas)
	}

	logger.V(1).Info("Deployment has expected ready replicas", "deployment", deploymentName,
		"readyReplicas", readyReplicas, "desiredReplicas", replicas)

	return nil
}
