From 7b22ba9f234c8d61c854c23d7c1169e3632cab98 Mon Sep 17 00:00:00 2001 From: Dan Prince Date: Fri, 19 Dec 2025 13:43:04 -0500 Subject: [PATCH] Force reinstall of operator resources on release version upgrade Fixes upgrade failures from 0.4 to main caused by incompatible webhook configuration changes that trigger index out of range panics during manifest merging. When OPENSTACK_RELEASE_VERSION is bumped, the controller now: - Detects the version change by comparing against status.ReleaseVersion - Deletes all owned resources (deployments, services, serviceaccounts) - Removes managed webhooks (validating and mutating configurations) - Requeues to recreate resources with new manifests This one-time cleanup ensures a clean slate for incompatible upgrades where the structure of resources (especially webhooks) has changed between versions. Adds ReleaseVersion field to OpenStackStatus to track the deployed version. Jira: OSPRH-23865 --- .../operator.openstack.org_openstacks.yaml | 2 + api/operator/v1beta1/openstack_types.go | 3 + api/operator/v1beta1/zz_generated.deepcopy.go | 5 + .../operator.openstack.org_openstacks.yaml | 2 + .../operator/openstack_controller.go | 118 ++++++++++++++++++ 5 files changed, 130 insertions(+) diff --git a/api/bases/operator.openstack.org_openstacks.yaml b/api/bases/operator.openstack.org_openstacks.yaml index 2009dd660..f9957b774 100644 --- a/api/bases/operator.openstack.org_openstacks.yaml +++ b/api/bases/operator.openstack.org_openstacks.yaml @@ -164,6 +164,8 @@ spec: observedGeneration: format: int64 type: integer + releaseVersion: + type: string totalOperatorCount: type: integer type: object diff --git a/api/operator/v1beta1/openstack_types.go b/api/operator/v1beta1/openstack_types.go index b8833a8b5..1918bd2a3 100644 --- a/api/operator/v1beta1/openstack_types.go +++ b/api/operator/v1beta1/openstack_types.go @@ -256,6 +256,9 @@ type OpenStackStatus struct { // ContainerImage - the container image that has been successfully 
deployed ContainerImage *string `json:"containerImage,omitempty"` + + // ReleaseVersion - the OpenStack release version that has been successfully deployed + ReleaseVersion *string `json:"releaseVersion,omitempty"` } // +kubebuilder:object:root=true diff --git a/api/operator/v1beta1/zz_generated.deepcopy.go b/api/operator/v1beta1/zz_generated.deepcopy.go index 64cc6b57c..601347ecf 100644 --- a/api/operator/v1beta1/zz_generated.deepcopy.go +++ b/api/operator/v1beta1/zz_generated.deepcopy.go @@ -158,6 +158,11 @@ func (in *OpenStackStatus) DeepCopyInto(out *OpenStackStatus) { *out = new(string) **out = **in } + if in.ReleaseVersion != nil { + in, out := &in.ReleaseVersion, &out.ReleaseVersion + *out = new(string) + **out = **in + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new OpenStackStatus. diff --git a/config/crd/bases/operator.openstack.org_openstacks.yaml b/config/crd/bases/operator.openstack.org_openstacks.yaml index 2009dd660..f9957b774 100644 --- a/config/crd/bases/operator.openstack.org_openstacks.yaml +++ b/config/crd/bases/operator.openstack.org_openstacks.yaml @@ -164,6 +164,8 @@ spec: observedGeneration: format: int64 type: integer + releaseVersion: + type: string totalOperatorCount: type: integer type: object diff --git a/internal/controller/operator/openstack_controller.go b/internal/controller/operator/openstack_controller.go index 52708237a..72f7e39c8 100644 --- a/internal/controller/operator/openstack_controller.go +++ b/internal/controller/operator/openstack_controller.go @@ -250,6 +250,39 @@ func (r *OpenStackReconciler) Reconcile(ctx context.Context, req ctrl.Request) ( return ctrl.Result{}, err } + // Check if OPENSTACK_RELEASE_VERSION has changed - if so, delete all owned resources + // This is a one-time fix to handle incompatible upgrades + if instance.Status.ReleaseVersion != nil && *instance.Status.ReleaseVersion != openstackReleaseVersion { + Log.Info("OpenStack release version changed, deleting 
all owned resources", + "old", *instance.Status.ReleaseVersion, + "new", openstackReleaseVersion) + + if err := r.deleteAllOwnedResources(ctx, instance); err != nil { + instance.Status.Conditions.Set(condition.FalseCondition( + operatorv1beta1.OpenStackOperatorReadyCondition, + condition.ErrorReason, + condition.SeverityWarning, + operatorv1beta1.OpenStackOperatorErrorMessage, + err)) + return ctrl.Result{}, err + } + + // Reset the container image status to force re-application of CRDs and RBAC + instance.Status.ContainerImage = nil + + // Update the release version in status + instance.Status.ReleaseVersion = &openstackReleaseVersion + + // Requeue to allow resources to be deleted before recreating + Log.Info("Resources deleted, requeuing to recreate with new version") + return ctrl.Result{RequeueAfter: time.Duration(5) * time.Second}, nil + } + + // Set the release version if not set + if instance.Status.ReleaseVersion == nil { + instance.Status.ReleaseVersion = &openstackReleaseVersion + } + if err := r.applyManifests(ctx, instance); err != nil { instance.Status.Conditions.Set(condition.FalseCondition( operatorv1beta1.OpenStackOperatorReadyCondition, @@ -316,6 +349,91 @@ func (r *OpenStackReconciler) Reconcile(ctx context.Context, req ctrl.Request) ( } +func (r *OpenStackReconciler) deleteAllOwnedResources(ctx context.Context, instance *operatorv1beta1.OpenStack) error { + Log := r.GetLogger(ctx) + Log.Info("Deleting all owned resources for release version upgrade") + + // Delete all owned deployments + deployments := &appsv1.DeploymentList{} + err := r.List(ctx, deployments, &client.ListOptions{Namespace: instance.Namespace}) + if err != nil { + return errors.Wrap(err, "failed to list deployments") + } + for _, deployment := range deployments.Items { + if metav1.IsControlledBy(&deployment, instance) { + Log.Info("Deleting deployment", "name", deployment.Name) + err := r.Delete(ctx, &deployment) + if err != nil && !apierrors.IsNotFound(err) { + return 
errors.Wrapf(err, "failed to delete deployment %s", deployment.Name) + } + } + } + + // Delete all owned service accounts + serviceAccounts := &corev1.ServiceAccountList{} + err = r.List(ctx, serviceAccounts, &client.ListOptions{Namespace: instance.Namespace}) + if err != nil { + return errors.Wrap(err, "failed to list service accounts") + } + for _, sa := range serviceAccounts.Items { + if metav1.IsControlledBy(&sa, instance) { + Log.Info("Deleting service account", "name", sa.Name) + err := r.Delete(ctx, &sa) + if err != nil && !apierrors.IsNotFound(err) { + return errors.Wrapf(err, "failed to delete service account %s", sa.Name) + } + } + } + + // Delete all owned services + services := &corev1.ServiceList{} + err = r.List(ctx, services, &client.ListOptions{Namespace: instance.Namespace}) + if err != nil { + return errors.Wrap(err, "failed to list services") + } + for _, svc := range services.Items { + if metav1.IsControlledBy(&svc, instance) { + Log.Info("Deleting service", "name", svc.Name) + err := r.Delete(ctx, &svc) + if err != nil && !apierrors.IsNotFound(err) { + return errors.Wrapf(err, "failed to delete service %s", svc.Name) + } + } + } + + // Delete webhooks (these are cluster-scoped and not owned, but managed by label) + valWebhooks, err := r.Kclient.AdmissionregistrationV1().ValidatingWebhookConfigurations().List(ctx, metav1.ListOptions{ + LabelSelector: "openstack.openstack.org/managed=true", + }) + if err != nil { + return errors.Wrap(err, "failed listing validating webhook configurations") + } + for _, webhook := range valWebhooks.Items { + Log.Info("Deleting validating webhook", "name", webhook.Name) + err := r.Kclient.AdmissionregistrationV1().ValidatingWebhookConfigurations().Delete(ctx, webhook.Name, metav1.DeleteOptions{}) + if err != nil && !apierrors.IsNotFound(err) { + return errors.Wrapf(err, "failed to delete validating webhook %s", webhook.Name) + } + } + + mutWebhooks, err := 
r.Kclient.AdmissionregistrationV1().MutatingWebhookConfigurations().List(ctx, metav1.ListOptions{ + LabelSelector: "openstack.openstack.org/managed=true", + }) + if err != nil { + return errors.Wrap(err, "failed listing mutating webhook configurations") + } + for _, webhook := range mutWebhooks.Items { + Log.Info("Deleting mutating webhook", "name", webhook.Name) + err := r.Kclient.AdmissionregistrationV1().MutatingWebhookConfigurations().Delete(ctx, webhook.Name, metav1.DeleteOptions{}) + if err != nil && !apierrors.IsNotFound(err) { + return errors.Wrapf(err, "failed to delete mutating webhook %s", webhook.Name) + } + } + + Log.Info("All owned resources deleted successfully") + return nil +} + func (r *OpenStackReconciler) reconcileDelete(ctx context.Context, instance *operatorv1beta1.OpenStack, helper *helper.Helper) (ctrl.Result, error) { Log := r.GetLogger(ctx) Log.Info("Reconciling OpenStack initialization resource delete")