diff --git a/api/v1alpha1/constants.go b/api/v1alpha1/constants.go index 796f1c07..e26f4246 100644 --- a/api/v1alpha1/constants.go +++ b/api/v1alpha1/constants.go @@ -24,6 +24,9 @@ const ( OperationAnnotationForceUpdateOrDeleteInProgress = "ForceUpdateOrDeleteInProgress" // OperationAnnotationForceUpdateInProgress allows update of a resource even if it is in progress. OperationAnnotationForceUpdateInProgress = "ForceUpdateInProgress" + + // SkipBootConfiguration skips the boot configuration if set to this value. + SkipBootConfiguration = "skip-boot-configuration-enforement" // OperationAnnotationForceReset forces a reset before next operation OperationAnnotationForceReset = "ForceReset" ) diff --git a/api/v1alpha1/server_types.go b/api/v1alpha1/server_types.go index 4f06d88a..ddf13c5f 100644 --- a/api/v1alpha1/server_types.go +++ b/api/v1alpha1/server_types.go @@ -130,6 +130,7 @@ type ServerSpec struct { MaintenanceBootConfigurationRef *v1.ObjectReference `json:"maintenanceBootConfigurationRef,omitempty"` // BootOrder specifies the boot order of the server. + // Deprecated: currently not supported. // +optional BootOrder []BootOrder `json:"bootOrder,omitempty"` diff --git a/api/v1alpha1/servermaintenance_types.go b/api/v1alpha1/servermaintenance_types.go index 06b05d91..df094062 100644 --- a/api/v1alpha1/servermaintenance_types.go +++ b/api/v1alpha1/servermaintenance_types.go @@ -23,11 +23,33 @@ type ServerBootConfigurationTemplate struct { // +required Name string `json:"name"` + // BootType specifies the type of boot configuration. + // It can be either "OneOff" or "Persistent". If not specified, it defaults to "Persistent". + // "Persistent" will set the configuration reboot the server before setting the server to the desired power state mentioned in spec. + // + // "OneOff" will set the configuration to boot once. next boot-up will boot into default boot order. + // if power state mentioned in spec is "PowerOn", server will be powered on and will boot into the configuration. + // if power state is "PowerOff", server will be powered off and will boot into the configuration on next power on. + // + // +kubebuilder:validation:Enum=OneOff;Persistent + // +optional + // +kubebuilder:default=Persistent + BootType BootType `json:"bootType,omitempty"` + // Parameters specify the parameters to be used for rendering the boot configuration. // +required Spec ServerBootConfigurationSpec `json:"spec"` } +type BootType string + +const ( + // BootTypeOneOff indicates that the server should boot into this configuration one time. + BootTypeOneOff BootType = "OneOff" + // BootTypePersistent indicates that the server should boot into this configuration continuously. + BootTypePersistent BootType = "Persistent" +) + // ServerMaintenanceSpec defines the desired state of a ServerMaintenance type ServerMaintenanceSpec struct { // Policy specifies the maintenance policy to be enforced on the server. @@ -61,8 +83,46 @@ const ( type ServerMaintenanceStatus struct { // State specifies the current state of the server maintenance. State ServerMaintenanceState `json:"state,omitempty"` + + // BootOrderStatus specifies the boot order of the server before maintenance. + // is used to store the boot order of server before it is changed for maintenance. + // used only with BootType Persistent. + // +optional + BootOrderStatus *ServerMaintenanceBootOrder `json:"bootOrderStatus,omitempty"` +} + +type ServerMaintenanceBootOrder struct { + // DefaultBootOrder specifies the default boot order of the server before maintenance. + // +optional + DefaultBootOrder []string `json:"defaultBootOrder,omitempty"` + + State ServerMaintenanceBootOrderState `json:"state,omitempty"` } +// ServerMaintenanceState specifies the current state of the server maintenance. +type ServerMaintenanceBootOrderState string + +const ( + + // BootOrderConfigNoOp specifies that the server bootOrder is as expected. + BootOrderConfigNoOp ServerMaintenanceBootOrderState = "NoOp" + + // BootOrderConfigInProgress specifies that the server bootOrder configuration is InProgress. + BootOrderConfigInProgress ServerMaintenanceBootOrderState = "InProgress" + + // BootOrderConfigSuccess specifies that the server bootOrder configuration is Completed. + BootOrderConfigSuccess ServerMaintenanceBootOrderState = "Completed" + + // BootOrderConfigSuccessRevertInProgress specifies that the server bootOrder revert to default is InProgress. + BootOrderConfigSuccessRevertInProgress ServerMaintenanceBootOrderState = "RevertInProgress" + + // BootOrderConfigRevertSuccess specifies that the server bootOrder configuration is Completed. + BootOrderConfigRevertSuccess ServerMaintenanceBootOrderState = "RevertCompleted" + + // BootOrderOneOffPxeBootSuccess specifies that the server bootOrder configuration is set to boot pxe once. + BootOrderOneOffPxeBootSuccess ServerMaintenanceBootOrderState = "PxeOneOffBootSuccess" +) + // ServerMaintenanceState specifies the current state of the server maintenance. type ServerMaintenanceState string @@ -71,6 +131,8 @@ const ( ServerMaintenanceStatePending ServerMaintenanceState = "Pending" // ServerMaintenanceStateInMaintenance specifies that the server is in maintenance. ServerMaintenanceStateInMaintenance ServerMaintenanceState = "InMaintenance" + // ServerMaintenanceStatePreparing specifies that the server is in maintenance. + ServerMaintenanceStatePreparing ServerMaintenanceState = "Preparing" // ServerMaintenanceStateFailed specifies that the server maintenance has failed. ServerMaintenanceStateFailed ServerMaintenanceState = "Failed" ) diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go index cd44503e..09d61e41 100644 --- a/api/v1alpha1/zz_generated.deepcopy.go +++ b/api/v1alpha1/zz_generated.deepcopy.go @@ -1535,7 +1535,7 @@ func (in *ServerMaintenance) DeepCopyInto(out *ServerMaintenance) { out.TypeMeta = in.TypeMeta in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) in.Spec.DeepCopyInto(&out.Spec) - out.Status = in.Status + in.Status.DeepCopyInto(&out.Status) } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ServerMaintenance. @@ -1556,6 +1556,26 @@ func (in *ServerMaintenance) DeepCopyObject() runtime.Object { return nil } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ServerMaintenanceBootOrder) DeepCopyInto(out *ServerMaintenanceBootOrder) { + *out = *in + if in.DefaultBootOrder != nil { + in, out := &in.DefaultBootOrder, &out.DefaultBootOrder + *out = make([]string, len(*in)) + copy(*out, *in) + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ServerMaintenanceBootOrder. +func (in *ServerMaintenanceBootOrder) DeepCopy() *ServerMaintenanceBootOrder { + if in == nil { + return nil + } + out := new(ServerMaintenanceBootOrder) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *ServerMaintenanceList) DeepCopyInto(out *ServerMaintenanceList) { *out = *in @@ -1636,6 +1656,11 @@ func (in *ServerMaintenanceSpec) DeepCopy() *ServerMaintenanceSpec { // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *ServerMaintenanceStatus) DeepCopyInto(out *ServerMaintenanceStatus) { *out = *in + if in.BootOrderStatus != nil { + in, out := &in.BootOrderStatus, &out.BootOrderStatus + *out = new(ServerMaintenanceBootOrder) + (*in).DeepCopyInto(*out) + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ServerMaintenanceStatus. diff --git a/bmc/bmc.go b/bmc/bmc.go index e7bd1cd0..b602d036 100644 --- a/bmc/bmc.go +++ b/bmc/bmc.go @@ -68,6 +68,9 @@ type BMC interface { // GetBootOrder retrieves the boot order for the system. GetBootOrder(ctx context.Context, systemURI string) ([]string, error) + // GetBootOptions retrieves the boot options for the system. + GetBootOptions(ctx context.Context, systemURI string) ([]*redfish.BootOption, error) + // GetBiosAttributeValues retrieves BIOS attribute values for the system. GetBiosAttributeValues(ctx context.Context, systemURI string, attributes []string) (redfish.SettingsAttributes, error) @@ -99,7 +102,7 @@ type BMC interface { GetBMCVersion(ctx context.Context, UUID string) (string, error) // SetBootOrder sets the boot order for the system. - SetBootOrder(ctx context.Context, systemURI string, order []string) error + SetBootOrder(ctx context.Context, systemURI string, order redfish.Boot) error // GetStorages retrieves storage information for the system. GetStorages(ctx context.Context, systemURI string) ([]Storage, error) diff --git a/bmc/redfish.go b/bmc/redfish.go index 37699ec6..ac8ecb25 100644 --- a/bmc/redfish.go +++ b/bmc/redfish.go @@ -320,6 +320,14 @@ func (r *RedfishBMC) GetBootOrder(ctx context.Context, systemURI string) ([]stri return system.Boot.BootOrder, nil } +func (r *RedfishBMC) GetBootOptions(ctx context.Context, systemURI string) ([]*redfish.BootOption, error) { + system, err := r.getSystemFromUri(ctx, systemURI) + if err != nil { + return nil, err + } + return system.BootOptions() +} + func (r *RedfishBMC) GetBiosVersion(ctx context.Context, systemURI string) (string, error) { system, err := r.getSystemFromUri(ctx, systemURI) if err != nil { @@ -474,21 +482,30 @@ func (r *RedfishBMC) SetBMCAttributesImmediately(ctx context.Context, bmcUUID st } // SetBootOrder sets bios boot order -func (r *RedfishBMC) SetBootOrder(ctx context.Context, systemURI string, bootOrder []string) error { +func (r *RedfishBMC) SetBootOrder(ctx context.Context, systemURI string, bootOrder redfish.Boot) error { system, err := r.getSystemFromUri(ctx, systemURI) if err != nil { return err } - return system.SetBoot( - redfish.Boot{ - BootSourceOverrideEnabled: redfish.ContinuousBootSourceOverrideEnabled, - BootSourceOverrideTarget: redfish.NoneBootSourceOverrideTarget, - BootOrder: bootOrder, - }, - ) + var tSystem struct { + Settings common.Settings `json:"@Redfish.Settings"` + } + // Unfortunately, some vendors (like Dell) support setting through different url. + // Hence we need to set it through this url. + err = json.Unmarshal(system.RawData, &tSystem) + if err != nil { + return err + } + if tSystem.Settings.SettingsObject.String() == "" { + return system.SetBoot(bootOrder) + } + return system.SetBoot(bootOrder) } -func (r *RedfishBMC) getFilteredBiosRegistryAttributes(readOnly bool, immutable bool) (map[string]RegistryEntryAttributes, error) { +func (r *RedfishBMC) getFilteredBiosRegistryAttributes( + readOnly bool, + immutable bool, +) (map[string]RegistryEntryAttributes, error) { registries, err := r.client.Service.Registries() if err != nil { return nil, err diff --git a/cmd/manager/main.go b/cmd/manager/main.go index 0dfba887..70ccdf31 100644 --- a/cmd/manager/main.go +++ b/cmd/manager/main.go @@ -366,6 +366,15 @@ func main() { // nolint: gocyclo if err = (&controller.ServerMaintenanceReconciler{ Client: mgr.GetClient(), Scheme: mgr.GetScheme(), + BMCOptions: bmc.Options{ + BasicAuth: true, + PowerPollingInterval: powerPollingInterval, + PowerPollingTimeout: powerPollingTimeout, + ResourcePollingInterval: resourcePollingInterval, + ResourcePollingTimeout: resourcePollingTimeout, + }, + Insecure: insecure, + ResyncInterval: serverResyncInterval, }).SetupWithManager(mgr); err != nil { setupLog.Error(err, "unable to create controller", "controller", "ServerMaintenance") os.Exit(1) diff --git a/config/crd/bases/metal.ironcore.dev_servermaintenances.yaml b/config/crd/bases/metal.ironcore.dev_servermaintenances.yaml index 730c1943..33051382 100644 --- a/config/crd/bases/metal.ironcore.dev_servermaintenances.yaml +++ b/config/crd/bases/metal.ironcore.dev_servermaintenances.yaml @@ -66,6 +66,20 @@ spec: description: ServerBootConfigurationTemplate specifies the boot configuration to be applied to the server during maintenance. properties: + bootType: + default: Persistent + description: |- + BootType specifies the type of boot configuration. + It can be either "OneOff" or "Persistent". If not specified, it defaults to "Persistent". + "Persistent" will set the configuration reboot the server before setting the server to the desired power state mentioned in spec. + + "OneOff" will set the configuration to boot once. next boot-up will boot into default boot order. + if power state mentioned in spec is "PowerOn", server will be powered on and will boot into the configuration. + if power state is "PowerOff", server will be powered off and will boot into the configuration on next power on. + enum: + - OneOff + - Persistent + type: string name: description: Name specifies the name of the boot configuration. type: string @@ -141,6 +155,23 @@ spec: status: description: ServerMaintenanceStatus defines the observed state of a ServerMaintenance properties: + bootOrderStatus: + description: |- + BootOrderStatus specifies the boot order of the server before maintenance. + is used to store the boot order of server before it is changed for maintenance. + used only with BootType Persistent. + properties: + defaultBootOrder: + description: DefaultBootOrder specifies the default boot order + of the server before maintenance. + items: + type: string + type: array + state: + description: ServerMaintenanceState specifies the current state + of the server maintenance. + type: string + type: object state: description: State specifies the current state of the server maintenance. type: string diff --git a/config/crd/bases/metal.ironcore.dev_servers.yaml b/config/crd/bases/metal.ironcore.dev_servers.yaml index dd699246..5bb78870 100644 --- a/config/crd/bases/metal.ironcore.dev_servers.yaml +++ b/config/crd/bases/metal.ironcore.dev_servers.yaml @@ -200,7 +200,9 @@ spec: type: object x-kubernetes-map-type: atomic bootOrder: - description: BootOrder specifies the boot order of the server. + description: |- + BootOrder specifies the boot order of the server. + Deprecated: currently not supported. items: description: BootOrder represents the boot order of the server. properties: diff --git a/config/samples/metal_v1alpha1_servermaintenance.yaml b/config/samples/metal_v1alpha1_servermaintenance.yaml index 5e440cc1..f19a3b12 100644 --- a/config/samples/metal_v1alpha1_servermaintenance.yaml +++ b/config/samples/metal_v1alpha1_servermaintenance.yaml @@ -5,5 +5,17 @@ metadata: app.kubernetes.io/name: metal-operator app.kubernetes.io/managed-by: kustomize name: servermaintenance-sample + namespace: default spec: - # TODO(user): Add fields here + serverRef: + name: node005-ap046-system-0 + policy: metalv1alpha1.ServerMaintenancePolicyEnforced + serverPower: metalv1alpha1.PowerOn + serverBootConfigurationTemplate: + name: "test-boot" + spec: + ignitionSecretRef: + name: ignition-secret + serverRef: + name: server-system-0 + image: "ghcr.io//foo/linux:C.X" diff --git a/dist/chart/templates/crd/metal.ironcore.dev_servermaintenances.yaml b/dist/chart/templates/crd/metal.ironcore.dev_servermaintenances.yaml index fa0d9b8b..11766888 100755 --- a/dist/chart/templates/crd/metal.ironcore.dev_servermaintenances.yaml +++ b/dist/chart/templates/crd/metal.ironcore.dev_servermaintenances.yaml @@ -72,6 +72,20 @@ spec: description: ServerBootConfigurationTemplate specifies the boot configuration to be applied to the server during maintenance. properties: + bootType: + default: Persistent + description: |- + BootType specifies the type of boot configuration. + It can be either "OneOff" or "Persistent". If not specified, it defaults to "Persistent". + "Persistent" will set the configuration reboot the server before setting the server to the desired power state mentioned in spec. + + "OneOff" will set the configuration to boot once. next boot-up will boot into default boot order. + if power state mentioned in spec is "PowerOn", server will be powered on and will boot into the configuration. + if power state is "PowerOff", server will be powered off and will boot into the configuration on next power on. + enum: + - OneOff + - Persistent + type: string name: description: Name specifies the name of the boot configuration. type: string @@ -147,6 +161,23 @@ spec: status: description: ServerMaintenanceStatus defines the observed state of a ServerMaintenance properties: + bootOrderStatus: + description: |- + BootOrderStatus specifies the boot order of the server before maintenance. + is used to store the boot order of server before it is changed for maintenance. + used only with BootType Persistent. + properties: + defaultBootOrder: + description: DefaultBootOrder specifies the default boot order + of the server before maintenance. + items: + type: string + type: array + state: + description: ServerMaintenanceState specifies the current state + of the server maintenance. + type: string + type: object state: description: State specifies the current state of the server maintenance. type: string diff --git a/dist/chart/templates/crd/metal.ironcore.dev_servers.yaml b/dist/chart/templates/crd/metal.ironcore.dev_servers.yaml index 9e84b616..00d0cf46 100755 --- a/dist/chart/templates/crd/metal.ironcore.dev_servers.yaml +++ b/dist/chart/templates/crd/metal.ironcore.dev_servers.yaml @@ -206,7 +206,9 @@ spec: type: object x-kubernetes-map-type: atomic bootOrder: - description: BootOrder specifies the boot order of the server. + description: |- + BootOrder specifies the boot order of the server. + Deprecated: currently not supported. items: description: BootOrder represents the boot order of the server. properties: diff --git a/docs/api-reference/api.md b/docs/api-reference/api.md index ac1fe38b..1284f4dc 100644 --- a/docs/api-reference/api.md +++ b/docs/api-reference/api.md @@ -2628,6 +2628,28 @@ string +

BootType +(string alias)

+

+(Appears on:ServerBootConfigurationTemplate) +

+
+
+ + + + + + + + + + + + +
ValueDescription

"OneOff"

BootTypeOneOff indicates that the server should boot into this configuration one time.

+

"Persistent"

BootTypePersistent indicates that the server should boot into this configuration continuously.

+

ConsoleProtocol

@@ -3554,7 +3576,8 @@ the boot configuration for this server during maintenance. This field is optiona (Optional) -

BootOrder specifies the boot order of the server.

+

BootOrder specifies the boot order of the server. +Deprecated: currently not supported.

@@ -3833,6 +3856,25 @@ string +bootType
+ + +BootType + + + + +(Optional) +

BootType specifies the type of boot configuration. +It can be either “OneOff” or “Persistent”. If not specified, it defaults to “Persistent”. +“Persistent” will set the configuration reboot the server before setting the server to the desired power state mentioned in spec.

+

“OneOff” will set the configuration to boot once. next boot-up will boot into default boot order. +if power state mentioned in spec is “PowerOn”, server will be powered on and will boot into the configuration. +if power state is “PowerOff”, server will be powered off and will boot into the configuration on next power on.

+ + + + spec
@@ -4248,6 +4290,82 @@ ServerMaintenanceStatus +

ServerMaintenanceBootOrder +

+

+(Appears on:ServerMaintenanceStatus) +

+
+
+ + + + + + + + + + + + + + + + + +
FieldDescription
+defaultBootOrder
+ +[]string + +
+(Optional) +

DefaultBootOrder specifies the default boot order of the server before maintenance.

+
+state
+ + +ServerMaintenanceBootOrderState + + +
+
+

ServerMaintenanceBootOrderState +(string alias)

+

+(Appears on:ServerMaintenanceBootOrder) +

+
+

ServerMaintenanceState specifies the current state of the server maintenance.

+
+ + + + + + + + + + + + + + + + + + + + +
ValueDescription

"InProgress"

BootOrderConfigInProgress specifies that the server bootOrder configuration is InProgress.

+

"NoOp"

BootOrderConfigNoOp specifies that the server bootOrder is as expected.

+

"RevertCompleted"

BootOrderConfigRevertSuccess specifies that the server bootOrder configuration is Completed.

+

"Completed"

BootOrderConfigSuccess specifies that the server bootOrder configuration is Completed.

+

"RevertInProgress"

BootOrderConfigSuccessRevertInProgress specifies that the server bootOrder revert to default is InProgress.

+

"PxeOneOffBootSuccess"

BootOrderOneOffPxeBootSuccess specifies that the server bootOrder configuration is set to boot pxe once.

+

ServerMaintenancePolicy (string alias)

@@ -4400,6 +4518,9 @@ ServerBootConfigurationTemplate

"Pending"

ServerMaintenanceStatePending specifies that the server maintenance is pending.

+

"Preparing"

+

ServerMaintenanceStatePreparing specifies that the server is in maintenance.

+

ServerMaintenanceStatus @@ -4431,6 +4552,22 @@ ServerMaintenanceState

State specifies the current state of the server maintenance.

+ + +bootOrderStatus
+ + +ServerMaintenanceBootOrder + + + + +(Optional) +

BootOrderStatus specifies the boot order of the server before maintenance. +is used to store the boot order of server before it is changed for maintenance. +used only with BootType Persistent.

+ +

ServerPowerState @@ -4648,7 +4785,8 @@ the boot configuration for this server during maintenance. This field is optiona (Optional) -

BootOrder specifies the boot order of the server.

+

BootOrder specifies the boot order of the server. +Deprecated: currently not supported.

diff --git a/internal/controller/biossettings_controller.go b/internal/controller/biossettings_controller.go index 3ada7d3c..790d07cd 100644 --- a/internal/controller/biossettings_controller.go +++ b/internal/controller/biossettings_controller.go @@ -41,6 +41,7 @@ type BiosSettingsReconciler struct { BMCOptions bmc.Options ResyncInterval time.Duration TimeoutExpiry time.Duration + ProbeImage string } const ( @@ -460,7 +461,13 @@ func (r *BiosSettingsReconciler) handleSettingInProgressState( } // check if the maintenance is granted - if ok := r.checkIfMaintenanceGranted(log, biosSettings, server); !ok { + state := r.getMaintenanceStateIfApproved(ctx, log, biosSettings, server) + if state == metalv1alpha1.ServerMaintenanceStateFailed { + log.V(1).Info("server maintenance request failed. Please check the ServerMaintenance object") + err := r.updateBiosSettingsStatus(ctx, log, biosSettings, metalv1alpha1.BIOSSettingsStateFailed, nil) + return ctrl.Result{}, err + } + if state != metalv1alpha1.ServerMaintenanceStateInMaintenance { log.V(1).Info("Waiting for maintenance to be granted before continuing with updating settings") return ctrl.Result{}, nil } @@ -1142,31 +1149,43 @@ func (r *BiosSettingsReconciler) checkForRequiredPowerStatus( return server.Status.PowerState == powerState } -func (r *BiosSettingsReconciler) checkIfMaintenanceGranted( +func (r *BiosSettingsReconciler) getMaintenanceStateIfApproved( + ctx context.Context, log logr.Logger, biosSettings *metalv1alpha1.BIOSSettings, server *metalv1alpha1.Server, -) bool { +) metalv1alpha1.ServerMaintenanceState { if biosSettings.Spec.ServerMaintenanceRef == nil { - return true + return "" } if server.Status.State == metalv1alpha1.ServerStateMaintenance { + serverMaintenence, err := r.getReferredServerMaintenance(ctx, log, biosSettings.Spec.ServerMaintenanceRef) + if err != nil { + log.V(1).Info("Failed to get referred ServerMaintenance", "error", err, "serverMaintenanceRef", biosSettings.Spec.ServerMaintenanceRef) + return "" + } + if serverMaintenence.Status.State != metalv1alpha1.ServerMaintenanceStateInMaintenance { + log.V(1).Info("ServerMaintenance is not in maintenance. waiting...", + "serverMaintenance State", serverMaintenence.Status.State, + "serverMaintenance", serverMaintenence.Name) + return serverMaintenence.Status.State + } if server.Spec.ServerMaintenanceRef == nil || server.Spec.ServerMaintenanceRef.UID != biosSettings.Spec.ServerMaintenanceRef.UID { // server in maintenance for other tasks. or // server maintenance ref is wrong in either server or biosSettings // wait for update on the server obj log.V(1).Info("Server is already in maintenance for other tasks", "Server", server.Name, "serverMaintenanceRef", server.Spec.ServerMaintenanceRef) - return false + return serverMaintenence.Status.State } } else { // we still need to wait for server to enter maintenance // wait for update on the server obj log.V(1).Info("Server not yet in maintenance", "Server", server.Name, "State", server.Status.State, "MaintenanceRef", server.Spec.ServerMaintenanceRef) - return false + return "" } - return true + return metalv1alpha1.ServerMaintenanceStateInMaintenance } func (r *BiosSettingsReconciler) requestMaintenanceOnServer( @@ -1190,6 +1209,13 @@ func (r *BiosSettingsReconciler) requestMaintenanceOnServer( serverMaintenance.Spec.Policy = biosSettings.Spec.ServerMaintenancePolicy serverMaintenance.Spec.ServerPower = metalv1alpha1.PowerOn serverMaintenance.Spec.ServerRef = &corev1.LocalObjectReference{Name: server.Name} + serverMaintenance.Spec.ServerBootConfigurationTemplate = &metalv1alpha1.ServerBootConfigurationTemplate{ + Name: biosSettings.Name, + Spec: metalv1alpha1.ServerBootConfigurationSpec{ + ServerRef: corev1.LocalObjectReference{Name: server.Name}, + Image: r.ProbeImage, + }, + } if serverMaintenance.Status.State != metalv1alpha1.ServerMaintenanceStateInMaintenance && serverMaintenance.Status.State != "" { serverMaintenance.Status.State = "" } diff --git a/internal/controller/biossettings_controller_test.go b/internal/controller/biossettings_controller_test.go index 0656c177..1780702c 100644 --- a/internal/controller/biossettings_controller_test.go +++ b/internal/controller/biossettings_controller_test.go @@ -52,7 +52,7 @@ var _ = Describe("BIOSSettings Controller", func() { server = &metalv1alpha1.Server{ ObjectMeta: metav1.ObjectMeta{ Namespace: ns.Name, - GenerateName: "test-maintenance-", + GenerateName: "test-biossettings-", }, Spec: metalv1alpha1.ServerSpec{ SystemUUID: "38947555-7742-3448-3784-823347823834", @@ -315,6 +315,8 @@ var _ = Describe("BIOSSettings Controller", func() { HaveField("Status.LastAppliedTime", BeNil()), )) + _ = MarkBootConfigReady(ctx, k8sClient, serverMaintenance.Name, serverMaintenance.Namespace) + By("Ensuring that the Server is in correct power state") Eventually(Object(server)).Should( HaveField("Status.PowerState", metalv1alpha1.ServerOnPowerState), @@ -428,6 +430,8 @@ var _ = Describe("BIOSSettings Controller", func() { HaveField("Status.State", metalv1alpha1.BIOSSettingsStateInProgress), ) + _ = MarkBootConfigReady(ctx, k8sClient, serverMaintenance.Name, serverMaintenance.Namespace) + By("Ensuring that the Server is in Maintenance") Eventually(Object(server)).Should( HaveField("Status.State", metalv1alpha1.ServerStateMaintenance), @@ -521,6 +525,8 @@ var _ = Describe("BIOSSettings Controller", func() { }), ) + _ = MarkBootConfigReady(ctx, k8sClient, serverMaintenance.Name, serverMaintenance.Namespace) + By("Ensuring that the Server is in Maintenance") Eventually(Object(server)).Should( HaveField("Status.State", metalv1alpha1.ServerStateMaintenance), @@ -600,10 +606,11 @@ var _ = Describe("BIOSSettings Controller", func() { })).Should(Succeed()) By("Ensuring that the biosSettings resource has setting updated, and moved the state") - Eventually(Object(biosSettings)).Should(SatisfyAny( + Eventually(Object(biosSettings)).Should( HaveField("Status.State", metalv1alpha1.BIOSSettingsStateInProgress), - HaveField("Status.State", metalv1alpha1.BIOSSettingsStateApplied), - )) + ) + + _ = MarkBootConfigReady(ctx, k8sClient, biosSettings.Name, ns.Name) // due to nature of mocking, we cant not determine few steps here. hence need a longer wait time Eventually(Object(biosSettings)).Should(SatisfyAll( HaveField("Status.State", metalv1alpha1.BIOSSettingsStateApplied), @@ -678,6 +685,8 @@ var _ = Describe("BIOSSettings Controller", func() { HaveField("Status.State", metalv1alpha1.BIOSSettingsStateInProgress), )) + _ = MarkBootConfigReady(ctx, k8sClient, biosSettings.Name, ns.Name) + Eventually(Object(biosSettings)).Should(SatisfyAll( HaveField("Status.State", metalv1alpha1.BIOSSettingsStateApplied), HaveField("Status.LastAppliedTime.IsZero()", false), @@ -716,7 +725,7 @@ var _ = Describe("BIOSSettings Sequence Controller", func() { server = &metalv1alpha1.Server{ ObjectMeta: metav1.ObjectMeta{ Namespace: ns.Name, - GenerateName: "test-maintenance-", + GenerateName: "test-biossettings-", }, Spec: metalv1alpha1.ServerSpec{ SystemUUID: "38947555-7742-3448-3784-823347823834", @@ -769,6 +778,8 @@ var _ = Describe("BIOSSettings Sequence Controller", func() { } Expect(k8sClient.Create(ctx, biosSettings)).To(Succeed()) + _ = MarkBootConfigReady(ctx, k8sClient, biosSettings.Name, ns.Name) + By("Ensuring that the BIOSSetting Object has moved to completed") Eventually(Object(biosSettings)).Should(SatisfyAll( HaveField("Status.State", metalv1alpha1.BIOSSettingsStateApplied), @@ -910,6 +921,8 @@ var _ = Describe("BIOSSettings Sequence Controller", func() { } Expect(k8sClient.Create(ctx, biosSettings)).To(Succeed()) + _ = MarkBootConfigReady(ctx, k8sClient, biosSettings.Name, ns.Name) + By("Ensuring that the BIOSSetting Object has moved to completed") Eventually(Object(biosSettings)).Should(SatisfyAll( HaveField("Status.State", metalv1alpha1.BIOSSettingsStateApplied), @@ -928,6 +941,8 @@ var _ = Describe("BIOSSettings Sequence Controller", func() { HaveField("Status.State", metalv1alpha1.BIOSSettingsStateInProgress), ) + _ = MarkBootConfigReady(ctx, k8sClient, biosSettings.Name, ns.Name) + By("Ensuring that the BIOSSetting Object has moved to completed") Eventually(Object(biosSettings)).Should(SatisfyAll( HaveField("Status.State", metalv1alpha1.BIOSSettingsStateApplied), @@ -969,6 +984,8 @@ var _ = Describe("BIOSSettings Sequence Controller", func() { } Expect(k8sClient.Create(ctx, biosSettings)).To(Succeed()) + _ = MarkBootConfigReady(ctx, k8sClient, biosSettings.Name, ns.Name) + Eventually(Object(biosSettings)).WithPolling(1 * time.Microsecond).Should( HaveField("Status.State", metalv1alpha1.BIOSSettingsStateInProgress), ) diff --git a/internal/controller/biossettingsset_controller_test.go b/internal/controller/biossettingsset_controller_test.go index 2edba2d6..dc54e24f 100644 --- a/internal/controller/biossettingsset_controller_test.go +++ b/internal/controller/biossettingsset_controller_test.go @@ -39,7 +39,7 @@ var _ = Describe("BIOSSettingsSet Controller", func() { By("Creating a Server") server01 = &metalv1alpha1.Server{ ObjectMeta: metav1.ObjectMeta{ - GenerateName: "test-maintenance-01-", + GenerateName: "test-biossettingsset-01-", Labels: map[string]string{ "metal.ironcore.dev/Manufacturer": "foo", }, @@ -64,7 +64,7 @@ var _ = Describe("BIOSSettingsSet Controller", func() { By("Creating a second Server") server02 = &metalv1alpha1.Server{ ObjectMeta: metav1.ObjectMeta{ - GenerateName: "test-maintenance-02-", + GenerateName: "test-biossettingsset-02-", Labels: map[string]string{ "metal.ironcore.dev/Manufacturer": "bar", }, @@ -89,7 +89,7 @@ var _ = Describe("BIOSSettingsSet Controller", func() { By("Creating a third Server") server03 = &metalv1alpha1.Server{ ObjectMeta: metav1.ObjectMeta{ - GenerateName: "test-maintenance-03-", + GenerateName: "test-biossettingsset-03-", Labels: map[string]string{ "metal.ironcore.dev/Manufacturer": "bar", }, @@ -149,6 +149,8 @@ var _ = Describe("BIOSSettingsSet Controller", func() { } Eventually(Get(biosSettings02)).Should(Succeed()) + _ = MarkBootConfigReady(ctx, k8sClient, biosSettings02.Name, ns.Name) + By("Checking the biosSettings02 have completed") Eventually(Object(biosSettings02)).Should(SatisfyAll( HaveField("Status.State", metalv1alpha1.BIOSSettingsStateApplied), @@ -171,6 +173,8 @@ var _ = Describe("BIOSSettingsSet Controller", func() { } Eventually(Get(biosSettings03)).Should(Succeed()) + _ = MarkBootConfigReady(ctx, k8sClient, biosSettings03.Name, ns.Name) + By("Checking the biosSettings03 have completed") Eventually(Object(biosSettings03)).Should(SatisfyAll( HaveField("Status.State", metalv1alpha1.BIOSSettingsStateApplied), @@ -245,6 +249,8 @@ var _ = Describe("BIOSSettingsSet Controller", func() { HaveField("Status.FailedBIOSSettings", BeNumerically("==", 0)), )) + _ = MarkBootConfigReady(ctx, k8sClient, biosSettings02.Name, ns.Name) + By("Checking the biosSettings02 have completed") Eventually(Object(biosSettings02)).Should(SatisfyAll( HaveField("Status.State", metalv1alpha1.BIOSSettingsStateApplied), @@ -258,6 +264,8 @@ var _ = Describe("BIOSSettingsSet Controller", func() { })), )) + _ = MarkBootConfigReady(ctx, k8sClient, biosSettings03.Name, ns.Name) + By("Checking the biosSettings03 have completed") Eventually(Object(biosSettings03)).Should(SatisfyAll( HaveField("Status.State", metalv1alpha1.BIOSSettingsStateApplied), @@ -305,6 +313,8 @@ var _ = Describe("BIOSSettingsSet Controller", func() { Eventually(Get(biosSettings02)).Should(Succeed()) Eventually(Get(biosSettings03)).Should(Succeed()) + // we do not mark boot config ready for biosSettings02 once again, as + // as the biosSettings will already be applied on it By("Checking the biosSettings02 have completed") Eventually(Object(biosSettings02)).Should( HaveField("Status.State", metalv1alpha1.BIOSSettingsStateApplied), @@ -341,6 +351,10 @@ var _ = Describe("BIOSSettingsSet Controller", func() { HaveField("Status.FailedBIOSSettings", BeNumerically("==", 0)), )) + // we do not mark boot config ready for biosSettings01, as + // the server in unit testing all point to same mockup server + // and Hence, the biosSettings will already be applied on it when other biosSettings are applied. + By("Checking the biosSettings01 have completed") Eventually(Object(biosSettings01)).Should( HaveField("Status.State", metalv1alpha1.BIOSSettingsStateApplied), diff --git a/internal/controller/biosversion_controller.go b/internal/controller/biosversion_controller.go index b665f31c..0390a7c4 100644 --- a/internal/controller/biosversion_controller.go +++ b/internal/controller/biosversion_controller.go @@ -38,6 +38,7 @@ type BIOSVersionReconciler struct { Scheme *runtime.Scheme BMCOptions bmc.Options ResyncInterval time.Duration + ProbeImage string } const ( @@ -209,6 +210,22 @@ func (r *BIOSVersionReconciler) ensureBiosVersionStateTransition( return false, nil } + serverMaintenence, err := r.getReferredServerMaintenance(ctx, log, biosVersion.Spec.ServerMaintenanceRef) + if err != nil { + return false, fmt.Errorf("failed to get referred serverMaintenance obj from BIOSVersion: %w", err) + } + if serverMaintenence.Status.State == metalv1alpha1.ServerMaintenanceStateFailed { + log.V(1).Info("server maintenance request failed. Please check the ServerMaintenance object") + err := r.updateBiosVersionStatus(ctx, log, biosVersion, metalv1alpha1.BIOSVersionStateFailed, nil, nil, nil) + return false, err + } + if serverMaintenence.Status.State != metalv1alpha1.ServerMaintenanceStateInMaintenance { + log.V(1).Info("ServerMaintenance is not in maintenance. waiting...", + "serverMaintenance State", serverMaintenence.Status.State, + "serverMaintenance", serverMaintenence.Name) + return false, nil + } + if server.Spec.ServerMaintenanceRef == nil || server.Spec.ServerMaintenanceRef.UID != biosVersion.Spec.ServerMaintenanceRef.UID { // server in maintenance for other tasks. or // server maintenance ref is wrong in either server or biosSettings @@ -613,6 +630,13 @@ func (r *BIOSVersionReconciler) requestMaintenanceOnServer( serverMaintenance.Spec.Policy = biosVersion.Spec.ServerMaintenancePolicy serverMaintenance.Spec.ServerPower = metalv1alpha1.PowerOn serverMaintenance.Spec.ServerRef = &corev1.LocalObjectReference{Name: server.Name} + serverMaintenance.Spec.ServerBootConfigurationTemplate = &metalv1alpha1.ServerBootConfigurationTemplate{ + Name: biosVersion.Name, + Spec: metalv1alpha1.ServerBootConfigurationSpec{ + ServerRef: corev1.LocalObjectReference{Name: server.Name}, + Image: r.ProbeImage, + }, + } if serverMaintenance.Status.State != metalv1alpha1.ServerMaintenanceStateInMaintenance && serverMaintenance.Status.State != "" { serverMaintenance.Status.State = "" } diff --git a/internal/controller/biosversion_controller_test.go b/internal/controller/biosversion_controller_test.go index 2ccaef4e..b3331bff 100644 --- a/internal/controller/biosversion_controller_test.go +++ b/internal/controller/biosversion_controller_test.go @@ -189,6 +189,7 @@ var _ = Describe("BIOSVersion Controller", func() { APIVersion: "metal.ironcore.dev/v1alpha1", }), )) + MarkBootConfigReady(ctx, k8sClient, serverMaintenance.Name, serverMaintenance.Namespace) ensureBiosVersionConditionTransisition(acc, biosVersion, server) @@ -297,6 +298,7 @@ var _ = Describe("BIOSVersion Controller", func() { APIVersion: "metal.ironcore.dev/v1alpha1", }), )) + MarkBootConfigReady(ctx, k8sClient, serverMaintenance.Name, serverMaintenance.Namespace) ensureBiosVersionConditionTransisition(acc, biosVersion, server) @@ -356,6 +358,8 @@ var _ = Describe("BIOSVersion Controller", func() { HaveField("Status.State", metalv1alpha1.BIOSVersionStateInProgress), ) + MarkBootConfigReady(ctx, k8sClient, biosVersion.Name, ns.Name) + Eventually(Object(biosVersion)).Should( HaveField("Status.State", metalv1alpha1.BIOSVersionStateCompleted), ) diff --git a/internal/controller/biosversionset_controller_test.go b/internal/controller/biosversionset_controller_test.go index eecb7903..dd5b477e 100644 --- a/internal/controller/biosversionset_controller_test.go +++ b/internal/controller/biosversionset_controller_test.go @@ -165,6 +165,20 @@ var _ = Describe("BIOSVersionSet Controller", func() { HaveField("Status.FailedBIOSVersion", BeNumerically("==", 0)), )) + By("Ensuring that the Maintenance resource has been created") + var serverMaintenanceList metalv1alpha1.ServerMaintenanceList + Eventually(ObjectList(&serverMaintenanceList)).Should(HaveField("Items", Not(BeEmpty()))) + + serverMaintenance02 := &metalv1alpha1.ServerMaintenance{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: ns.Name, + Name: biosVersion02.Name, + }, + } + Eventually(Get(serverMaintenance02)).Should(Succeed()) + + _ = MarkBootConfigReady(ctx, k8sClient, serverMaintenance02.Name, ns.Name) + By("Checking the biosVersion01 have completed") Eventually(Object(biosVersion02)).Should(SatisfyAll( HaveField("Status.State", metalv1alpha1.BIOSVersionStateCompleted), @@ -180,6 +194,16 @@ var _ = Describe("BIOSVersionSet Controller", func() { })), )) + serverMaintenance03 := &metalv1alpha1.ServerMaintenance{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: ns.Name, + Name: biosVersion03.Name, + }, + } + Eventually(Get(serverMaintenance03)).Should(Succeed()) + + _ = MarkBootConfigReady(ctx, k8sClient, serverMaintenance03.Name, ns.Name) + By("Checking the biosVersion02 have completed") Eventually(Object(biosVersion03)).Should(SatisfyAll( HaveField("Status.State", metalv1alpha1.BIOSVersionStateCompleted), @@ -253,7 +277,21 @@ var _ = Describe("BIOSVersionSet Controller", func() { HaveField("Status.FailedBIOSVersion", BeNumerically("==", 0)), )) - By("Checking the biosVersion01 have completed") + By("Ensuring that the Maintenance resource has been created") + var serverMaintenanceList metalv1alpha1.ServerMaintenanceList + Eventually(ObjectList(&serverMaintenanceList)).Should(HaveField("Items", Not(BeEmpty()))) + + serverMaintenance02 := &metalv1alpha1.ServerMaintenance{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: ns.Name, + Name: biosVersion02.Name, + }, + } + Eventually(Get(serverMaintenance02)).Should(Succeed()) + + _ = MarkBootConfigReady(ctx, k8sClient, serverMaintenance02.Name, ns.Name) + + By("Checking the biosVersion02 have completed") Eventually(Object(biosVersion02)).Should(SatisfyAll( HaveField("Status.State", metalv1alpha1.BIOSVersionStateCompleted), HaveField("OwnerReferences", ContainElement(metav1.OwnerReference{ @@ -266,7 +304,17 @@ var _ = Describe("BIOSVersionSet Controller", func() { })), )) - By("Checking the biosVersion02 have completed") + serverMaintenance03 := &metalv1alpha1.ServerMaintenance{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: ns.Name, + Name: biosVersion03.Name, + }, + } + Eventually(Get(serverMaintenance03)).Should(Succeed()) + + _ = MarkBootConfigReady(ctx, k8sClient, serverMaintenance03.Name, ns.Name) + + By("Checking the biosVersion03 have completed") Eventually(Object(biosVersion03)).Should(SatisfyAll( HaveField("Status.State", metalv1alpha1.BIOSVersionStateCompleted), HaveField("OwnerReferences", ContainElement(metav1.OwnerReference{ @@ -312,6 +360,8 @@ var _ = Describe("BIOSVersionSet Controller", func() { Eventually(Get(biosVersion02)).Should(Succeed()) Eventually(Get(biosVersion03)).Should(Succeed()) + // we do not mark boot config ready for biosVersion02 once again, as + // as the biosVersion will already be applied on it By("Checking the biosVersion01 have completed") Eventually(Object(biosVersion02)).Should( HaveField("Status.State", metalv1alpha1.BIOSVersionStateCompleted), @@ -348,6 +398,11 @@ var _ = Describe("BIOSVersionSet Controller", func() { HaveField("Status.FailedBIOSVersion", BeNumerically("==", 0)), )) + // we do not mark boot config ready for biosVersion01, as + // the server in unit testing all point to same mockup server + // and Hence, the biosSettings will already be applied on it when other biosSettings are applied. + // _ = MarkBootConfigReady(ctx, k8sClient, <>.Name, ns.Name) + By("Checking the biosVersion01 have completed") Eventually(Object(biosVersion01)).Should( HaveField("Status.State", metalv1alpha1.BIOSVersionStateCompleted), diff --git a/internal/controller/bmcsettings_controller.go b/internal/controller/bmcsettings_controller.go index ba5b196c..4c3eda0a 100644 --- a/internal/controller/bmcsettings_controller.go +++ b/internal/controller/bmcsettings_controller.go @@ -38,6 +38,7 @@ type BMCSettingsReconciler struct { Insecure bool Scheme *runtime.Scheme BMCOptions bmc.Options + ProbeImage string } const BMCSettingFinalizer = "firmware.ironcore.dev/out-of-band-management" @@ -313,7 +314,12 @@ func (r *BMCSettingsReconciler) handleSettingInProgressState( } // check if the maintenance is granted - if ok := r.checkIfMaintenanceGranted(ctx, log, bmcSetting, bmcClient); !ok { + state := r.getMaintenanceStateIfApproved(ctx, log, bmcSetting, bmcClient) + if state == metalv1alpha1.ServerMaintenanceStateFailed { + err := r.updateBMCSettingsStatus(ctx, log, bmcSetting, metalv1alpha1.BMCSettingsStateFailed) + return ctrl.Result{}, err + } + if state != metalv1alpha1.ServerMaintenanceStateInMaintenance { log.V(1).Info("Waiting for maintenance to be granted before continuing with updating settings") return ctrl.Result{}, err } @@ -490,31 +496,48 @@ func (r *BMCSettingsReconciler) getBMCVersionAndSettingsDifference( return currentBMCVersion, diff, nil } -func (r *BMCSettingsReconciler) checkIfMaintenanceGranted( +func (r *BMCSettingsReconciler) getMaintenanceStateIfApproved( ctx context.Context, log logr.Logger, bmcSetting *metalv1alpha1.BMCSettings, bmcClient bmc.BMC, -) bool { +) metalv1alpha1.ServerMaintenanceState { if bmcSetting.Spec.ServerMaintenanceRefs == nil { - return false + return "" } servers, err := r.getServers(ctx, log, bmcSetting, bmcClient) if err != nil { log.V(1).Error(err, "Failed to get ref. servers to determine maintenance state ") - return false + return "" } if len(bmcSetting.Spec.ServerMaintenanceRefs) != len(servers) { log.V(1).Info("Not all servers have Maintenance", "ServerMaintenanceRefs", bmcSetting.Spec.ServerMaintenanceRefs, "Servers", servers) - return false + return "" } - notInMaintenanceState := make([]string, 0, len(servers)) + notInMaintenanceState := make(map[string]metalv1alpha1.ServerMaintenanceState, len(servers)) for _, server := range servers { if server.Status.State == metalv1alpha1.ServerStateMaintenance { serverMaintenanceRef := r.getServerMaintenanceRefForServer(bmcSetting.Spec.ServerMaintenanceRefs, server.Spec.ServerMaintenanceRef.UID) + serverMaintenance := &metalv1alpha1.ServerMaintenance{} + key := client.ObjectKey{Name: serverMaintenanceRef.Name, Namespace: serverMaintenanceRef.Namespace} + if err := r.Get(ctx, key, serverMaintenance); err != nil { + log.V(1).Error(err, "failed to get referred server's Manitenance", "Server", server.Name, "ServerMaintenanceRef", serverMaintenanceRef) + notInMaintenanceState[server.Name] = "" + } + // fail immediately if any of the server maintenance request failed, as we can not proceed further + if serverMaintenance.Status.State == metalv1alpha1.ServerMaintenanceStateFailed { + // atleast one server maintenance failed + log.V(1).Info("ServerMaintenance request failed", "Server", server.Name, "ServerMaintenance", serverMaintenance.Name) + return metalv1alpha1.ServerMaintenanceStateFailed + } + // this gives us the waiting time for the server to be prepared for maintenance by ServerMaintenance controller + if serverMaintenance.Status.State != metalv1alpha1.ServerMaintenanceStateInMaintenance { + log.V(1).Info("ServerMaintenance not yet in maintenance state", "Server", server.Name, "ServerMaintenance", serverMaintenance.Name, "State", serverMaintenance.Status.State) + notInMaintenanceState[server.Name] = serverMaintenance.Status.State + } if server.Spec.ServerMaintenanceRef == nil || serverMaintenanceRef == nil { // server in maintenance for other tasks. or // server maintenance ref is wrong in either server or bmcSetting @@ -524,13 +547,13 @@ func (r *BMCSettingsReconciler) checkIfMaintenanceGranted( "ServerMaintenanceRef", server.Spec.ServerMaintenanceRef, "BMCSettingMaintenaceRef", serverMaintenanceRef, ) - notInMaintenanceState = append(notInMaintenanceState, server.Name) + notInMaintenanceState[server.Name] = serverMaintenance.Status.State } } else { // we still need to wait for server to enter maintenance // wait for update on the server obj log.V(1).Info("Server not yet in maintenance", "Server", server.Name, "State", server.Status.State, "MaintenanceRef", server.Spec.ServerMaintenanceRef) - notInMaintenanceState = append(notInMaintenanceState, server.Name) + notInMaintenanceState[server.Name] = metalv1alpha1.ServerMaintenanceStatePending } } @@ -538,10 +561,12 @@ func (r *BMCSettingsReconciler) checkIfMaintenanceGranted( log.V(1).Info("Some servers not yet in maintenance", "Required maintenances on servers", bmcSetting.Spec.ServerMaintenanceRefs, "Servers not in maintence", notInMaintenanceState) - return false + // some servers are not yet in maintenance + return metalv1alpha1.ServerMaintenanceStatePending } - return true + // all servers are in maintenance + return metalv1alpha1.ServerMaintenanceStateInMaintenance } func (r *BMCSettingsReconciler) requestMaintenanceOnServers( @@ -605,6 +630,13 @@ func (r *BMCSettingsReconciler) requestMaintenanceOnServers( serverMaintenance.Spec.Policy = bmcSetting.Spec.ServerMaintenancePolicy serverMaintenance.Spec.ServerPower = metalv1alpha1.PowerOn serverMaintenance.Spec.ServerRef = &corev1.LocalObjectReference{Name: server.Name} + serverMaintenance.Spec.ServerBootConfigurationTemplate = &metalv1alpha1.ServerBootConfigurationTemplate{ + Name: bmcSetting.Name, + Spec: metalv1alpha1.ServerBootConfigurationSpec{ + ServerRef: corev1.LocalObjectReference{Name: server.Name}, + Image: r.ProbeImage, + }, + } if serverMaintenance.Status.State != metalv1alpha1.ServerMaintenanceStateInMaintenance && serverMaintenance.Status.State != "" { serverMaintenance.Status.State = "" } diff --git a/internal/controller/bmcsettings_controller_test.go b/internal/controller/bmcsettings_controller_test.go index b045ff0f..76544133 100644 --- a/internal/controller/bmcsettings_controller_test.go +++ b/internal/controller/bmcsettings_controller_test.go @@ -174,6 +174,17 @@ var _ = Describe("BMCSettings Controller", func() { Eventually(Object(bmc)).Should(SatisfyAll( HaveField("Spec.BMCSettingRef", &v1.LocalObjectReference{Name: bmcSettings.Name}), )) + By("Ensuring that the Maintenance resource has been created") + var serverMaintenanceList metalv1alpha1.ServerMaintenanceList + Eventually(ObjectList(&serverMaintenanceList)).Should(HaveField("Items", HaveLen(1))) + + serverMaintenance := &metalv1alpha1.ServerMaintenance{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: ns.Name, + Name: serverMaintenanceList.Items[0].Name, + }, + } + MarkBootConfigReady(ctx, k8sClient, serverMaintenance.Name, serverMaintenance.Namespace) By("Ensuring that the BMCSettings has reached next state") Eventually(Object(bmcSettings)).Should(SatisfyAny( @@ -260,6 +271,8 @@ var _ = Describe("BMCSettings Controller", func() { metautils.SetAnnotation(serverClaim, metalv1alpha1.ServerMaintenanceApprovalKey, "true") })).Should(Succeed()) + MarkBootConfigReady(ctx, k8sClient, serverMaintenance.Name, serverMaintenance.Namespace) + Eventually(Object(bmcSettings)).Should(SatisfyAny( HaveField("Status.State", metalv1alpha1.BMCSettingsStateInProgress), HaveField("Status.State", metalv1alpha1.BMCSettingsStateApplied), @@ -348,6 +361,8 @@ var _ = Describe("BMCSettings Controller", func() { } Eventually(Get(serverMaintenance)).Should(Succeed()) + MarkBootConfigReady(ctx, k8sClient, serverMaintenance.Name, serverMaintenance.Namespace) + By("Ensuring that the BMCSettings resource hasmoved to next state") Eventually(Object(BMCSettings)).Should(SatisfyAny( HaveField("Status.State", metalv1alpha1.BMCSettingsStateInProgress), @@ -419,6 +434,18 @@ var _ = Describe("BMCSettings Controller", func() { HaveField("Status.State", metalv1alpha1.BMCSettingsStateInProgress), ) + By("Ensuring that the Maintenance resource has been created") + var serverMaintenanceList metalv1alpha1.ServerMaintenanceList + Eventually(ObjectList(&serverMaintenanceList)).Should(HaveField("Items", HaveLen(1))) + + serverMaintenance := &metalv1alpha1.ServerMaintenance{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: ns.Name, + Name: serverMaintenanceList.Items[0].Name, + }, + } + Eventually(Get(serverMaintenance)).Should(Succeed()) + MarkBootConfigReady(ctx, k8sClient, serverMaintenance.Name, serverMaintenance.Namespace) Eventually(Object(BMCSettings)).Should( HaveField("Status.State", metalv1alpha1.BMCSettingsStateApplied), ) diff --git a/internal/controller/bmcversion_controller.go b/internal/controller/bmcversion_controller.go index 30d79eaa..1681cd27 100644 --- a/internal/controller/bmcversion_controller.go +++ b/internal/controller/bmcversion_controller.go @@ -38,6 +38,7 @@ type BMCVersionReconciler struct { Scheme *runtime.Scheme BMCOptions bmc.Options ResyncInterval time.Duration + ProbeImage string } const ( @@ -242,7 +243,12 @@ func (r *BMCVersionReconciler) ensureBMCVersionStateTransition( } // check if the maintenance is granted - if ok := r.checkIfMaintenanceGranted(ctx, log, bmcClient, bmcVersion); !ok { + state := r.getMaintenanceStateIfApproved(ctx, log, bmcClient, bmcVersion) + if state == metalv1alpha1.ServerMaintenanceStateFailed { + err := r.updateBMCVersionStatus(ctx, log, bmcVersion, metalv1alpha1.BMCVersionStateFailed, nil, nil, nil) + return ctrl.Result{}, err + } + if state != metalv1alpha1.ServerMaintenanceStateInMaintenance { log.V(1).Info("Waiting for maintenance to be granted before continuing with updating bmc version") return ctrl.Result{}, err } @@ -408,33 +414,49 @@ func (r *BMCVersionReconciler) getBMCVersionFromBMC( return currentBMCVersion, nil } -func (r *BMCVersionReconciler) checkIfMaintenanceGranted( +func (r *BMCVersionReconciler) getMaintenanceStateIfApproved( ctx context.Context, log logr.Logger, bmcClient bmc.BMC, bmcVersion *metalv1alpha1.BMCVersion, -) bool { +) metalv1alpha1.ServerMaintenanceState { // todo length if bmcVersion.Spec.ServerMaintenanceRefs == nil { - return true + return "" } servers, err := r.getServers(ctx, log, bmcClient, bmcVersion) if err != nil { log.V(1).Error(err, "Failed to get ref. servers to determine maintenance state ") - return false + return "" } if len(bmcVersion.Spec.ServerMaintenanceRefs) != len(servers) { log.V(1).Info("Not all servers have Maintenance", "ServerMaintenanceRefs", bmcVersion.Spec.ServerMaintenanceRefs, "Servers", servers) - return false + return "" } - notInMaintenanceState := make(map[string]bool, len(servers)) + notInMaintenanceState := make(map[string]metalv1alpha1.ServerMaintenanceState, len(servers)) for _, server := range servers { if server.Status.State == metalv1alpha1.ServerStateMaintenance { serverMaintenanceRef, ok := r.getServerMaintenanceRefForServer(bmcVersion.Spec.ServerMaintenanceRefs, server.Spec.ServerMaintenanceRef.UID) + serverMaintenance := &metalv1alpha1.ServerMaintenance{} + key := client.ObjectKey{Name: serverMaintenanceRef.Name, Namespace: serverMaintenanceRef.Namespace} + if err := r.Get(ctx, key, serverMaintenance); err != nil { + log.V(1).Error(err, "failed to get referred server's Manitenance", "Server", server.Name, "ServerMaintenanceRef", serverMaintenanceRef) + notInMaintenanceState[server.Name] = "" + } + if serverMaintenance.Status.State == metalv1alpha1.ServerMaintenanceStateFailed { + // fail immediately if any of the server maintenance request failed, as we can not proceed further + log.V(1).Info("ServerMaintenance in failed state", "Server", server.Name, "ServerMaintenance", serverMaintenance.Name) + return metalv1alpha1.ServerMaintenanceStateFailed + } + // this gives us the waiting time for the server to be prepared for maintenance by ServerMaintenance controller + if serverMaintenance.Status.State != metalv1alpha1.ServerMaintenanceStateInMaintenance { + log.V(1).Info("ServerMaintenance not yet in maintenance state", "Server", server.Name, "ServerMaintenance", serverMaintenance.Name, "State", serverMaintenance.Status.State) + notInMaintenanceState[server.Name] = serverMaintenance.Status.State + } if server.Spec.ServerMaintenanceRef == nil || !ok || server.Spec.ServerMaintenanceRef.UID != serverMaintenanceRef.UID { // server in maintenance for other tasks. or // server maintenance ref is wrong in either server or bmcVersion @@ -444,22 +466,23 @@ func (r *BMCVersionReconciler) checkIfMaintenanceGranted( "ServerMaintenanceRef", server.Spec.ServerMaintenanceRef, "BMCVersionMaintenaceRef", serverMaintenanceRef, ) - notInMaintenanceState[server.Name] = false + notInMaintenanceState[server.Name] = serverMaintenance.Status.State } } else { // we still need to wait for server to enter maintenance // wait for update on the server obj log.V(1).Info("Server not yet in maintenance", "Server", server.Name, "State", server.Status.State, "MaintenanceRef", server.Spec.ServerMaintenanceRef) - notInMaintenanceState[server.Name] = false + notInMaintenanceState[server.Name] = metalv1alpha1.ServerMaintenanceStatePending } } if len(notInMaintenanceState) > 0 { log.V(1).Info("Some servers not yet in maintenance", "req maintenances on servers", bmcVersion.Spec.ServerMaintenanceRefs) - return false + // waiting for all servers to be in maintenance + return metalv1alpha1.ServerMaintenanceStatePending } - - return true + // all servers in maintenance + return metalv1alpha1.ServerMaintenanceStateInMaintenance } func (r *BMCVersionReconciler) checkVersionAndTransistionState( @@ -764,6 +787,13 @@ func (r *BMCVersionReconciler) requestMaintenanceOnServers( serverMaintenance.Spec.Policy = bmcVersion.Spec.ServerMaintenancePolicy serverMaintenance.Spec.ServerPower = metalv1alpha1.PowerOn serverMaintenance.Spec.ServerRef = &corev1.LocalObjectReference{Name: server.Name} + serverMaintenance.Spec.ServerBootConfigurationTemplate = &metalv1alpha1.ServerBootConfigurationTemplate{ + Name: bmcVersion.Name, + Spec: metalv1alpha1.ServerBootConfigurationSpec{ + ServerRef: corev1.LocalObjectReference{Name: server.Name}, + Image: r.ProbeImage, + }, + } if serverMaintenance.Status.State != metalv1alpha1.ServerMaintenanceStateInMaintenance && serverMaintenance.Status.State != "" { serverMaintenance.Status.State = "" } diff --git a/internal/controller/bmcversion_controller_test.go b/internal/controller/bmcversion_controller_test.go index 52c264bd..7928958a 100644 --- a/internal/controller/bmcversion_controller_test.go +++ b/internal/controller/bmcversion_controller_test.go @@ -203,6 +203,7 @@ var _ = Describe("BMCVersion Controller", func() { }), )) + MarkBootConfigReady(ctx, k8sClient, serverMaintenance.Name, serverMaintenance.Namespace) ensureBMCVersionConditionTransisition(ctx, acc, bmcVersion) By("Ensuring that BMC upgrade has completed") @@ -306,6 +307,8 @@ var _ = Describe("BMCVersion Controller", func() { }), )) + MarkBootConfigReady(ctx, k8sClient, serverMaintenance.Name, serverMaintenance.Namespace) + ensureBMCVersionConditionTransisition(ctx, acc, bmcVersion) By("Ensuring that BMC upgrade has completed") @@ -363,6 +366,17 @@ var _ = Describe("BMCVersion Controller", func() { Eventually(Object(bmcVersion)).Should( HaveField("Status.State", metalv1alpha1.BMCVersionStateInProgress), ) + By("Ensuring that the Maintenance resource has been created") + var serverMaintenanceList metalv1alpha1.ServerMaintenanceList + Eventually(ObjectList(&serverMaintenanceList)).Should(HaveField("Items", HaveLen(1))) + + serverMaintenance := &metalv1alpha1.ServerMaintenance{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: ns.Name, + Name: serverMaintenanceList.Items[0].Name, + }, + } + MarkBootConfigReady(ctx, k8sClient, serverMaintenance.Name, serverMaintenance.Namespace) Eventually(Object(bmcVersion)).Should( HaveField("Status.State", metalv1alpha1.BMCVersionStateCompleted), diff --git a/internal/controller/bmcversionset_controller.go b/internal/controller/bmcversionset_controller.go index f6754804..89a64113 100644 --- a/internal/controller/bmcversionset_controller.go +++ b/internal/controller/bmcversionset_controller.go @@ -302,7 +302,18 @@ func (r *BMCVersionSetReconciler) patchBMCVersionfromTemplate( continue } opResult, err := controllerutil.CreateOrPatch(ctx, r.Client, &bmcVersion, func() error { - bmcVersion.Spec.BMCVersionTemplate = *bmcVersionTemplate.DeepCopy() + + if bmcVersionTemplate.ServerMaintenanceRefs != nil && + len(bmcVersion.Spec.ServerMaintenanceRefs) == len(bmcVersionTemplate.ServerMaintenanceRefs) { + // find a better solution to handle corner cases + // like the spec provided is equal length but has wrong value. + // this will make bmcVersion create new serverMaintenance and hence this patch will be wrong + bmcVersion.Spec.BMCVersionTemplate = *bmcVersionTemplate.DeepCopy() + } else { + serverMaintenanceRefs := bmcVersion.Spec.ServerMaintenanceRefs + bmcVersion.Spec.BMCVersionTemplate = *bmcVersionTemplate.DeepCopy() + bmcVersion.Spec.ServerMaintenanceRefs = serverMaintenanceRefs + } return nil }) //nolint:errcheck if err != nil { diff --git a/internal/controller/bmcversionset_controller_test.go b/internal/controller/bmcversionset_controller_test.go index 8cbb789e..298f105e 100644 --- a/internal/controller/bmcversionset_controller_test.go +++ b/internal/controller/bmcversionset_controller_test.go @@ -170,6 +170,16 @@ var _ = Describe("BMCVersionSet Controller", func() { HaveField("Status.FailedBMCVersion", BeNumerically("==", 0)), )) + By("Ensuring that the BootConfig resource has been created/ marked ready") + var serverMaintenanceList metalv1alpha1.ServerMaintenanceList + Eventually(ObjectList(&serverMaintenanceList)).Should(HaveField("Items", HaveLen(2))) + for _, serverMaintaince := range serverMaintenanceList.Items { + if metav1.IsControlledBy(&serverMaintaince, bmcVersion02) || metav1.IsControlledBy(&serverMaintaince, bmcVersion03) { + By(fmt.Sprintf("Marking the maintenance %v ", serverMaintaince.Name)) + _ = MarkBootConfigReady(ctx, k8sClient, serverMaintaince.Name, ns.Name) + } + } + By("Checking the bmcVersion02 have completed") Eventually(Object(bmcVersion02)).Should(SatisfyAll( HaveField("Status.State", metalv1alpha1.BMCVersionStateCompleted), @@ -262,6 +272,16 @@ var _ = Describe("BMCVersionSet Controller", func() { HaveField("Status.FailedBMCVersion", BeNumerically("==", 0)), )) + By("Ensuring that the BootConfig resource has been created/ marked ready") + var serverMaintenanceList metalv1alpha1.ServerMaintenanceList + Eventually(ObjectList(&serverMaintenanceList)).Should(HaveField("Items", HaveLen(2))) + for _, serverMaintaince := range serverMaintenanceList.Items { + if metav1.IsControlledBy(&serverMaintaince, bmcVersion02) || metav1.IsControlledBy(&serverMaintaince, bmcVersion03) { + By(fmt.Sprintf("Marking the maintenance %v ", serverMaintaince.Name)) + _ = MarkBootConfigReady(ctx, k8sClient, serverMaintaince.Name, ns.Name) + } + } + By("Checking the bmcVersion02 have completed") Eventually(Object(bmcVersion02)).Should(SatisfyAll( HaveField("Status.State", metalv1alpha1.BMCVersionStateCompleted), diff --git a/internal/controller/common_test_helpers.go b/internal/controller/common_test_helpers.go index 88ae6e3e..c149adf9 100644 --- a/internal/controller/common_test_helpers.go +++ b/internal/controller/common_test_helpers.go @@ -17,6 +17,7 @@ import ( v1 "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" "sigs.k8s.io/controller-runtime/pkg/client" ) @@ -62,25 +63,7 @@ func TransitionServerFromInitialToAvailableState(ctx context.Context, k8sClient // need long time to create boot config // as we go through multiple reconcile before creating the boot config By("Ensuring the boot configuration has been created") - bootConfig := &metalv1alpha1.ServerBootConfiguration{ - ObjectMeta: metav1.ObjectMeta{ - Namespace: BootConfigNameSpace, - Name: server.Name, - }, - } - Eventually(Get(bootConfig)).Should( - Succeed(), - fmt.Sprintf("Expected to get the bootConfig %v, created by Server %v", bootConfig, server.Name), - ) - Eventually(Object(bootConfig)).Should( - HaveField("Status.State", metalv1alpha1.ServerBootConfigurationStatePending), - "Expected to get the bootConfig to reach pending state") - - By("Patching the boot configuration to a Ready state") - Eventually(UpdateStatus(bootConfig, func() { - bootConfig.Status.State = metalv1alpha1.ServerBootConfigurationStateReady - })).Should(Succeed(), fmt.Sprintf("Unable to set the bootconfig %v to Ready State", bootConfig)) - + bootConfig := MarkBootConfigReady(ctx, k8sClient, server.Name, BootConfigNameSpace) Eventually(Object(bootConfig)).Should(SatisfyAll( HaveField("Status.State", metalv1alpha1.ServerBootConfigurationStateReady), HaveField("Spec.IgnitionSecretRef", Not(BeNil())), @@ -246,3 +229,22 @@ func CreateServerClaim( } return serverClaim } + +func MarkBootConfigReady(ctx context.Context, k8sClient client.Client, name, namespace string) *metalv1alpha1.ServerBootConfiguration { + bootConfig := &metalv1alpha1.ServerBootConfiguration{} + Eventually(k8sClient.Get).WithArguments(ctx, types.NamespacedName{ + Name: name, + Namespace: namespace, + }, bootConfig).Should(Succeed()) + + Eventually(Object(bootConfig)).Should( + HaveField("Status.State", metalv1alpha1.ServerBootConfigurationStatePending), + "Expected to get the bootConfig to reach pending state", + ) + + By("Patching the boot configuration to a Ready state") + Eventually(UpdateStatus(bootConfig, func() { + bootConfig.Status.State = metalv1alpha1.ServerBootConfigurationStateReady + })).Should(Succeed()) + return bootConfig +} diff --git a/internal/controller/helper.go b/internal/controller/helper.go index 5f23bb82..5d42ca10 100644 --- a/internal/controller/helper.go +++ b/internal/controller/helper.go @@ -8,8 +8,10 @@ import ( "fmt" "math/big" "reflect" + "slices" metalv1alpha1 "github.com/ironcore-dev/metal-operator/api/v1alpha1" + "github.com/stmcginnis/gofish/redfish" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/event" ) @@ -74,6 +76,44 @@ func GenerateRandomPassword(length int) ([]byte, error) { return result, nil } +func rearrangeBootOrder( + currentOrder []string, + currentBootOptions []*redfish.BootOption, + highPriorityBootOrderFunc func(*redfish.BootOption) bool, + removeBootOrderFunc func(*redfish.BootOption) bool, +) []string { + var highPrioirtyBootDevices []*redfish.BootOption + var removedBootOption []*redfish.BootOption + for _, bootOption := range currentBootOptions { + if removeBootOrderFunc(bootOption) { + removedBootOption = append(removedBootOption, bootOption) + } + if highPriorityBootOrderFunc(bootOption) { + highPrioirtyBootDevices = append(highPrioirtyBootDevices, bootOption) + } + } + + newBootOrder := make([]string, 0, len(currentOrder)) + // Add high priority boot devices first + for _, bootDevice := range highPrioirtyBootDevices { + newBootOrder = append(newBootOrder, bootDevice.BootOptionReference) + } + + for _, bootOrder := range currentOrder { + if slices.Contains(newBootOrder, bootOrder) { + continue + } + if slices.ContainsFunc(removedBootOption, func(removed *redfish.BootOption) bool { + return removed.BootOptionReference == bootOrder + }) { + continue + } + newBootOrder = append(newBootOrder, bootOrder) + } + + return newBootOrder +} + func enqueFromChildObjUpdatesExceptAnnotation(e event.UpdateEvent) bool { isNil := func(arg any) bool { if v := reflect.ValueOf(arg); !v.IsValid() || ((v.Kind() == reflect.Ptr || diff --git a/internal/controller/server_controller.go b/internal/controller/server_controller.go index ef48ae25..68f8b02d 100644 --- a/internal/controller/server_controller.go +++ b/internal/controller/server_controller.go @@ -12,7 +12,6 @@ import ( "fmt" "io" "net/http" - "sort" "strings" "time" @@ -199,6 +198,14 @@ func (r *ServerReconciler) reconcile(ctx context.Context, log logr.Logger, serve return ctrl.Result{}, err } + // temp until we remove the unused field: handle deprecated BootOrder field + if len(server.Spec.BootOrder) > 0 { + serverBase := server.DeepCopy() + server.Spec.BootOrder = nil + err := r.Patch(ctx, server, client.MergeFrom(serverBase)) + return ctrl.Result{}, err + } + if modified, err := r.handleAnnotionOperations(ctx, log, bmcClient, server); err != nil || modified { return ctrl.Result{}, err } @@ -234,11 +241,6 @@ func (r *ServerReconciler) reconcile(ctx context.Context, log logr.Logger, serve } log.V(1).Info("Updated Server status") - if err := r.applyBootOrder(ctx, log, bmcClient, server); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to update server bios boot order: %w", err) - } - log.V(1).Info("Updated Server BIOS boot order") - requeue, err := r.ensureServerStateTransition(ctx, log, bmcClient, server) if requeue && err == nil { // we need to update the ServerStatus after state transition to make sure it reflects the changes done @@ -1018,34 +1020,6 @@ func (r *ServerReconciler) invalidateRegistryEntryForServer(log logr.Logger, ser return nil } -func (r *ServerReconciler) applyBootOrder(ctx context.Context, log logr.Logger, bmcClient bmc.BMC, server *metalv1alpha1.Server) error { - if server.Spec.BMCRef == nil && server.Spec.BMC == nil { - log.V(1).Info("Server has no BMC connection configured") - return nil - } - - order, err := bmcClient.GetBootOrder(ctx, server.Spec.SystemURI) - if err != nil { - return fmt.Errorf("failed to create BMC client: %w", err) - } - - sort.Slice(server.Spec.BootOrder, func(i, j int) bool { - return server.Spec.BootOrder[i].Priority < server.Spec.BootOrder[j].Priority - }) - newOrder := []string{} - change := false - for i, boot := range server.Spec.BootOrder { - newOrder = append(newOrder, boot.Device) - if order[i] != boot.Device { - change = true - } - } - if change { - return bmcClient.SetBootOrder(ctx, server.Spec.SystemURI, newOrder) - } - return nil -} - func (r *ServerReconciler) handleAnnotionOperations(ctx context.Context, log logr.Logger, bmcClient bmc.BMC, server *metalv1alpha1.Server) (bool, error) { annotations := server.GetAnnotations() operation, ok := annotations[metalv1alpha1.OperationAnnotation] diff --git a/internal/controller/servermaintenance_controller.go b/internal/controller/servermaintenance_controller.go index d6a44344..8c4d6b9e 100644 --- a/internal/controller/servermaintenance_controller.go +++ b/internal/controller/servermaintenance_controller.go @@ -6,11 +6,18 @@ package controller import ( "context" "fmt" + "reflect" + "slices" + "strings" + "time" "github.com/go-logr/logr" "github.com/ironcore-dev/controller-utils/clientutils" "github.com/ironcore-dev/controller-utils/metautils" metalv1alpha1 "github.com/ironcore-dev/metal-operator/api/v1alpha1" + "github.com/ironcore-dev/metal-operator/bmc" + "github.com/ironcore-dev/metal-operator/internal/bmcutils" + "github.com/stmcginnis/gofish/redfish" v1 "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -31,7 +38,15 @@ const ( // ServerMaintenanceReconciler reconciles a ServerMaintenance object type ServerMaintenanceReconciler struct { client.Client - Scheme *runtime.Scheme + Scheme *runtime.Scheme + Insecure bool + ResyncInterval time.Duration + BMCOptions bmc.Options +} + +type bootConfigChangedStatus struct { + changed bool + bootConfig *metalv1alpha1.ServerBootConfiguration } // +kubebuilder:rbac:groups=metal.ironcore.dev,resources=servermaintenances,verbs=get;list;watch;create;update;patch;delete @@ -82,9 +97,8 @@ func (r *ServerMaintenanceReconciler) reconcile(ctx context.Context, log logr.Lo } // set the servermaintenance state to pending if it is not set if serverMaintenance.Status.State == "" { - if modified, err := r.patchMaintenanceState(ctx, serverMaintenance, metalv1alpha1.ServerMaintenanceStatePending); err != nil || modified { - return ctrl.Result{}, err - } + err := r.patchMaintenanceState(ctx, serverMaintenance, metalv1alpha1.ServerMaintenanceStatePending) + return ctrl.Result{}, err } return r.ensureServerMaintenanceStateTransition(ctx, log, serverMaintenance) } @@ -93,8 +107,22 @@ func (r *ServerMaintenanceReconciler) ensureServerMaintenanceStateTransition(ctx switch serverMaintenance.Status.State { case metalv1alpha1.ServerMaintenanceStatePending: return r.handlePendingState(ctx, log, serverMaintenance) + case metalv1alpha1.ServerMaintenanceStatePreparing: + return r.handlePrepareMaintenanceState(ctx, log, serverMaintenance) case metalv1alpha1.ServerMaintenanceStateInMaintenance: - return r.handleInMaintenanceState(ctx, log, serverMaintenance) + log.V(1).Info("Server is under maintenance", "ServerMaintenance", serverMaintenance.Name) + server, err := r.getServerRef(ctx, serverMaintenance) + if err != nil { + return ctrl.Result{}, err + } + + if configStatus, err := r.isBootConfigurationChanged(ctx, log, serverMaintenance); err != nil { + return ctrl.Result{}, err + } else if configStatus.changed { + return r.moveToPrepareMaintenanceState(ctx, log, serverMaintenance, server) + } + err = r.setAndPatchServerPowerState(ctx, log, server, serverMaintenance.Spec.ServerPower) + return ctrl.Result{}, err case metalv1alpha1.ServerMaintenanceStateFailed: return r.handleFailedState(log, serverMaintenance) } @@ -114,13 +142,7 @@ func (r *ServerMaintenanceReconciler) handlePendingState(ctx context.Context, lo } if server.Spec.ServerClaimRef == nil { log.V(1).Info("Server has no claim, move to maintenance right away", "Server", server.Name) - if err = r.updateServerRef(ctx, log, serverMaintenance, server); err != nil { - log.Error(err, "failed to patch server maintenance ref") - return ctrl.Result{}, err - } - if modified, err := r.patchMaintenanceState(ctx, serverMaintenance, metalv1alpha1.ServerMaintenanceStateInMaintenance); err != nil || modified { - return ctrl.Result{}, err - } + return r.moveToPrepareMaintenanceState(ctx, log, serverMaintenance, server) } serverClaim := &metalv1alpha1.ServerClaim{} if err := r.Get(ctx, @@ -150,59 +172,89 @@ func (r *ServerMaintenanceReconciler) handlePendingState(ctx context.Context, lo return ctrl.Result{}, nil } log.V(1).Info("Server approved for maintenance", "Server", server.Name, "Maintenance", serverMaintenance.Name) - if err = r.updateServerRef(ctx, log, serverMaintenance, server); err != nil { - log.Error(err, "failed to patch server maintenance ref") - return ctrl.Result{}, err - } - if modified, err := r.patchMaintenanceState(ctx, serverMaintenance, metalv1alpha1.ServerMaintenanceStateInMaintenance); err != nil || modified { - return ctrl.Result{}, err - } + return r.moveToPrepareMaintenanceState(ctx, log, serverMaintenance, server) } if serverMaintenance.Spec.Policy == metalv1alpha1.ServerMaintenancePolicyEnforced { log.V(1).Info("Enforcing maintenance", "Server", server.Name, "Maintenance", serverMaintenance.Name) - if err = r.updateServerRef(ctx, log, serverMaintenance, server); err != nil { - log.Error(err, "failed to patch server maintenance ref") - return ctrl.Result{}, err - } - if modified, err := r.patchMaintenanceState(ctx, serverMaintenance, metalv1alpha1.ServerMaintenanceStateInMaintenance); err != nil || modified { - return ctrl.Result{}, err - } + return r.moveToPrepareMaintenanceState(ctx, log, serverMaintenance, server) } return ctrl.Result{}, nil } -func (r *ServerMaintenanceReconciler) handleInMaintenanceState(ctx context.Context, log logr.Logger, serverMaintenance *metalv1alpha1.ServerMaintenance) (ctrl.Result, error) { +func (r *ServerMaintenanceReconciler) handlePrepareMaintenanceState(ctx context.Context, log logr.Logger, serverMaintenance *metalv1alpha1.ServerMaintenance) (ctrl.Result, error) { server, err := r.getServerRef(ctx, serverMaintenance) if err != nil { return ctrl.Result{}, err } - config, err := r.applyServerBootConfiguration(ctx, log, serverMaintenance, server) + + // rest of the operation can be done only if the server is in maintenance state + // else, we will hit conflict with ServerClaim controller which will still be asserting Specs on Serevr + if server.Status.State != metalv1alpha1.ServerStateMaintenance { + log.V(1).Info("Waiting for server to be in maintenance state", "Server", server.Name, "CurrentState", server.Status.State) + return ctrl.Result{}, nil + } + + var config *metalv1alpha1.ServerBootConfiguration + configState, err := r.isBootConfigurationChanged(ctx, log, serverMaintenance) if err != nil { return ctrl.Result{}, err } - if config == nil { - if err := r.setAndPatchServerPowerState(ctx, log, server, serverMaintenance); err != nil { + if configState.changed || configState.bootConfig == nil { + // turn server power off before applying new boot configuration + // this will help schedule the job to change the boot order for the server + // note: if the server is already off, this will be a no-op + if server.Spec.Power != metalv1alpha1.PowerOff && server.Status.PowerState != metalv1alpha1.ServerOffPowerState { + if err := r.setAndPatchServerPowerState(ctx, log, server, metalv1alpha1.PowerOff); err != nil { + return ctrl.Result{}, err + } + // requeue to give time to the server to power off + return ctrl.Result{RequeueAfter: r.ResyncInterval}, nil + } + // if no config found or if it has changed, we need to create or path it + config, err = r.applyServerBootConfiguration(ctx, log, serverMaintenance, server) + if err != nil { return ctrl.Result{}, err } - return ctrl.Result{}, nil + } else { + config = configState.bootConfig } - if config.Status.State == metalv1alpha1.ServerBootConfigurationStatePending || config.Status.State == "" { - log.V(1).Info("Server boot configuration is pending", "Server", server.Name) - return ctrl.Result{}, nil + + // Note: this is to check skip of BootConfiguration + // implication of skipping is that the server will still continue to run the OS during Maintenance + // might be subjected to reboots + val, found := serverMaintenance.GetAnnotations()[metalv1alpha1.OperationAnnotation] + if config == nil && (!found || val != metalv1alpha1.SkipBootConfiguration) { + log.V(1).Info("No ServerBootConfigurationTemplate boot configuration", "Server", server.Name) + err = r.patchMaintenanceState(ctx, serverMaintenance, metalv1alpha1.ServerMaintenanceStateFailed) + return ctrl.Result{}, err } - if config.Status.State == metalv1alpha1.ServerBootConfigurationStateError { - if modified, err := r.patchMaintenanceState(ctx, serverMaintenance, metalv1alpha1.ServerMaintenanceStateFailed); err != nil || modified { + + if config != nil { + if config.Status.State == metalv1alpha1.ServerBootConfigurationStatePending || config.Status.State == "" { + log.V(1).Info("Server boot configuration is pending", "Server", server.Name) + return ctrl.Result{}, nil + } + if config.Status.State == metalv1alpha1.ServerBootConfigurationStateError { + err = r.patchMaintenanceState(ctx, serverMaintenance, metalv1alpha1.ServerMaintenanceStateFailed) return ctrl.Result{}, err } - return ctrl.Result{}, nil + if config.Status.State == metalv1alpha1.ServerBootConfigurationStateReady { + log.V(1).Info("Server maintenance boot configuration is ready", "Server", server.Name) + // now we change the boot order to the server and power it on to complete the configuration + if requeue, err := r.configureBootOrder(ctx, log, server, serverMaintenance); err != nil { + return ctrl.Result{}, err + } else if requeue { + return ctrl.Result{RequeueAfter: r.ResyncInterval}, nil + } + } } - if config.Status.State == metalv1alpha1.ServerBootConfigurationStateReady { - log.V(1).Info("Server maintenance boot configuration is ready", "Server", server.Name) - if err := r.setAndPatchServerPowerState(ctx, log, server, serverMaintenance); err != nil { + if server.Spec.Power != serverMaintenance.Spec.ServerPower { + if err := r.setAndPatchServerPowerState(ctx, log, server, serverMaintenance.Spec.ServerPower); err != nil { return ctrl.Result{}, err } } - return ctrl.Result{}, nil + err = r.patchMaintenanceState(ctx, serverMaintenance, metalv1alpha1.ServerMaintenanceStateInMaintenance) + return ctrl.Result{}, err } func (r *ServerMaintenanceReconciler) applyServerBootConfiguration(ctx context.Context, log logr.Logger, maintenance *metalv1alpha1.ServerMaintenance, server *metalv1alpha1.Server) (*metalv1alpha1.ServerBootConfiguration, error) { @@ -220,6 +272,7 @@ func (r *ServerMaintenanceReconciler) applyServerBootConfiguration(ctx context.C } opResult, err := controllerutil.CreateOrPatch(ctx, r.Client, config, func() error { config.Spec = maintenance.Spec.ServerBootConfigurationTemplate.Spec + config.Status.State = "" return controllerutil.SetControllerReference(maintenance, config, r.Scheme) }) if err != nil { @@ -227,7 +280,7 @@ func (r *ServerMaintenanceReconciler) applyServerBootConfiguration(ctx context.C } log.V(1).Info("Created or patched Config", "Config", config.Name, "Operation", opResult) serverBase := server.DeepCopy() - server.Spec.MaintenanceBootConfigurationRef = &v1.ObjectReference{ + server.Spec.BootConfigurationRef = &v1.ObjectReference{ Namespace: config.Namespace, Name: config.Name, UID: config.UID, @@ -240,9 +293,12 @@ func (r *ServerMaintenanceReconciler) applyServerBootConfiguration(ctx context.C return config, nil } -func (r *ServerMaintenanceReconciler) setAndPatchServerPowerState(ctx context.Context, log logr.Logger, server *metalv1alpha1.Server, maintenance *metalv1alpha1.ServerMaintenance) error { +func (r *ServerMaintenanceReconciler) setAndPatchServerPowerState(ctx context.Context, log logr.Logger, server *metalv1alpha1.Server, power metalv1alpha1.Power) error { + if server.Spec.Power == power { + return nil + } serverBase := server.DeepCopy() - server.Spec.Power = maintenance.Spec.ServerPower + server.Spec.Power = power if err := r.Patch(ctx, server, client.MergeFrom(serverBase)); err != nil { return fmt.Errorf("failed to patch server power state: %w", err) } @@ -285,9 +341,27 @@ func (r *ServerMaintenanceReconciler) delete(ctx context.Context, log logr.Logge if err != nil { return ctrl.Result{}, err } - if err := r.cleanup(ctx, log, server); err != nil { + if requeue, err := r.revertMaintenanceBootConfig(ctx, log, server, serverMaintenance); err != nil { + return ctrl.Result{}, err + } else if requeue { + log.V(1).Info("Wait for boot order to be reverted on server", "Server", server.Name) + return ctrl.Result{RequeueAfter: r.ResyncInterval}, nil + } + log.V(1).Info("Boot order reverted on server", "Server", server.Name) + // make sure the server is powered on before removing maintenance + // this will ensure the server is booted into default boot order + if server.Spec.Power != metalv1alpha1.PowerOn && server.Status.PowerState != metalv1alpha1.ServerOnPowerState { + if err := r.setAndPatchServerPowerState(ctx, log, server, metalv1alpha1.PowerOn); err != nil { + return ctrl.Result{}, err + } + // requeue to give time to the server to power off + return ctrl.Result{RequeueAfter: r.ResyncInterval}, nil + } + + if err := r.cleanup(ctx, log, server, serverMaintenance); err != nil { return ctrl.Result{}, err } + log.V(1).Info("Ensuring that the finalizer is removed") if modified, err := clientutils.PatchEnsureNoFinalizer(ctx, r.Client, serverMaintenance, ServerMaintenanceFinalizer); err != nil || modified { return ctrl.Result{}, err @@ -306,17 +380,22 @@ func (r *ServerMaintenanceReconciler) getServerRef(ctx context.Context, serverMa return server, nil } -func (r *ServerMaintenanceReconciler) cleanup(ctx context.Context, log logr.Logger, server *metalv1alpha1.Server) error { +func (r *ServerMaintenanceReconciler) cleanup( + ctx context.Context, + log logr.Logger, + server *metalv1alpha1.Server, + serverMaintenance *metalv1alpha1.ServerMaintenance) error { if server != nil && server.Spec.ServerMaintenanceRef != nil { if err := r.removeMaintenanceRefFromServer(ctx, server); err != nil { log.Error(err, "failed to remove maintenance ref from server") } } - if server.Spec.MaintenanceBootConfigurationRef != nil { + if server.Spec.BootConfigurationRef != nil && + serverMaintenance.Spec.ServerBootConfigurationTemplate != nil { config := &metalv1alpha1.ServerBootConfiguration{ ObjectMeta: metav1.ObjectMeta{ - Name: server.Spec.MaintenanceBootConfigurationRef.Name, - Namespace: server.Spec.MaintenanceBootConfigurationRef.Namespace, + Name: serverMaintenance.Name, + Namespace: serverMaintenance.Namespace, }, } if err := r.Delete(ctx, config); err != nil { @@ -324,6 +403,7 @@ func (r *ServerMaintenanceReconciler) cleanup(ctx context.Context, log logr.Logg return fmt.Errorf("failed to delete serverbootconfig: %w", err) } } + // note: remove the bootConfig this controller set if err := r.removeBootConfigRefFromServer(ctx, log, config, server); err != nil { return fmt.Errorf("failed to remove maintenance boot config ref from server: %w", err) } @@ -348,15 +428,276 @@ func (r *ServerMaintenanceReconciler) cleanup(ctx context.Context, log logr.Logg return nil } -func (r *ServerMaintenanceReconciler) removeBootConfigRefFromServer(ctx context.Context, log logr.Logger, config *metalv1alpha1.ServerBootConfiguration, server *metalv1alpha1.Server) error { +func (r *ServerMaintenanceReconciler) moveToPrepareMaintenanceState( + ctx context.Context, + log logr.Logger, + serverMaintenance *metalv1alpha1.ServerMaintenance, + server *metalv1alpha1.Server, +) (ctrl.Result, error) { + if err := r.updateServerRef(ctx, log, serverMaintenance, server); err != nil { + return ctrl.Result{}, err + } + + err := r.patchMaintenanceState(ctx, serverMaintenance, metalv1alpha1.ServerMaintenanceStatePreparing) + return ctrl.Result{}, err +} + +func (r *ServerMaintenanceReconciler) configureBootOrder(ctx context.Context, log logr.Logger, server *metalv1alpha1.Server, maintenance *metalv1alpha1.ServerMaintenance) (bool, error) { + bmcClient, err := bmcutils.GetBMCClientForServer(ctx, r.Client, server, r.Insecure, r.BMCOptions) + if err != nil { + return false, fmt.Errorf("failed to create BMC client: %w", err) + } + defer bmcClient.Logout() + + switch maintenance.Spec.ServerBootConfigurationTemplate.BootType { + case metalv1alpha1.BootTypeOneOff: + // note: with this option, the next server boot up will enter pxe boot. + // if Maintenance Spec.ServerPower is PowerOn, server will be powered on and will boot into pxe + // if Maintenance Spec.ServerPower is PowerOff, server will be powered off and will boot into pxe on next power on + // we do not need to save the default boot order as the server will boot into default boot order on next boot up + if err := bmcClient.SetPXEBootOnce(ctx, server.Spec.SystemURI); err != nil { + return false, fmt.Errorf("failed to set PXE boot once: %w", err) + } + log.V(1).Info("Configured PXE boot once for server", + "Server", server.Name) + status := &metalv1alpha1.ServerMaintenanceBootOrder{ + State: metalv1alpha1.BootOrderOneOffPxeBootSuccess, + } + if err := r.patchBootOrderStatus(ctx, log, maintenance, status); err != nil { + return false, fmt.Errorf("failed to patch maintenance default boot order: %w", err) + } + // end state, continue with maintenance + return false, nil + case metalv1alpha1.BootTypePersistent: + bootOrder, err := bmcClient.GetBootOrder(ctx, server.Spec.SystemURI) + if err != nil { + return false, fmt.Errorf("failed to get boot order: %w", err) + } + // if the boot order length is less than 2, we cannot change pxe as first boot + if len(bootOrder) < 2 { + log.V(1).Info("boot order for server can not be changed as it has less than 2 boot options", "bootOrder", bootOrder) + status := &metalv1alpha1.ServerMaintenanceBootOrder{ + State: metalv1alpha1.BootOrderConfigNoOp, + } + if err := r.patchBootOrderStatus(ctx, log, maintenance, status); err != nil { + return false, fmt.Errorf("failed to patch maintenance default boot order: %w", err) + } + // end state, continue with maintenance + return false, nil + } + bootOptions, err := bmcClient.GetBootOptions(ctx, server.Spec.SystemURI) + if err != nil { + return false, fmt.Errorf("failed to get boot options: %w", err) + } + + var isDiskBootOption = func(bootOption *redfish.BootOption) bool { + diskIndicators := []string{"/HD(", "/Sata(", "/NVMe(", "/Scsi(", "/USB("} + for _, indicator := range diskIndicators { + if strings.Contains(bootOption.UefiDevicePath, indicator) { + return true + } + } + return false + } + + var isPxeBootOption = func(bootOption *redfish.BootOption) bool { + return strings.Contains(strings.ToLower(bootOption.DisplayName), "pxe") + } + + bootOrderChanged := rearrangeBootOrder(bootOrder, bootOptions, isPxeBootOption, isDiskBootOption) + + if slices.Equal(bootOrder, bootOrderChanged) { + // boot order is already set to pxe first, nothing to do + log.V(1).Info("boot order for server is set to pxe first", + "Server", server.Name, + "BootOrder", bootOrder) + status := &metalv1alpha1.ServerMaintenanceBootOrder{} + if maintenance.Status.BootOrderStatus == nil { + status = &metalv1alpha1.ServerMaintenanceBootOrder{ + State: metalv1alpha1.BootOrderConfigNoOp, + } + } else { + status = maintenance.Status.BootOrderStatus.DeepCopy() + status.State = metalv1alpha1.BootOrderConfigSuccess + } + if err := r.patchBootOrderStatus(ctx, log, maintenance, status); err != nil { + return false, fmt.Errorf("failed to patch maintenance default boot order: %w", err) + } + // end state, continue with maintenance + return false, nil + } + + if maintenance.Status.BootOrderStatus == nil { + // note in this option, the server will always boot into pxe until changed again + // the job needs to be completed, else any other BIOS config job triggered will fail as exsisting job is not completed + if err := bmcClient.SetBootOrder(ctx, server.Spec.SystemURI, redfish.Boot{BootOrder: bootOrderChanged}); err != nil { + return false, fmt.Errorf("failed to set PXE boot once: %w", err) + } + log.V(1).Info("Configured boot order for server", + "Server", server.Name, + "BootType", maintenance.Spec.ServerBootConfigurationTemplate.BootType, + "BootOrder", bootOrderChanged) + // save the current boot order to revert back after maintenance + status := &metalv1alpha1.ServerMaintenanceBootOrder{ + DefaultBootOrder: bootOrder, + State: metalv1alpha1.BootOrderConfigInProgress, + } + if err := r.patchBootOrderStatus(ctx, log, maintenance, status); err != nil { + return false, fmt.Errorf("failed to patch maintenance default boot order: %w", err) + } + log.V(1).Info("Saved default boot order for server to revert post maintenance", "Server", server.Name, "BootOrder", bootOrder) + // requeue to give time to the server to actually do the work and verify it + return true, nil + } + // we need to complete the job to set the boot order correctly. Turn the server power on + if server.Spec.Power != metalv1alpha1.PowerOn && server.Status.PowerState != metalv1alpha1.ServerOnPowerState { + if err := r.setAndPatchServerPowerState(ctx, log, server, metalv1alpha1.PowerOn); err != nil { + return false, err + } + log.V(1).Info("Requested server power On to complete boot order change", + "Server", server.Name) + // requeue to give time to the server to power off + return true, nil + } + log.V(1).Info("wait for boot order to be set on server", "Server", server.Name) + return true, nil + default: + return false, fmt.Errorf("unknown boot type: %s", maintenance.Spec.ServerBootConfigurationTemplate.BootType) + } +} + +func (r *ServerMaintenanceReconciler) revertMaintenanceBootConfig( + ctx context.Context, + log logr.Logger, + server *metalv1alpha1.Server, + serverMaintenance *metalv1alpha1.ServerMaintenance, +) (bool, error) { + // we need to revert the boot order only if it was changed by the maintenance + if serverMaintenance.Status.BootOrderStatus != nil && serverMaintenance.Status.BootOrderStatus.DefaultBootOrder != nil { + bmcClient, err := bmcutils.GetBMCClientForServer(ctx, r.Client, server, r.Insecure, r.BMCOptions) + if err != nil { + return false, fmt.Errorf("failed to create BMC client: %w", err) + } + defer bmcClient.Logout() + bootOrder, err := bmcClient.GetBootOrder(ctx, server.Spec.SystemURI) + if err != nil { + return false, fmt.Errorf("failed to get boot order: %w", err) + } + if serverMaintenance.Status.BootOrderStatus.State == metalv1alpha1.BootOrderConfigSuccess { + // if the server is not off, we need to turn it off first to change the boot order + // this will help schedule the job to change the boot order for the server + // note: if the server is already off, this will be a no-op + if server.Spec.Power != metalv1alpha1.PowerOff || server.Status.PowerState != metalv1alpha1.ServerOffPowerState { + if err := r.setAndPatchServerPowerState(ctx, log, server, metalv1alpha1.PowerOff); err != nil { + return false, err + } + // requeue to give time to the server to power off + return true, nil + } + + // sanitize the boot order to remove any invalid boot options + // this case can happen if the bios settings were changed boot options during maintenance + // we remove only the invalid boot options and keep the rest in same order + // because the settings will take care of new added boot device and append it to end + currentBootDeviceMap := make(map[string]struct{}) + for _, bootDevice := range bootOrder { + currentBootDeviceMap[bootDevice] = struct{}{} + } + var sanitizedBootOrder []string + for _, bootDevice := range serverMaintenance.Status.BootOrderStatus.DefaultBootOrder { + if _, ok := currentBootDeviceMap[bootDevice]; !ok { + log.V(1).Info("Removing invalid boot option from default boot order", + "Server", server.Name, + "BootDevice", bootDevice) + continue + } + sanitizedBootOrder = append(sanitizedBootOrder, bootDevice) + } + + err = bmcClient.SetBootOrder(ctx, server.Spec.SystemURI, redfish.Boot{ + BootOrder: sanitizedBootOrder, + }) + if err != nil { + return false, fmt.Errorf("failed to revert boot order: %w", err) + } + // save the current expected (sanitized) boot order to revert back after maintenance + status := &metalv1alpha1.ServerMaintenanceBootOrder{ + DefaultBootOrder: sanitizedBootOrder, + State: metalv1alpha1.BootOrderConfigSuccessRevertInProgress, + } + if err := r.patchBootOrderStatus(ctx, log, serverMaintenance, status); err != nil { + return false, fmt.Errorf("failed to patch maintenance default boot order: %w", err) + } + log.V(1).Info("Patched revert to default boot order", "Server", server.Name, "BootOrder", nil) + // requeue to give time to the server to actually do the work and verify it + return true, nil + } + + if serverMaintenance.Status.BootOrderStatus.State == metalv1alpha1.BootOrderConfigSuccessRevertInProgress { + if server.Spec.Power != metalv1alpha1.PowerOn || server.Status.PowerState != metalv1alpha1.ServerOnPowerState { + if err := r.setAndPatchServerPowerState(ctx, log, server, metalv1alpha1.PowerOn); err != nil { + return false, err + } + // requeue to give time to the server to power on + return true, nil + } + + revertCompleted := false + if slices.Equal(bootOrder, serverMaintenance.Status.BootOrderStatus.DefaultBootOrder) { + revertCompleted = true + } else { + // sometimes, changing biosSettings leads to change in number of boot options + // hence, we need check the default boot order and ignore extra from current boot order + for idx, bootDevice := range serverMaintenance.Status.BootOrderStatus.DefaultBootOrder { + // cases where boot option is disabled in bios settings + if idx >= len(bootOrder) { + break + } + if bootDevice != bootOrder[idx] { + log.V(1).Info("boot order for server is not yet reverted to default. Waiting...", + "Server", server.Name, + "CurrentBootOrder", bootOrder, + "DefaultBootOrder", serverMaintenance.Status.BootOrderStatus.DefaultBootOrder) + return true, nil + } + } + revertCompleted = true + } + if revertCompleted { + log.V(1).Info("boot order for server has been reverted to default", + "Server", server.Name, + "BootOrder", bootOrder) + if err := r.patchBootOrderStatus(ctx, log, serverMaintenance, nil); err != nil { + return false, fmt.Errorf("failed to patch maintenance default boot order: %w", err) + } + // end state, continue with maintenance complete + return false, nil + } + log.V(1).Info("wait for boot order to be reverted on server", "Server", server.Name) + return true, nil + } + } + + log.V(1).Info("boot override during maintenance is reverted", "Server", serverMaintenance.Name) + return false, nil +} + +func (r *ServerMaintenanceReconciler) removeBootConfigRefFromServer( + ctx context.Context, + log logr.Logger, + config *metalv1alpha1.ServerBootConfiguration, + server *metalv1alpha1.Server, +) error { if server == nil { return nil } - if ref := server.Spec.MaintenanceBootConfigurationRef; ref == nil || (ref.Name != config.Name && ref.Namespace != config.Namespace) { + if ref := server.Spec.BootConfigurationRef; ref == nil || (ref.Name != config.Name && ref.Namespace != config.Namespace) { return nil } serverBase := server.DeepCopy() - server.Spec.MaintenanceBootConfigurationRef = nil + // remove the boot configuration ref by the maintenance only + // this will be replaced by the reserved boot configuration if provided, by serverClaim + server.Spec.BootConfigurationRef = nil if err := r.Patch(ctx, server, client.MergeFrom(serverBase)); err != nil && !apierrors.IsNotFound(err) { return err } @@ -364,6 +705,16 @@ func (r *ServerMaintenanceReconciler) removeBootConfigRefFromServer(ctx context. return nil } +func (r *ServerMaintenanceReconciler) patchBootOrderStatus(ctx context.Context, log logr.Logger, maintenance *metalv1alpha1.ServerMaintenance, bootOrderStatus *metalv1alpha1.ServerMaintenanceBootOrder) error { + maintenanceBase := maintenance.DeepCopy() + maintenance.Status.BootOrderStatus = bootOrderStatus + if err := r.Status().Patch(ctx, maintenance, client.MergeFrom(maintenanceBase)); err != nil { + return fmt.Errorf("failed to patch maintenance default boot order: %w", err) + } + log.V(1).Info("Patched maintenance boot order status", "ServerMaintenance", maintenance.Name, "BootOrderStatus", bootOrderStatus) + return nil +} + func (r *ServerMaintenanceReconciler) removeMaintenanceRefFromServer(ctx context.Context, server *metalv1alpha1.Server) error { serverBase := server.DeepCopy() server.Spec.ServerMaintenanceRef = nil @@ -373,16 +724,16 @@ func (r *ServerMaintenanceReconciler) removeMaintenanceRefFromServer(ctx context return nil } -func (r *ServerMaintenanceReconciler) patchMaintenanceState(ctx context.Context, serverMaintenance *metalv1alpha1.ServerMaintenance, state metalv1alpha1.ServerMaintenanceState) (bool, error) { +func (r *ServerMaintenanceReconciler) patchMaintenanceState(ctx context.Context, serverMaintenance *metalv1alpha1.ServerMaintenance, state metalv1alpha1.ServerMaintenanceState) error { if serverMaintenance.Status.State == state { - return false, nil + return nil } base := serverMaintenance.DeepCopy() serverMaintenance.Status.State = state if err := r.Status().Patch(ctx, serverMaintenance, client.MergeFrom(base)); err != nil { - return false, fmt.Errorf("failed to patch serverMaintenance state: %w", err) + return fmt.Errorf("failed to patch serverMaintenance state: %w", err) } - return true, nil + return nil } func (r *ServerMaintenanceReconciler) patchServerClaimAnnotation(ctx context.Context, log logr.Logger, serverClaim *metalv1alpha1.ServerClaim, set map[string]string) error { @@ -405,6 +756,51 @@ func (r *ServerMaintenanceReconciler) patchServerClaimAnnotation(ctx context.Con return nil } +func (r *ServerMaintenanceReconciler) getMaintenanceBootConfigurationRef( + ctx context.Context, + log logr.Logger, + serverMaintenance *metalv1alpha1.ServerMaintenance, +) (*metalv1alpha1.ServerBootConfiguration, error) { + if serverMaintenance.Spec.ServerBootConfigurationTemplate == nil { + return nil, nil + } + config := &metalv1alpha1.ServerBootConfiguration{} + if err := r.Get(ctx, types.NamespacedName{Name: serverMaintenance.Name, Namespace: serverMaintenance.Namespace}, config); err != nil { + if apierrors.IsNotFound(err) { + log.V(1).Info("ServerBootConfiguration for maintenance not found", "ServerBootConfiguration", serverMaintenance.Name) + return nil, nil + } + return nil, fmt.Errorf("failed to get server boot configuration: %w", err) + } + return config, nil +} + +func (r *ServerMaintenanceReconciler) isBootConfigurationChanged( + ctx context.Context, + log logr.Logger, + serverMaintenance *metalv1alpha1.ServerMaintenance, +) (*bootConfigChangedStatus, error) { + status := bootConfigChangedStatus{ + changed: false, + } + config, err := r.getMaintenanceBootConfigurationRef(ctx, log, serverMaintenance) + status.bootConfig = config + if err != nil { + return &status, err + } + if config == nil { + return &status, nil + } + if reflect.DeepEqual(config.Spec, serverMaintenance.Spec.ServerBootConfigurationTemplate.Spec) { + // no changes + log.V(1).Info("No changes in ServerBootConfigurationTemplate") + return &status, nil + } + status.changed = true + log.V(1).Info("Detected changes in ServerBootConfigurationTemplate") + return &status, nil +} + // SetupWithManager sets up the controller with the Manager. func (r *ServerMaintenanceReconciler) SetupWithManager(mgr ctrl.Manager) error { return ctrl.NewControllerManagedBy(mgr). @@ -421,24 +817,31 @@ func (r *ServerMaintenanceReconciler) enqueueMaintenanceByServerRefs() handler.E server := object.(*metalv1alpha1.Server) var req []reconcile.Request + if server.Spec.ServerMaintenanceRef != nil { + return []reconcile.Request{{ + NamespacedName: types.NamespacedName{Namespace: server.Spec.ServerMaintenanceRef.Namespace, Name: server.Spec.ServerMaintenanceRef.Name}, + }} + } + maintenanceList := &metalv1alpha1.ServerMaintenanceList{} if err := r.List(ctx, maintenanceList); err != nil { log.Error(err, "failed to list host serverMaintenances") return nil } for _, maintenance := range maintenanceList.Items { - if server.Spec.ServerMaintenanceRef != nil && maintenance.Spec.ServerRef.Name == server.Spec.ServerMaintenanceRef.Name { + if server.Spec.ServerMaintenanceRef != nil && maintenance.Name == server.Spec.ServerMaintenanceRef.Name { req = append(req, reconcile.Request{ NamespacedName: types.NamespacedName{Namespace: maintenance.Namespace, Name: maintenance.Name}, }) return req } - if server.Spec.ServerMaintenanceRef == nil { + if maintenance.Spec.ServerRef != nil && maintenance.Spec.ServerRef.Name == server.Name { req = append(req, reconcile.Request{ NamespacedName: types.NamespacedName{Namespace: maintenance.Namespace, Name: maintenance.Name}, }) } } + return req }) } diff --git a/internal/controller/servermaintenance_controller_test.go b/internal/controller/servermaintenance_controller_test.go index bfb54953..0df93a5c 100644 --- a/internal/controller/servermaintenance_controller_test.go +++ b/internal/controller/servermaintenance_controller_test.go @@ -8,7 +8,6 @@ import ( . "github.com/onsi/gomega" v1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/types" . "sigs.k8s.io/controller-runtime/pkg/envtest/komega" "github.com/ironcore-dev/controller-utils/metautils" @@ -97,7 +96,7 @@ var _ = Describe("ServerMaintenance Controller", func() { } Expect(k8sClient.Create(ctx, serverMaintenance)).To(Succeed()) Eventually(Object(serverMaintenance)).Should(SatisfyAll( - HaveField("Status.State", metalv1alpha1.ServerMaintenanceStateInMaintenance), + HaveField("Status.State", metalv1alpha1.ServerMaintenanceStatePreparing), )) By("Checking the Server is in maintenance") @@ -153,21 +152,17 @@ var _ = Describe("ServerMaintenance Controller", func() { metalv1alpha1.ServerMaintenanceReasonAnnotationKey: "test-maintenance", metalv1alpha1.ServerMaintenanceApprovalKey: "true", } + By("Checking the ServerMaintenanceRef and BootConfigurationRef") + By("Checking the ") Eventually(Object(server)).Should(SatisfyAll( + HaveField("Spec.ServerMaintenanceRef", Not(BeNil())), HaveField("Spec.ServerMaintenanceRef.Name", serverMaintenance.Name), - HaveField("Spec.MaintenanceBootConfigurationRef", Not(BeNil())), + HaveField("Spec.BootConfigurationRef", Not(BeNil())), + HaveField("Spec.BootConfigurationRef.Name", serverMaintenance.Name), + HaveField("Spec.BootConfigurationRef.Namespace", serverMaintenance.Namespace), )) - bootConfig := &metalv1alpha1.ServerBootConfiguration{} - Eventually(k8sClient.Get).WithArguments(ctx, types.NamespacedName{ - Name: server.Spec.MaintenanceBootConfigurationRef.Name, - Namespace: server.Spec.MaintenanceBootConfigurationRef.Namespace, - }, bootConfig).Should(Succeed()) - - By("Patching the boot configuration to a Ready state") - Eventually(UpdateStatus(bootConfig, func() { - bootConfig.Status.State = metalv1alpha1.ServerBootConfigurationStateReady - })).Should(Succeed()) + MarkBootConfigReady(ctx, k8sClient, server.Spec.BootConfigurationRef.Name, server.Spec.BootConfigurationRef.Namespace) By("Checking the Server is in maintenance") Eventually(Object(server)).Should(SatisfyAll( @@ -177,7 +172,6 @@ var _ = Describe("ServerMaintenance Controller", func() { Eventually(Object(serverClaim)).Should(SatisfyAll( HaveField("ObjectMeta.Annotations", maintenanceLabels), )) - By("Checking the ServerMaintenance is in maintenance") Eventually(Object(serverMaintenance)).Should(SatisfyAll( HaveField("Status.State", metalv1alpha1.ServerMaintenanceStateInMaintenance), @@ -189,7 +183,7 @@ var _ = Describe("ServerMaintenance Controller", func() { Eventually(Object(server)).Should(SatisfyAll( HaveField("Status.State", metalv1alpha1.ServerStateReserved), HaveField("Spec.ServerMaintenanceRef", BeNil()), - HaveField("Spec.MaintenanceBootConfigurationRef", BeNil()), + HaveField("Spec.BootConfigurationRef.Name", serverClaim.Name), )) By("Checking the ServerClaim is cleaned up") Eventually(Object(serverClaim)).Should(SatisfyAll( @@ -243,24 +237,24 @@ var _ = Describe("ServerMaintenance Controller", func() { }, } Expect(k8sClient.Create(ctx, serverMaintenance01)).To(Succeed()) - By("Checking the ServerMaintenanceRef") + By("Checking the ServerMaintenanceRef and BootConfigurationRef") Eventually(Object(server)).Should(SatisfyAll( HaveField("Spec.ServerMaintenanceRef", Not(BeNil())), - HaveField("Spec.MaintenanceBootConfigurationRef", Not(BeNil())), + HaveField("Spec.ServerMaintenanceRef.Name", serverMaintenance01.Name), + HaveField("Spec.BootConfigurationRef", Not(BeNil())), + HaveField("Spec.BootConfigurationRef.Name", serverMaintenance01.Name), + HaveField("Spec.BootConfigurationRef.Namespace", serverMaintenance01.Namespace), + )) + By("Checking the ServerMaintenance state is PrepareMaintenance") + Eventually(Object(serverMaintenance01)).Should(SatisfyAll( + HaveField("Status.State", metalv1alpha1.ServerMaintenanceStatePreparing), )) + MarkBootConfigReady(ctx, k8sClient, server.Spec.BootConfigurationRef.Name, server.Spec.BootConfigurationRef.Namespace) + + By("Checking the ServerMaintenance state is PrepareMaintenance") Eventually(Object(serverMaintenance01)).Should(SatisfyAll( HaveField("Status.State", metalv1alpha1.ServerMaintenanceStateInMaintenance), )) - bootConfig := &metalv1alpha1.ServerBootConfiguration{} - Eventually(k8sClient.Get).WithArguments(ctx, types.NamespacedName{ - Name: server.Spec.MaintenanceBootConfigurationRef.Name, - Namespace: server.Spec.MaintenanceBootConfigurationRef.Namespace, - }, bootConfig).Should(Succeed()) - - By("Patching the boot configuration to a Ready state") - Eventually(UpdateStatus(bootConfig, func() { - bootConfig.Status.State = metalv1alpha1.ServerBootConfigurationStateReady - })).Should(Succeed()) By("Creating a second ServerMaintenance object") Expect(k8sClient.Create(ctx, serverMaintenance02)).To(Succeed()) @@ -280,11 +274,21 @@ var _ = Describe("ServerMaintenance Controller", func() { By("Deleting first ServerMaintenance to finish the maintenance on the server") Eventually(k8sClient.Delete).WithArguments(ctx, serverMaintenance01).Should(Succeed()) + By("Checking the second ServerMaintenance is now PreparingMaintenance") Eventually(Object(serverMaintenance02)).Should(SatisfyAll( - HaveField("Status.State", metalv1alpha1.ServerMaintenanceStateInMaintenance), + HaveField("Status.State", metalv1alpha1.ServerMaintenanceStatePreparing), + )) + + By("Checking the ServerMaintenanceRef and BootConfigurationRef") + Eventually(Object(server)).Should(SatisfyAll( + HaveField("Spec.ServerMaintenanceRef", Not(BeNil())), + HaveField("Spec.ServerMaintenanceRef.Name", serverMaintenance02.Name), + HaveField("Spec.BootConfigurationRef", Not(BeNil())), + HaveField("Spec.BootConfigurationRef.Name", serverMaintenance02.Name), + HaveField("Spec.BootConfigurationRef.Namespace", serverMaintenance02.Namespace), )) + MarkBootConfigReady(ctx, k8sClient, server.Spec.BootConfigurationRef.Name, server.Spec.BootConfigurationRef.Namespace) - By("Checking the second ServerMaintenance is now in maintenance") Eventually(Object(serverMaintenance02)).Should(SatisfyAll( HaveField("Status.State", metalv1alpha1.ServerMaintenanceStateInMaintenance), )) diff --git a/internal/controller/suite_test.go b/internal/controller/suite_test.go index 1cd9fa19..7b7a94e6 100644 --- a/internal/controller/suite_test.go +++ b/internal/controller/suite_test.go @@ -30,6 +30,8 @@ import ( "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" "sigs.k8s.io/controller-runtime/pkg/envtest" . "sigs.k8s.io/controller-runtime/pkg/envtest/komega" + logf "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/log/zap" metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server" //+kubebuilder:scaffold:imports ) @@ -118,6 +120,7 @@ func deleteAndList(ctx context.Context, obj client.Object, objList client.Object var _ = BeforeSuite(func() { By("bootstrapping test environment") + logf.SetLogger(zap.New(zap.WriteTo(GinkgoWriter), zap.UseDevMode(true))) testEnv = &envtest.Environment{ CRDDirectoryPaths: []string{filepath.Join("..", "..", "config", "crd", "bases")}, ErrorIfCRDPathMissing: true, @@ -262,8 +265,15 @@ func SetupTest() *corev1.Namespace { }).SetupWithManager(k8sManager)).To(Succeed()) Expect((&ServerMaintenanceReconciler{ - Client: k8sManager.GetClient(), - Scheme: k8sManager.GetScheme(), + Client: k8sManager.GetClient(), + Scheme: k8sManager.GetScheme(), + ResyncInterval: 10 * time.Millisecond, + BMCOptions: bmc.Options{ + PowerPollingInterval: 50 * time.Millisecond, + PowerPollingTimeout: 200 * time.Millisecond, + BasicAuth: true, + }, + Insecure: true, }).SetupWithManager(k8sManager)).To(Succeed()) Expect((&BiosSettingsReconciler{ @@ -278,6 +288,7 @@ func SetupTest() *corev1.Namespace { BasicAuth: true, }, TimeoutExpiry: 6 * time.Second, + ProbeImage: "foo:latest", }).SetupWithManager(k8sManager)).To(Succeed()) Expect((&BIOSVersionReconciler{ @@ -291,6 +302,7 @@ func SetupTest() *corev1.Namespace { PowerPollingTimeout: 200 * time.Millisecond, BasicAuth: true, }, + ProbeImage: "foo:latest", }).SetupWithManager(k8sManager)).To(Succeed()) Expect((&BIOSVersionSetReconciler{ @@ -309,6 +321,7 @@ func SetupTest() *corev1.Namespace { PowerPollingTimeout: 200 * time.Millisecond, BasicAuth: true, }, + ProbeImage: "foo:latest", }).SetupWithManager(k8sManager)).To(Succeed()) Expect((&BMCVersionReconciler{ @@ -322,6 +335,7 @@ func SetupTest() *corev1.Namespace { PowerPollingTimeout: 200 * time.Millisecond, BasicAuth: true, }, + ProbeImage: "foo:latest", }).SetupWithManager(k8sManager)).To(Succeed()) Expect((&BMCVersionSetReconciler{