diff --git a/api/v1alpha1/bmc_types.go b/api/v1alpha1/bmc_types.go index 67031471..a27286ee 100644 --- a/api/v1alpha1/bmc_types.go +++ b/api/v1alpha1/bmc_types.go @@ -210,6 +210,14 @@ type BMCStatus struct { // +optional LastResetTime *metav1.Time `json:"lastResetTime,omitempty"` + // MetricsReportSubscriptionLink is the link to the metrics report subscription of the bmc. + // +optional + MetricsReportSubscriptionLink string `json:"metricsReportSubscriptionLink,omitempty"` + + // EventsSubscriptionLink is the link to the events subscription of the bmc. + // +optional + EventsSubscriptionLink string `json:"eventsSubscriptionLink,omitempty"` + // Conditions represents the latest available observations of the BMC's current state. // +patchStrategy=merge // +patchMergeKey=type diff --git a/bmc/bmc.go b/bmc/bmc.go index 72bffb52..7ec47c84 100644 --- a/bmc/bmc.go +++ b/bmc/bmc.go @@ -120,6 +120,12 @@ type BMC interface { // GetBMCUpgradeTask retrieves the task for the BMC upgrade. GetBMCUpgradeTask(ctx context.Context, manufacturer string, taskURI string) (*redfish.Task, error) + + // CreateEventSubscription creates an event subscription for the manager.q + CreateEventSubscription(ctx context.Context, destination string, eventType redfish.EventFormatType, protocol redfish.DeliveryRetryPolicy) (string, error) + + // DeleteEventSubscription deletes an event subscription for the manager. + DeleteEventSubscription(ctx context.Context, uri string) error } type Entity struct { diff --git a/bmc/mock/server/data/Managers/BMC/index.json b/bmc/mock/server/data/Managers/BMC/index.json index be410bbe..c99534fa 100644 --- a/bmc/mock/server/data/Managers/BMC/index.json +++ b/bmc/mock/server/data/Managers/BMC/index.json @@ -4,6 +4,7 @@ "Name": "Manager", "ManagerType": "BMC", "Description": "Contoso BMC", + "Manufacturer": "Contoso", "ServiceEntryPointUUID": "92384634-2938-2342-8820-489239905423", "UUID": "58893887-8974-2487-2389-841168418919", "Model": "Joo Janta 200", @@ -96,4 +97,4 @@ }, "@odata.id": "/redfish/v1/Managers/BMC", "@Redfish.Copyright": "Copyright 2014-2023 DMTF. For the full DMTF copyright policy, see http://www.dmtf.org/about/policies/copyright." -} \ No newline at end of file +} diff --git a/bmc/mock/server/server.go b/bmc/mock/server/server.go index 840a3a5d..d9fc280e 100644 --- a/bmc/mock/server/server.go +++ b/bmc/mock/server/server.go @@ -25,6 +25,14 @@ var ( dataFS embed.FS ) +type Collection struct { + Members []Member `json:"Members"` +} + +type Member struct { + OdataID string `json:"@odata.id"` +} + const ( LockedResourceState = "Locked" UnlockedResourceState = "Unlocked" @@ -81,6 +89,8 @@ func (s *MockServer) redfishHandler(w http.ResponseWriter, r *http.Request) { s.handleRedfishPOST(w, r) case http.MethodPatch: s.handleRedfishPATCH(w, r) + case http.MethodDelete: + s.handleRedfishDelete(w, r) default: http.Error(w, "Method Not Allowed", http.StatusMethodNotAllowed) } @@ -180,6 +190,47 @@ func resolvePath(urlPath string) string { return path.Join("data", trimmed, "index.json") } +func (s *MockServer) handleRedfishDelete(w http.ResponseWriter, r *http.Request) { + s.log.Info("Received request", "method", r.Method, "path", r.URL.Path) + + urlPath := resolvePath(r.URL.Path) + s.mu.RLock() + defer s.mu.Unlock() + _, hasOverride := s.overrides[urlPath] + if hasOverride { + // remove the resource + delete(s.overrides, urlPath) + } + // get collection of the resource + collectionPath := path.Dir(urlPath) + cached, hasOverride := s.overrides[collectionPath] + var collection Collection + if hasOverride { + collection = cached.(Collection) + } else { + data, err := dataFS.ReadFile(collectionPath) + if err != nil { + http.NotFound(w, r) + return + } + if err := json.Unmarshal(data, &collection); err != nil { + http.Error(w, "Corrupt embedded JSON", http.StatusInternalServerError) + return + } + } + // remove member from collection + newMembers := make([]Member, 0) + for _, member := range collection.Members { + if member.OdataID != r.URL.Path { + newMembers = append(newMembers, member) + } + } + s.log.Info("Removing member from collection", "members", newMembers, "collection", collectionPath) + collection.Members = newMembers + s.overrides[collectionPath] = collection + w.WriteHeader(http.StatusNoContent) +} + func (s *MockServer) handleRedfishGET(w http.ResponseWriter, r *http.Request) { urlPath := resolvePath(r.URL.Path) @@ -239,9 +290,15 @@ func (s *MockServer) handleRedfishPOST(w http.ResponseWriter, r *http.Request) { } }(r.Body) - s.log.Info("POST body received", "body", string(body)) + var update map[string]any + if err := json.Unmarshal(body, &update); err != nil { + http.Error(w, "Invalid JSON", http.StatusBadRequest) + return + } + s.log.Info("POST body received", "body", string(body)) urlPath := resolvePath(r.URL.Path) + switch { case strings.Contains(urlPath, "Actions/ComputerSystem.Reset"): // Simulate a system reset action @@ -304,7 +361,55 @@ func (s *MockServer) handleRedfishPOST(w http.ResponseWriter, r *http.Request) { case strings.Contains(urlPath, "UpdateService/Actions/UpdateService.SimpleUpdate"): // Simulate a firmware update action default: - s.log.Info("Unhandled POST request", "path", urlPath) + // Handle resource creation in collections + s.mu.Lock() + defer s.mu.Unlock() + cached, hasOverride := s.overrides[urlPath] + var base Collection + if hasOverride { + s.log.Info("Using overridden data for POST", "path", urlPath) + base = cached.(Collection) + } else { + s.log.Info("Using embedded data for POST", "path", urlPath) + data, err := dataFS.ReadFile(urlPath) + if err != nil { + s.log.Error(err, "Failed to read embedded data for POST", "path", urlPath) + http.NotFound(w, r) + return + } + if err := json.Unmarshal(data, &base); err != nil { + http.Error(w, "Corrupt embedded JSON", http.StatusInternalServerError) + return + } + } + // If resource collection (has "Members"), add a new member + if len(base.Members) > 0 { + newID := fmt.Sprintf("%d", len(base.Members)+1) + location := path.Join(r.URL.Path, newID) + newMemberPath := resolvePath(location) + base.Members = append(base.Members, Member{ + OdataID: location, + }) + s.log.Info("Adding new member", "id", newID, "location", location, "memberPath", newMemberPath) + if strings.HasSuffix(r.URL.Path, "/Subscriptions") { + w.Header().Set("Location", location) + } + s.overrides[urlPath] = base + s.overrides[newMemberPath] = update + } else { + base.Members = make([]Member, 0) + location := r.URL.JoinPath("1").String() + base.Members = []Member{ + { + OdataID: r.URL.JoinPath("1").String(), + }, + } + s.overrides[urlPath] = base + if strings.HasSuffix(r.URL.Path, "/Subscriptions") { + w.Header().Set("Location", location) + } + } + s.log.Info("Storing updated data for POST", "path", urlPath, "data", update) w.Header().Set("Content-Type", "application/json") w.WriteHeader(http.StatusCreated) _, err = w.Write([]byte(`{"status": "created"}`)) diff --git a/bmc/redfish.go b/bmc/redfish.go index f6f8292e..46fa999d 100644 --- a/bmc/redfish.go +++ b/bmc/redfish.go @@ -10,6 +10,7 @@ import ( "fmt" "io" "net/http" + "net/url" "slices" "strings" "time" @@ -996,3 +997,80 @@ func (r *RedfishBMC) GetBMCUpgradeTask(ctx context.Context, manufacturer string, return oem.GetTaskMonitorDetails(ctx, respTask) } + +type subscriptionPayload struct { + Destination string `json:"Destination,omitempty"` + EventTypes []redfish.EventType `json:"EventTypes,omitempty"` + EventFormatType redfish.EventFormatType `json:"EventFormatType,omitempty"` + RegistryPrefixes []string `json:"RegistryPrefixes,omitempty"` + ResourceTypes []string `json:"ResourceTypes,omitempty"` + DeliveryRetryPolicy redfish.DeliveryRetryPolicy `json:"DeliveryRetryPolicy,omitempty"` + HTTPHeaders map[string]string `json:"HttpHeaders,omitempty"` + Oem interface{} `json:"Oem,omitempty"` + Protocol redfish.EventDestinationProtocol `json:"Protocol,omitempty"` + Context string `json:"Context,omitempty"` +} + +func (r *RedfishBMC) CreateEventSubscription( + ctx context.Context, + destination string, + eventFormatType redfish.EventFormatType, + retry redfish.DeliveryRetryPolicy, +) (string, error) { + service := r.client.GetService() + ev, err := service.EventService() + if err != nil { + return "", fmt.Errorf("failed to get event service: %w", err) + } + if !ev.ServiceEnabled { + return "", fmt.Errorf("event service is not enabled") + } + payload := &subscriptionPayload{ + Destination: destination, + EventFormatType: eventFormatType, // event or metricreport + Protocol: redfish.RedfishEventDestinationProtocol, + DeliveryRetryPolicy: retry, + Context: "metal3-operator", + } + client := ev.GetClient() + // some implementations (like Dell) do not support ResourceTypes and RegistryPrefixes + if len(ev.ResourceTypes) == 0 { + payload.EventTypes = []redfish.EventType{} + } else { + payload.RegistryPrefixes = []string{""} // Filters by the prefix of the event's MessageId, which points to a Message Registry: [Base, ResourceEvent, iLOEvents] + payload.ResourceTypes = []string{""} // Filters by the schema name (Resource Type) of the event's OriginOfCondition: [Chassis, ComputerSystem, Power] + } + resp, err := client.Post(ev.Subscriptions, payload) + if err != nil { + return "", err + } + // return subscription link from returned location + subscriptionLink := resp.Header.Get("Location") + urlParser, err := url.ParseRequestURI(subscriptionLink) + if err == nil { + subscriptionLink = urlParser.RequestURI() + } + return subscriptionLink, nil +} + +func (r RedfishBMC) DeleteEventSubscription(ctx context.Context, uri string) error { + service := r.client.GetService() + ev, err := service.EventService() + if err != nil { + return fmt.Errorf("failed to get event service: %w", err) + } + if !ev.ServiceEnabled { + return fmt.Errorf("event service is not enabled") + } + event, err := ev.GetEventSubscription(uri) + if err != nil { + return fmt.Errorf("failed to get event subscription: %w", err) + } + if event == nil { + return nil + } + if err := ev.DeleteEventSubscription(uri); err != nil { + return fmt.Errorf("failed to delete event subscription: %w", err) + } + return nil +} diff --git a/cmd/manager/main.go b/cmd/manager/main.go index de4a8119..2777b3e2 100644 --- a/cmd/manager/main.go +++ b/cmd/manager/main.go @@ -13,6 +13,7 @@ import ( "time" "github.com/ironcore-dev/controller-utils/conditionutils" + "github.com/ironcore-dev/metal-operator/internal/serverevents" webhookmetalv1alpha1 "github.com/ironcore-dev/metal-operator/internal/webhook/v1alpha1" "sigs.k8s.io/controller-runtime/pkg/manager" @@ -73,6 +74,9 @@ func main() { // nolint: gocyclo registryPort int registryProtocol string registryURL string + eventPort int + eventURL string + eventProtocol string registryResyncInterval time.Duration webhookPort int enforceFirstBoot bool @@ -118,6 +122,10 @@ func main() { // nolint: gocyclo flag.StringVar(®istryURL, "registry-url", "", "The URL of the registry.") flag.StringVar(®istryProtocol, "registry-protocol", "http", "The protocol to use for the registry.") flag.IntVar(®istryPort, "registry-port", 10000, "The port to use for the registry.") + flag.StringVar(&eventURL, "event-url", "", "The URL of the server events endpoint for alerts and metrics.") + flag.IntVar(&eventPort, "event-port", 10001, "The port to use for the server events endpoint for alerts and metrics.") + flag.StringVar(&eventProtocol, "event-protocol", "http", + "The protocol to use for the server events endpoint for alerts and metrics.") flag.StringVar(&probeImage, "probe-image", "", "Image for the first boot probing of a Server.") flag.StringVar(&probeOSImage, "probe-os-image", "", "OS image for the first boot probing of a Server.") flag.StringVar(&managerNamespace, "manager-namespace", "default", "Namespace the manager is running in.") @@ -189,6 +197,15 @@ func main() { // nolint: gocyclo registryURL = fmt.Sprintf("%s://%s:%d", registryProtocol, registryAddr, registryPort) } + // set the correct event URL by getting the address from the environment + var eventAddr string + if eventURL == "" { + eventAddr = os.Getenv("EVENT_ADDRESS") + if eventAddr != "" { + eventURL = fmt.Sprintf("%s://%s:%d", eventProtocol, eventAddr, eventPort) + } + } + // if the enable-http2 flag is false (the default), http/2 should be disabled // due to its vulnerabilities. More specifically, disabling http/2 will // prevent from being vulnerable to the HTTP/2 Stream Cancelation and @@ -329,6 +346,7 @@ func main() { // nolint: gocyclo BMCResetWaitTime: bmcResetWaitingInterval, BMCClientRetryInterval: bmcResetResyncInterval, ManagerNamespace: managerNamespace, + EventURL: eventURL, Conditions: conditionutils.NewAccessor(conditionutils.AccessorOptions{}), BMCOptions: bmc.Options{ BasicAuth: true, @@ -554,6 +572,17 @@ func main() { // nolint: gocyclo os.Exit(1) } + if eventAddr != "" { + setupLog.Info("starting event server for alerts and metrics", "EventURL", eventURL) + eventServer := serverevents.NewServer(setupLog, fmt.Sprintf(":%d", eventPort)) + go func() { + if err := eventServer.Start(ctx); err != nil { + setupLog.Error(err, "problem running event server") + os.Exit(1) + } + }() + } + setupLog.Info("starting manager") if err := mgr.Start(ctx); err != nil { setupLog.Error(err, "problem running manager") diff --git a/config/crd/bases/metal.ironcore.dev_bmcs.yaml b/config/crd/bases/metal.ironcore.dev_bmcs.yaml index 1c754891..adc86bb2 100644 --- a/config/crd/bases/metal.ironcore.dev_bmcs.yaml +++ b/config/crd/bases/metal.ironcore.dev_bmcs.yaml @@ -255,6 +255,10 @@ spec: - type type: object type: array + eventsSubscriptionLink: + description: EventsSubscriptionLink is the link to the events subscription + of the bmc. + type: string firmwareVersion: description: FirmwareVersion is the version of the firmware currently running on the BMC. @@ -278,6 +282,10 @@ spec: manufacturer: description: Manufacturer is the name of the BMC manufacturer. type: string + metricsReportSubscriptionLink: + description: MetricsReportSubscriptionLink is the link to the metrics + report subscription of the bmc. + type: string model: description: Model is the model number or name of the BMC. type: string diff --git a/dist/chart/templates/crd/metal.ironcore.dev_bmcs.yaml b/dist/chart/templates/crd/metal.ironcore.dev_bmcs.yaml index 59413940..5916991a 100755 --- a/dist/chart/templates/crd/metal.ironcore.dev_bmcs.yaml +++ b/dist/chart/templates/crd/metal.ironcore.dev_bmcs.yaml @@ -261,6 +261,10 @@ spec: - type type: object type: array + eventsSubscriptionLink: + description: EventsSubscriptionLink is the link to the events subscription + of the bmc. + type: string firmwareVersion: description: FirmwareVersion is the version of the firmware currently running on the BMC. @@ -284,6 +288,10 @@ spec: manufacturer: description: Manufacturer is the name of the BMC manufacturer. type: string + metricsReportSubscriptionLink: + description: MetricsReportSubscriptionLink is the link to the metrics + report subscription of the bmc. + type: string model: description: Model is the model number or name of the BMC. type: string diff --git a/docs/api-reference/api.md b/docs/api-reference/api.md index 77e8830b..85338779 100644 --- a/docs/api-reference/api.md +++ b/docs/api-reference/api.md @@ -590,6 +590,8 @@ _Appears in:_ | `state` _[BMCState](#bmcstate)_ | State represents the current state of the BMC.
kubebuilder:validation:Enum=Enabled;Error;Pending | Pending | | | `powerState` _[BMCPowerState](#bmcpowerstate)_ | PowerState represents the current power state of the BMC. | | | | `lastResetTime` _[Time](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.34/#time-v1-meta)_ | LastResetTime is the timestamp of the last reset operation performed on the BMC. | | | +| `metricsReportSubscriptionLink` _string_ | MetricsReportSubscriptionLink is the link to the metrics report subscription of the bmc. | | | +| `eventsSubscriptionLink` _string_ | EventsSubscriptionLink is the link to the events subscription of the bmc. | | | | `conditions` _[Condition](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.34/#condition-v1-meta) array_ | Conditions represents the latest available observations of the BMC's current state. | | | diff --git a/internal/controller/bmc_controller.go b/internal/controller/bmc_controller.go index c8098a9d..465dd77f 100644 --- a/internal/controller/bmc_controller.go +++ b/internal/controller/bmc_controller.go @@ -21,6 +21,7 @@ import ( metalv1alpha1 "github.com/ironcore-dev/metal-operator/api/v1alpha1" "github.com/ironcore-dev/metal-operator/bmc" "github.com/ironcore-dev/metal-operator/internal/bmcutils" + "github.com/ironcore-dev/metal-operator/internal/serverevents" "github.com/stmcginnis/gofish/common" "github.com/stmcginnis/gofish/redfish" @@ -59,6 +60,7 @@ type BMCReconciler struct { BMCFailureResetDelay time.Duration BMCOptions bmc.Options ManagerNamespace string + EventURL string // BMCResetWaitTime defines the duration to wait after a BMC reset before attempting reconciliation again. BMCResetWaitTime time.Duration // BMCClientRetryInterval defines the duration to requeue reconciliation after a BMC client error/reset/unavailablility. @@ -103,6 +105,14 @@ func (r *BMCReconciler) delete(ctx context.Context, log logr.Logger, bmcObj *met } } + bmcClient, err := bmcutils.GetBMCClientFromBMC(ctx, r.Client, bmcObj, r.Insecure, r.BMCOptions) + if err == nil { + defer bmcClient.Logout() + if err := r.deleteEventSubscription(ctx, log, bmcClient, bmcObj); err != nil { + return ctrl.Result{}, fmt.Errorf("failed to delete event subscriptions: %w", err) + } + } + if _, err := clientutils.PatchEnsureNoFinalizer(ctx, r.Client, bmcObj, BMCFinalizer); err != nil { return ctrl.Result{}, err } @@ -172,6 +182,10 @@ func (r *BMCReconciler) reconcile(ctx context.Context, log logr.Logger, bmcObj * } log.V(1).Info("Discovered servers") + if modified, err := r.handleEventSubscriptions(ctx, log, bmcClient, bmcObj); err != nil || modified { + return ctrl.Result{}, err + } + log.V(1).Info("Reconciled BMC") return ctrl.Result{}, nil } @@ -449,6 +463,55 @@ func (r *BMCReconciler) updateConditions(ctx context.Context, bmcObj *metalv1alp return nil } +func (r *BMCReconciler) handleEventSubscriptions(ctx context.Context, log logr.Logger, bmcClient bmc.BMC, bmc *metalv1alpha1.BMC) (bool, error) { + if r.EventURL == "" { + return false, nil + } + log.V(1).Info("Handling event subscriptions for BMC") + metricsReportLink := "" + eventLink := "" + + if bmc.Status.MetricsReportSubscriptionLink != "" && bmc.Status.EventsSubscriptionLink != "" { + return false, nil + } + metricsReportLink, err := serverevents.SubscribeMetricsReport(ctx, r.EventURL, bmc.Name, bmcClient) + if err != nil { + return false, fmt.Errorf("failed to subscribe to server metrics report: %w", err) + } + eventLink, err = serverevents.SubscribeEvents(ctx, r.EventURL, bmc.Name, bmcClient) + if err != nil { + return false, fmt.Errorf("failed to subscribe to server alerts: %w", err) + } + log.Info("event subscriptions created", "metricsReportLink", metricsReportLink, "eventLink", eventLink) + bmcBase := bmc.DeepCopy() + bmc.Status.MetricsReportSubscriptionLink = metricsReportLink + bmc.Status.EventsSubscriptionLink = eventLink + if err := r.Status().Patch(ctx, bmc, client.MergeFrom(bmcBase)); err != nil { + return false, fmt.Errorf("failed to patch server status with subscription links: %w", err) + } + log.Info("Subscribed to server events and metrics") + return true, nil +} + +func (r *BMCReconciler) deleteEventSubscription(ctx context.Context, log logr.Logger, bmcClient bmc.BMC, bmc *metalv1alpha1.BMC) error { + if r.EventURL == "" { + return nil + } + if bmc.Status.MetricsReportSubscriptionLink != "" { + if err := bmcClient.DeleteEventSubscription(ctx, bmc.Status.MetricsReportSubscriptionLink); err != nil { + return fmt.Errorf("failed to unsubscribe from server metrics report: %w", err) + } + log.V(1).Info("Unsubscribed from server metrics report") + } + if bmc.Status.EventsSubscriptionLink != "" { + if err := bmcClient.DeleteEventSubscription(ctx, bmc.Status.EventsSubscriptionLink); err != nil { + return fmt.Errorf("failed to unsubscribe from server events: %w", err) + } + log.V(1).Info("Unsubscribed from server events") + } + return nil +} + func (r *BMCReconciler) enqueueBMCByEndpoint(ctx context.Context, obj client.Object) []ctrl.Request { return []ctrl.Request{ { diff --git a/internal/controller/bmc_controller_test.go b/internal/controller/bmc_controller_test.go index 38e94c4c..ffb5f0ec 100644 --- a/internal/controller/bmc_controller_test.go +++ b/internal/controller/bmc_controller_test.go @@ -61,6 +61,8 @@ var _ = Describe("BMC Controller", func() { HaveField("Status.State", metalv1alpha1.BMCStateEnabled), HaveField("Status.PowerState", metalv1alpha1.OnPowerState), HaveField("Status.FirmwareVersion", "1.45.455b66-rev4"), + HaveField("Status.MetricsReportSubscriptionLink", Equal("/redfish/v1/EventService/Subscriptions/5")), + HaveField("Status.EventsSubscriptionLink", Equal("/redfish/v1/EventService/Subscriptions/6")), )) By("Ensuring that the Server resource will be created") @@ -143,6 +145,8 @@ var _ = Describe("BMC Controller", func() { HaveField("Status.State", metalv1alpha1.BMCStateEnabled), HaveField("Status.PowerState", metalv1alpha1.OnPowerState), HaveField("Status.FirmwareVersion", "1.45.455b66-rev4"), + HaveField("Status.MetricsReportSubscriptionLink", Equal("/redfish/v1/EventService/Subscriptions/5")), + HaveField("Status.EventsSubscriptionLink", Equal("/redfish/v1/EventService/Subscriptions/6")), )) By("Ensuring that the Server resource has been created") diff --git a/internal/controller/server_controller.go b/internal/controller/server_controller.go index f564c8ab..236eb62a 100644 --- a/internal/controller/server_controller.go +++ b/internal/controller/server_controller.go @@ -169,7 +169,6 @@ func (r *ServerReconciler) delete(ctx context.Context, log logr.Logger, server * return ctrl.Result{}, err } log.V(1).Info("Ensured that the finalizer has been removed") - log.V(1).Info("Deleted server") return ctrl.Result{}, nil } diff --git a/internal/controller/suite_test.go b/internal/controller/suite_test.go index ebe6a61a..e77a4400 100644 --- a/internal/controller/suite_test.go +++ b/internal/controller/suite_test.go @@ -14,6 +14,7 @@ import ( "github.com/ironcore-dev/controller-utils/conditionutils" metalv1alpha1 "github.com/ironcore-dev/metal-operator/api/v1alpha1" + "github.com/ironcore-dev/metal-operator/bmc" "github.com/ironcore-dev/metal-operator/bmc/mock/server" "github.com/ironcore-dev/metal-operator/internal/api/macdb" @@ -171,6 +172,7 @@ func SetupTest(redfishMockServers []netip.AddrPort) *corev1.Namespace { ManagerNamespace: ns.Name, BMCResetWaitTime: 400 * time.Millisecond, BMCClientRetryInterval: 25 * time.Millisecond, + EventURL: "http://localhost:8008", Conditions: accessor, }).SetupWithManager(k8sManager)).To(Succeed()) diff --git a/internal/serverevents/server.go b/internal/serverevents/server.go new file mode 100644 index 00000000..1340a7df --- /dev/null +++ b/internal/serverevents/server.go @@ -0,0 +1,173 @@ +// SPDX-FileCopyrightText: 2024 SAP SE or an SAP affiliate company and IronCore contributors +// SPDX-License-Identifier: Apache-2.0 + +package serverevents + +import ( + "context" + "encoding/json" + "errors" + "fmt" + "net/http" + "path" + "strconv" + + "github.com/go-logr/logr" + "github.com/prometheus/client_golang/prometheus" + "sigs.k8s.io/controller-runtime/pkg/metrics" +) + +type Server struct { + addr string + mux *http.ServeMux + log logr.Logger +} + +var ( + alertsGauge = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "redfish_alerts_total", + Help: "Number of redfish alerts", + }, + []string{"hostname", "vendor", "severity"}, + ) + metricsReportCollectors map[string]*prometheus.GaugeVec +) + +type MetricsReport struct { + MetricsValues []MetricsValue `json:"MetricsValues"` +} + +type MetricsValue struct { + MetricId string `json:"MetricId"` + MetricProperty string `json:"MetricProperty"` + MetricValue string `json:"MetricValue"` + Timestamp string `json:"Timestamp"` + Oem interface{} `json:"Oem"` +} + +type EventData struct { + Events []Event `json:"Alerts"` + Name string `json:"Name"` +} + +type Event struct { + EventID string `json:"EventId"` + Message string `json:"Message"` + Severity string `json:"Severity"` + EventTimestamp string `json:"EventTimestamp"` +} + +func init() { + // Register custom metrics with the global prometheus registry + metrics.Registry.MustRegister(alertsGauge) +} + +func NewServer(log logr.Logger, addr string) *Server { + mux := http.NewServeMux() + server := &Server{ + addr: addr, + mux: mux, + log: log, + } + server.routes() + return server +} + +func (s *Server) routes() { + s.mux.HandleFunc("/serverevents/alerts", s.alertHandler) + s.mux.HandleFunc("/serverevents/metricsreport", s.metricsreportHandler) +} + +func (s *Server) alertHandler(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodPost { + http.Error(w, "Only POST method is allowed", http.StatusMethodNotAllowed) + return + } + s.log.Info("Received alert data") + // expected path: /serverevents/alerts/{vendor}/{hostname} + hostname := path.Base(r.URL.Path) + vendor := path.Base(path.Dir(r.URL.Path)) + eventData := EventData{} + if err := json.NewDecoder(r.Body).Decode(&eventData); err != nil { + http.Error(w, err.Error(), http.StatusBadRequest) + return + } + totalWarnings := 0 + totalCriticals := 0 + for _, event := range eventData.Events { + if event.Severity == "Warning" { + totalWarnings++ + } + if event.Severity == "Critical" { + totalCriticals++ + } + } + alertsGauge.WithLabelValues(hostname, vendor, "Warning").Set(float64(totalWarnings)) + alertsGauge.WithLabelValues(hostname, vendor, "Critical").Set(float64(totalCriticals)) + w.WriteHeader(http.StatusOK) +} + +func (s *Server) metricsreportHandler(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodPost { + http.Error(w, "Only POST method is allowed", http.StatusMethodNotAllowed) + return + } + // expected path: /serverevents/metricsreport/{vendor}/{hostname} + hostname := path.Base(r.URL.Path) + vendor := path.Base(path.Dir(r.URL.Path)) + s.log.Info("receieved metrics report", "hostname", hostname) + metricsReport := MetricsReport{} + if err := json.NewDecoder(r.Body).Decode(&metricsReport); err != nil { + http.Error(w, err.Error(), http.StatusBadRequest) + } + for _, mv := range metricsReport.MetricsValues { + if _, ok := metricsReportCollectors[mv.MetricId]; !ok { + gauge := prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Name: mv.MetricProperty, + Help: "Metric with ID " + mv.MetricId, + }, + []string{"hostname", "vendor"}, + ) + metricsReportCollectors[mv.MetricId] = gauge + metrics.Registry.MustRegister(gauge) + } + floatVal, err := strconv.ParseFloat(mv.MetricValue, 64) + if err != nil { + if mv.MetricValue == "Up" || mv.MetricValue == "Operational" { + floatVal = 1 + } + } + metricsReportCollectors[mv.MetricId].WithLabelValues(hostname, vendor).Set(floatVal) + s.log.Info("Metric", "id", mv.MetricId, "property", mv.MetricProperty, "value", mv.MetricValue, "timestamp", mv.Timestamp) + } + w.WriteHeader(http.StatusOK) +} + +// Start starts the server on the specified address and adds logging for key events. +func (s *Server) Start(ctx context.Context) error { + s.log.Info("Starting registry server", "address", s.addr) + server := &http.Server{Addr: s.addr, Handler: s.mux} + + errChan := make(chan error, 1) + go func() { + if err := server.ListenAndServe(); !errors.Is(err, http.ErrServerClosed) { + errChan <- fmt.Errorf("HTTP registry server ListenAndServe: %w", err) + } + }() + select { + case <-ctx.Done(): + s.log.Info("Shutting down registry server...") + if err := server.Shutdown(ctx); err != nil { + return fmt.Errorf("HTTP server Shutdown: %w", err) + } + s.log.Info("Registry server graciously stopped") + return nil + case err := <-errChan: + if shutdownErr := server.Shutdown(ctx); shutdownErr != nil { + s.log.Error(shutdownErr, "Error shutting down registry server") + } + return err + } +} diff --git a/internal/serverevents/subscription.go b/internal/serverevents/subscription.go new file mode 100644 index 00000000..2a4d05c1 --- /dev/null +++ b/internal/serverevents/subscription.go @@ -0,0 +1,40 @@ +// SPDX-FileCopyrightText: 2024 SAP SE or an SAP affiliate company and IronCore contributors +// SPDX-License-Identifier: Apache-2.0 + +package serverevents + +import ( + "context" + "fmt" + + "github.com/ironcore-dev/metal-operator/bmc" + "github.com/stmcginnis/gofish/redfish" +) + +// SubscribeMetricsReport subscribes to Redfish metric reporting events for the given hostname and callback URL. +func SubscribeMetricsReport(ctx context.Context, url, hostname string, bmcClient bmc.BMC) (string, error) { + link, err := bmcClient.CreateEventSubscription( + ctx, + fmt.Sprintf("%s/%s/metrics", url, hostname), + redfish.MetricReportEventFormatType, + redfish.TerminateAfterRetriesDeliveryRetryPolicy, + ) + if err != nil { + return "", fmt.Errorf("failed to create event subscription: %w", err) + } + return link, nil +} + +// SubscribeEvents creates a Redfish event subscription for events. +func SubscribeEvents(ctx context.Context, url, hostname string, bmcClient bmc.BMC) (string, error) { + link, err := bmcClient.CreateEventSubscription( + ctx, + fmt.Sprintf("%s/%s/alerts", url, hostname), + redfish.EventEventFormatType, + redfish.TerminateAfterRetriesDeliveryRetryPolicy, + ) + if err != nil { + return "", fmt.Errorf("failed to create alert subscription: %w", err) + } + return link, nil +} diff --git a/test/serverevents/main.go b/test/serverevents/main.go new file mode 100644 index 00000000..28088dfb --- /dev/null +++ b/test/serverevents/main.go @@ -0,0 +1,56 @@ +// SPDX-FileCopyrightText: 2024 SAP SE or an SAP affiliate company and IronCore contributors +// SPDX-License-Identifier: Apache-2.0 + +package main + +import ( + "flag" + "os" + "time" + + "github.com/ironcore-dev/metal-operator/internal/serverevents" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/log/zap" +) + +var ( + setupLog = ctrl.Log.WithName("setup") +) + +func main() { + var registryURL string + var serverUUID string + var duration time.Duration + + flag.StringVar(®istryURL, "registry-url", "", "Registry URL where the probe will register itself.") + flag.StringVar(&serverUUID, "server-uuid", "", "Agent UUID to register with the registry.") + flag.DurationVar(&duration, "duration", 5*time.Second, "Duration of time to wait between checks.") + + opts := zap.Options{ + Development: true, + } + opts.BindFlags(flag.CommandLine) + flag.Parse() + + ctrl.SetLogger(zap.New(zap.UseFlagOptions(&opts))) + + if serverUUID == "" { + setupLog.Error(nil, "server uuid is missing") + os.Exit(1) + } + + if registryURL == "" { + setupLog.Error(nil, "registry URL is missing") + os.Exit(1) + } + + ctx := ctrl.SetupSignalHandler() + + setupLog.Info("starting serverevent agent") + server := serverevents.NewServer(setupLog, ":8888") + + if err := server.Start(ctx); err != nil { + setupLog.Error(err, "problem running telemetry server") + os.Exit(1) + } +}