From 9c2c1008bd28be5e0ed4986b51105394ff0cab4b Mon Sep 17 00:00:00 2001
From: grokspawn
Date: Wed, 11 Feb 2026 16:32:04 -0600
Subject: [PATCH 1/2] abandon evaluation of any new catalogsource image which
 pathologically restarts

Signed-off-by: grokspawn
---
 pkg/controller/registry/reconciler/grpc.go | 20 +++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/pkg/controller/registry/reconciler/grpc.go b/pkg/controller/registry/reconciler/grpc.go
index 57cedd0ebc..fe61a0cc3c 100644
--- a/pkg/controller/registry/reconciler/grpc.go
+++ b/pkg/controller/registry/reconciler/grpc.go
@@ -30,6 +30,8 @@ const (
 	CatalogSourceUpdateKey      = "catalogsource.operators.coreos.com/update"
 	ServiceHashLabelKey         = "olm.service-spec-hash"
 	CatalogPollingRequeuePeriod = 30 * time.Second
+	// pathologicalRestartCountThreshold is the number of container restarts above which a pod is considered failed for catalog polling.
+	pathologicalRestartCountThreshold = 10
 )
 
 // grpcCatalogSourceDecorator wraps CatalogSource to add additional methods
@@ -754,9 +756,25 @@ func swapLabels(pod *corev1.Pod, labelKey, updateKey string) *corev1.Pod {
 	return pod
 }
 
+// podPathologicallyRestarting returns true if any container in the pod has restarts exceeding pathologicalRestartCountThreshold.
+func podPathologicallyRestarting(pod *corev1.Pod) bool {
+	for _, s := range pod.Status.ContainerStatuses {
+		if s.RestartCount > pathologicalRestartCountThreshold {
+			return true
+		}
+	}
+	for _, s := range pod.Status.InitContainerStatuses {
+		if s.RestartCount > pathologicalRestartCountThreshold {
+			return true
+		}
+	}
+	// TODO: currently no ephemeral containers in a catalogsource, should we add checks anyway?
+	return false
+}
+
 // podFailed checks whether the pod status is in a failed or unknown state, and deletes the pod if so.
 func (c *GrpcRegistryReconciler) podFailed(pod *corev1.Pod) (bool, error) {
-	if pod.Status.Phase == corev1.PodFailed || pod.Status.Phase == corev1.PodUnknown {
+	if pod.Status.Phase == corev1.PodFailed || pod.Status.Phase == corev1.PodUnknown || podPathologicallyRestarting(pod) {
 		logrus.WithField("UpdatePod", pod.GetName()).Infof("catalog polling result: update pod %s failed to start", pod.GetName())
 		err := c.removePods([]*corev1.Pod{pod}, pod.GetNamespace())
 		if err != nil {

From 10985516833379af165f2c126aaabd93b3c62924 Mon Sep 17 00:00:00 2001
From: grokspawn
Date: Thu, 12 Feb 2026 08:43:09 -0600
Subject: [PATCH 2/2] add CLBO container considerations to detection

Signed-off-by: grokspawn
---
 pkg/controller/registry/reconciler/grpc.go | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/pkg/controller/registry/reconciler/grpc.go b/pkg/controller/registry/reconciler/grpc.go
index fe61a0cc3c..024e298691 100644
--- a/pkg/controller/registry/reconciler/grpc.go
+++ b/pkg/controller/registry/reconciler/grpc.go
@@ -30,8 +30,8 @@ const (
 	CatalogSourceUpdateKey      = "catalogsource.operators.coreos.com/update"
 	ServiceHashLabelKey         = "olm.service-spec-hash"
 	CatalogPollingRequeuePeriod = 30 * time.Second
-	// pathologicalRestartCountThreshold is the number of container restarts above which a pod is considered failed for catalog polling.
-	pathologicalRestartCountThreshold = 10
+	// containerReasonCrashLoopBackOff is the kubelet Waiting reason when a container is backing off after repeated crashes.
+	containerReasonCrashLoopBackOff = "CrashLoopBackOff"
 )
 
 // grpcCatalogSourceDecorator wraps CatalogSource to add additional methods
@@ -756,15 +756,15 @@ func swapLabels(pod *corev1.Pod, labelKey, updateKey string) *corev1.Pod {
 	return pod
 }
 
-// podPathologicallyRestarting returns true if any container in the pod has restarts exceeding pathologicalRestartCountThreshold.
-func podPathologicallyRestarting(pod *corev1.Pod) bool {
+// podContainersArePathological returns true if any container in the pod is in CrashLoopBackOff.
+func podContainersArePathological(pod *corev1.Pod) bool {
 	for _, s := range pod.Status.ContainerStatuses {
-		if s.RestartCount > pathologicalRestartCountThreshold {
+		if s.State.Waiting != nil && s.State.Waiting.Reason == containerReasonCrashLoopBackOff {
 			return true
 		}
 	}
 	for _, s := range pod.Status.InitContainerStatuses {
-		if s.RestartCount > pathologicalRestartCountThreshold {
+		if s.State.Waiting != nil && s.State.Waiting.Reason == containerReasonCrashLoopBackOff {
 			return true
 		}
 	}
@@ -774,7 +774,7 @@ func podPathologicallyRestarting(pod *corev1.Pod) bool {
 
 // podFailed checks whether the pod status is in a failed or unknown state, and deletes the pod if so.
 func (c *GrpcRegistryReconciler) podFailed(pod *corev1.Pod) (bool, error) {
-	if pod.Status.Phase == corev1.PodFailed || pod.Status.Phase == corev1.PodUnknown || podPathologicallyRestarting(pod) {
+	if pod.Status.Phase == corev1.PodFailed || pod.Status.Phase == corev1.PodUnknown || podContainersArePathological(pod) {
 		logrus.WithField("UpdatePod", pod.GetName()).Infof("catalog polling result: update pod %s failed to start", pod.GetName())
 		err := c.removePods([]*corev1.Pod{pod}, pod.GetNamespace())
 		if err != nil {
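Below is a minimal sketch of how the CrashLoopBackOff detection this series ends up with could be exercised, written as a table-driven Go test against podContainersArePathological. It is not part of either commit: the package clause (assumed to be reconciler, from the file path), the test name, and the fixture pods are hypothetical, while the corev1 types and the two identifiers taken from the diff are real.

// Illustrative sketch only; not included in the patches above.
// Package name is assumed from the directory pkg/controller/registry/reconciler.
package reconciler

import (
	"testing"

	corev1 "k8s.io/api/core/v1"
)

// TestPodContainersArePathological (hypothetical) checks that a pod is flagged
// exactly when one of its containers or init containers is waiting with
// reason CrashLoopBackOff.
func TestPodContainersArePathological(t *testing.T) {
	crashLooping := corev1.ContainerStatus{
		State: corev1.ContainerState{
			Waiting: &corev1.ContainerStateWaiting{Reason: containerReasonCrashLoopBackOff},
		},
	}
	running := corev1.ContainerStatus{
		State: corev1.ContainerState{Running: &corev1.ContainerStateRunning{}},
	}

	tests := []struct {
		name       string
		containers []corev1.ContainerStatus
		initCs     []corev1.ContainerStatus
		want       bool
	}{
		{name: "healthy pod", containers: []corev1.ContainerStatus{running}, want: false},
		{name: "crash-looping container", containers: []corev1.ContainerStatus{running, crashLooping}, want: true},
		{name: "crash-looping init container", initCs: []corev1.ContainerStatus{crashLooping}, want: true},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			pod := &corev1.Pod{
				Status: corev1.PodStatus{
					ContainerStatuses:     tt.containers,
					InitContainerStatuses: tt.initCs,
				},
			}
			if got := podContainersArePathological(pod); got != tt.want {
				t.Errorf("podContainersArePathological() = %v, want %v", got, tt.want)
			}
		})
	}
}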