diff --git a/pkg/controller/registry/reconciler/grpc.go b/pkg/controller/registry/reconciler/grpc.go
index 57cedd0ebc..024e298691 100644
--- a/pkg/controller/registry/reconciler/grpc.go
+++ b/pkg/controller/registry/reconciler/grpc.go
@@ -30,6 +30,8 @@ const (
 	CatalogSourceUpdateKey      = "catalogsource.operators.coreos.com/update"
 	ServiceHashLabelKey         = "olm.service-spec-hash"
 	CatalogPollingRequeuePeriod = 30 * time.Second
+	// containerReasonCrashLoopBackOff is the kubelet Waiting reason set when a container is backing off after repeated crashes.
+	containerReasonCrashLoopBackOff = "CrashLoopBackOff"
 )
 
 // grpcCatalogSourceDecorator wraps CatalogSource to add additional methods
@@ -754,9 +756,25 @@ func swapLabels(pod *corev1.Pod, labelKey, updateKey string) *corev1.Pod {
 	return pod
 }
 
+// podContainersArePathological returns true if any container or init container in the pod is waiting in CrashLoopBackOff.
+func podContainersArePathological(pod *corev1.Pod) bool {
+	for _, s := range pod.Status.ContainerStatuses {
+		if s.State.Waiting != nil && s.State.Waiting.Reason == containerReasonCrashLoopBackOff {
+			return true
+		}
+	}
+	for _, s := range pod.Status.InitContainerStatuses {
+		if s.State.Waiting != nil && s.State.Waiting.Reason == containerReasonCrashLoopBackOff {
+			return true
+		}
+	}
+	// TODO: a catalog source pod currently has no ephemeral containers; should we add checks anyway?
+	return false
+}
+
 // podFailed checks whether the pod status is in a failed or unknown state, and deletes the pod if so.
 func (c *GrpcRegistryReconciler) podFailed(pod *corev1.Pod) (bool, error) {
-	if pod.Status.Phase == corev1.PodFailed || pod.Status.Phase == corev1.PodUnknown {
+	if pod.Status.Phase == corev1.PodFailed || pod.Status.Phase == corev1.PodUnknown || podContainersArePathological(pod) {
 		logrus.WithField("UpdatePod", pod.GetName()).Infof("catalog polling result: update pod %s failed to start", pod.GetName())
 		err := c.removePods([]*corev1.Pod{pod}, pod.GetNamespace())
 		if err != nil {
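
To make the new behavior concrete, here is a minimal test sketch for `podContainersArePathological`. It is not part of the diff above; the test name and scenarios are illustrative, and it assumes the test file lives in the same `reconciler` package so the unexported function is reachable.

```go
package reconciler

import (
	"testing"

	corev1 "k8s.io/api/core/v1"
)

// Hypothetical test sketch: a pod with a CrashLoopBackOff container is
// pathological; a pod merely waiting on an image pull is not.
func TestPodContainersArePathological(t *testing.T) {
	crashLooping := &corev1.Pod{
		Status: corev1.PodStatus{
			ContainerStatuses: []corev1.ContainerStatus{
				{State: corev1.ContainerState{
					Waiting: &corev1.ContainerStateWaiting{Reason: "CrashLoopBackOff"},
				}},
			},
		},
	}
	if !podContainersArePathological(crashLooping) {
		t.Error("expected a CrashLoopBackOff container to be flagged as pathological")
	}

	// Init containers are checked too, but only for CrashLoopBackOff;
	// other Waiting reasons should not trigger the check.
	pullingImage := &corev1.Pod{
		Status: corev1.PodStatus{
			InitContainerStatuses: []corev1.ContainerStatus{
				{State: corev1.ContainerState{
					Waiting: &corev1.ContainerStateWaiting{Reason: "ImagePullBackOff"},
				}},
			},
		},
	}
	if podContainersArePathological(pullingImage) {
		t.Error("expected a non-CrashLoopBackOff waiting reason not to be flagged")
	}
}
```

Note the design choice visible in the diff: the check keys on the kubelet's `Waiting.Reason` string rather than on restart counts, and only `CrashLoopBackOff` counts as pathological. Reasons such as `ImagePullBackOff` leave the pod alone, so `podFailed` will not delete a pod that is still waiting on its image.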