From 914cdce75f54505955a3bde798a7fd16371d225b Mon Sep 17 00:00:00 2001 From: Per Goncalves da Silva Date: Fri, 13 Feb 2026 14:28:12 +0100 Subject: [PATCH] Remove ephemeral aspects ofnetwork errors on Progressing condition Signed-off-by: Per Goncalves da Silva --- .../core/clustercatalog_controller.go | 3 +- .../controllers/common_controller.go | 2 +- .../controllers/common_controller_test.go | 45 ++++ internal/shared/util/error/sanitize.go | 26 ++ internal/shared/util/error/sanitize_test.go | 235 ++++++++++++++++++ 5 files changed, 309 insertions(+), 2 deletions(-) create mode 100644 internal/shared/util/error/sanitize.go create mode 100644 internal/shared/util/error/sanitize_test.go diff --git a/internal/catalogd/controllers/core/clustercatalog_controller.go b/internal/catalogd/controllers/core/clustercatalog_controller.go index 3d7fd935c1..fedfe500f0 100644 --- a/internal/catalogd/controllers/core/clustercatalog_controller.go +++ b/internal/catalogd/controllers/core/clustercatalog_controller.go @@ -40,6 +40,7 @@ import ( ocv1 "github.com/operator-framework/operator-controller/api/v1" "github.com/operator-framework/operator-controller/internal/catalogd/storage" + errorutil "github.com/operator-framework/operator-controller/internal/shared/util/error" imageutil "github.com/operator-framework/operator-controller/internal/shared/util/image" k8sutil "github.com/operator-framework/operator-controller/internal/shared/util/k8s" ) @@ -336,7 +337,7 @@ func updateStatusProgressing(status *ocv1.ClusterCatalogStatus, generation int64 if err != nil { progressingCond.Status = metav1.ConditionTrue progressingCond.Reason = ocv1.ReasonRetrying - progressingCond.Message = err.Error() + progressingCond.Message = errorutil.SanitizeNetworkError(err) } if errors.Is(err, reconcile.TerminalError(nil)) { diff --git a/internal/operator-controller/controllers/common_controller.go b/internal/operator-controller/controllers/common_controller.go index cb6834c6b6..e1991e4d16 100644 --- a/internal/operator-controller/controllers/common_controller.go +++ b/internal/operator-controller/controllers/common_controller.go @@ -157,7 +157,7 @@ func setStatusProgressing(ext *ocv1.ClusterExtension, err error) { if err != nil { progressingCond.Reason = ocv1.ReasonRetrying // Unwrap TerminalError to avoid "terminal error:" prefix in message - progressingCond.Message = errorutil.UnwrapTerminal(err).Error() + progressingCond.Message = errorutil.SanitizeNetworkError(errorutil.UnwrapTerminal(err)) } if errors.Is(err, reconcile.TerminalError(nil)) { diff --git a/internal/operator-controller/controllers/common_controller_test.go b/internal/operator-controller/controllers/common_controller_test.go index 3cc7575430..2055a2b891 100644 --- a/internal/operator-controller/controllers/common_controller_test.go +++ b/internal/operator-controller/controllers/common_controller_test.go @@ -3,6 +3,7 @@ package controllers import ( "errors" "fmt" + "net" "strings" "testing" @@ -68,6 +69,50 @@ func TestSetStatusProgressing(t *testing.T) { Message: "missing required field", }, }, + { + name: "non-nil ClusterExtension, non-terminal network error with source port, Progressing condition message has source port stripped", + err: fmt.Errorf("source catalog content: %w", &net.OpError{ + Op: "read", + Net: "tcp", + Source: &net.TCPAddr{ + IP: net.ParseIP("10.0.0.1"), + Port: 52341, + }, + Addr: &net.TCPAddr{ + IP: net.ParseIP("192.168.1.100"), + Port: 443, + }, + Err: fmt.Errorf("connect: connection refused"), + }), + clusterExtension: &ocv1.ClusterExtension{}, + expected: metav1.Condition{ + Type: ocv1.TypeProgressing, + Status: metav1.ConditionTrue, + Reason: ocv1.ReasonRetrying, + Message: "source catalog content: connect: connection refused", + }, + }, + { + name: "non-nil ClusterExtension, non-terminal DNS error, Progressing condition message is sanitized", + err: fmt.Errorf("source catalog content: %w", &net.OpError{ + Op: "dial", + Net: "tcp", + Err: &net.DNSError{ + IsTemporary: true, + IsTimeout: true, + Server: "10.96.0.10:53", + Name: "registry.example.com", + Err: "read udp 10.244.0.8:46753->10.96.0.10:53: i/o timeout", + }, + }), + clusterExtension: &ocv1.ClusterExtension{}, + expected: metav1.Condition{ + Type: ocv1.TypeProgressing, + Status: metav1.ConditionTrue, + Reason: ocv1.ReasonRetrying, + Message: "source catalog content: dial tcp: lookup registry.example.com on 10.96.0.10:53: i/o timeout", + }, + }, } { t.Run(tc.name, func(t *testing.T) { setStatusProgressing(tc.clusterExtension, tc.err) diff --git a/internal/shared/util/error/sanitize.go b/internal/shared/util/error/sanitize.go new file mode 100644 index 0000000000..dc1cd7ee01 --- /dev/null +++ b/internal/shared/util/error/sanitize.go @@ -0,0 +1,26 @@ +package error + +import ( + "regexp" +) + +var ephemeralNetworkErrorPattern = regexp.MustCompile(`(read|write) (tcp|udp) ((?:[0-9]{1,3}(?:\.[0-9]{1,3}){3}|\[[0-9a-fA-F:]+\])(?::\d+)?)->((?:[0-9]{1,3}(?:\.[0-9]{1,3}){3}|\[[0-9a-fA-F:]+\])(?::\d+)?)(: )?`) + +// SanitizeNetworkError returns a stable, deterministic error message for network +// errors by stripping ephemeral details (such as source and destination addresses +// and ports) from low-level socket operations. This ensures consistent error +// messages across retries, preventing unnecessary status condition updates when +// the only change is an ephemeral source port. +// +// The function uses a regex to remove substrings matching the pattern +// "read/write tcp/udp ->: " (with IPv4 or IPv6 addresses), which +// commonly appear inside [net.DNSError] Err fields (e.g., +// "read udp 10.244.0.8:46753->10.96.0.10:53: i/o timeout"). +// +// Returns "" for nil errors, or the sanitized error string otherwise. +func SanitizeNetworkError(err error) string { + if err == nil { + return "" + } + return ephemeralNetworkErrorPattern.ReplaceAllString(err.Error(), "") +} diff --git a/internal/shared/util/error/sanitize_test.go b/internal/shared/util/error/sanitize_test.go new file mode 100644 index 0000000000..c7ef92d95d --- /dev/null +++ b/internal/shared/util/error/sanitize_test.go @@ -0,0 +1,235 @@ +package error + +import ( + "fmt" + "net" + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestSanitizeNetworkError(t *testing.T) { + for _, tc := range []struct { + name string + err error + expected string + }{ + { + name: "nil error returns empty string", + err: nil, + expected: "", + }, + { + name: "non-network error returns original message", + err: fmt.Errorf("some random error"), + expected: "some random error", + }, + { + name: "strips read udp address pattern from DNS inner error", + err: &net.OpError{ + Op: "dial", + Net: "tcp", + Err: &net.DNSError{ + IsNotFound: false, + IsTemporary: true, + IsTimeout: true, + Server: "10.96.0.10:53", + Name: "docker-registry.operator-controller.svc", + Err: "read udp 10.244.0.8:46753->10.96.0.10:53: i/o timeout", + }, + }, + expected: "dial tcp: lookup docker-registry.operator-controller.svc on 10.96.0.10:53: i/o timeout", + }, + { + name: "strips read udp pattern from wrapped error preserving outer context", + err: fmt.Errorf("source catalog content: error creating image source: %w", &net.OpError{ + Op: "dial", + Net: "tcp", + Err: &net.DNSError{ + IsNotFound: false, + IsTemporary: true, + IsTimeout: true, + Server: "10.96.0.10:53", + Name: "docker-registry.operator-controller.svc", + Err: "read udp 10.244.0.8:46753->10.96.0.10:53: i/o timeout", + }, + }), + expected: "source catalog content: error creating image source: dial tcp: lookup docker-registry.operator-controller.svc on 10.96.0.10:53: i/o timeout", + }, + { + name: "net.DNSError with IsNotFound unchanged", + err: &net.DNSError{ + IsNotFound: true, + Server: "10.96.0.10:53", + Name: "registry.example.com", + Err: "no such host", + }, + expected: "lookup registry.example.com on 10.96.0.10:53: no such host", + }, + { + name: "net.DNSError without server unchanged", + err: &net.DNSError{ + IsNotFound: true, + Name: "registry.example.com", + Err: "no such host", + }, + expected: "lookup registry.example.com: no such host", + }, + { + name: "net.DNSError without read/write pattern unchanged", + err: &net.DNSError{ + Server: "10.96.0.10:53", + Name: "registry.example.com", + Err: "server misbehaving", + }, + expected: "lookup registry.example.com on 10.96.0.10:53: server misbehaving", + }, + { + name: "wrapped net.DNSError without read/write pattern preserves wrapping", + err: fmt.Errorf("source catalog content: %w", &net.DNSError{ + Server: "10.96.0.10:53", + Name: "registry.example.com", + Err: "server misbehaving", + }), + expected: "source catalog content: lookup registry.example.com on 10.96.0.10:53: server misbehaving", + }, + { + name: "net.DNSError with both IsTimeout and IsNotFound unchanged", + err: &net.DNSError{ + IsTimeout: true, + IsNotFound: true, + Server: "10.96.0.10:53", + Name: "registry.example.com", + Err: "i/o timeout", + }, + expected: "lookup registry.example.com on 10.96.0.10:53: i/o timeout", + }, + { + name: "net.DNSError without server unchanged", + err: &net.DNSError{ + IsTimeout: true, + Name: "registry.example.com", + Err: "i/o timeout", + }, + expected: "lookup registry.example.com: i/o timeout", + }, + { + name: "net.OpError dial with source and addr unchanged", + err: &net.OpError{ + Op: "dial", + Net: "tcp", + Source: &net.TCPAddr{ + IP: net.ParseIP("10.0.0.1"), + Port: 52341, + }, + Addr: &net.TCPAddr{ + IP: net.ParseIP("192.168.1.100"), + Port: 443, + }, + Err: fmt.Errorf("connect: connection refused"), + }, + expected: "dial tcp 10.0.0.1:52341->192.168.1.100:443: connect: connection refused", + }, + { + name: "net.OpError dial without source unchanged", + err: &net.OpError{ + Op: "dial", + Net: "tcp", + Addr: &net.TCPAddr{ + IP: net.ParseIP("192.168.1.100"), + Port: 443, + }, + Err: fmt.Errorf("connect: connection refused"), + }, + expected: "dial tcp 192.168.1.100:443: connect: connection refused", + }, + { + name: "wrapped net.OpError dial preserves full error string", + err: fmt.Errorf("source catalog content: error creating image source: %w", &net.OpError{ + Op: "dial", + Net: "tcp", + Source: &net.TCPAddr{ + IP: net.ParseIP("10.0.0.1"), + Port: 52341, + }, + Addr: &net.TCPAddr{ + IP: net.ParseIP("192.168.1.100"), + Port: 443, + }, + Err: fmt.Errorf("connect: connection refused"), + }), + expected: "source catalog content: error creating image source: dial tcp 10.0.0.1:52341->192.168.1.100:443: connect: connection refused", + }, + { + name: "strips read tcp pattern", + err: fmt.Errorf("read tcp 10.0.0.1:52341->192.168.1.100:443: connection reset by peer"), + expected: "connection reset by peer", + }, + { + name: "strips write tcp pattern", + err: fmt.Errorf("write tcp 10.0.0.1:52341->192.168.1.100:443: broken pipe"), + expected: "broken pipe", + }, + { + name: "strips read udp pattern with IPv6 addresses", + err: fmt.Errorf("read udp [::1]:52341->[fd00::1]:53: i/o timeout"), + expected: "i/o timeout", + }, + } { + t.Run(tc.name, func(t *testing.T) { + result := SanitizeNetworkError(tc.err) + assert.Equal(t, tc.expected, result) + }) + } +} + +func TestSanitizeNetworkErrorConsistency(t *testing.T) { + // Simulate DNS errors with different ephemeral ports in the read udp pattern + makeError := func(sourcePort int) error { + return &net.OpError{ + Op: "dial", + Net: "tcp", + Err: &net.DNSError{ + IsTemporary: true, + IsTimeout: true, + Server: "10.96.0.10:53", + Name: "docker-registry.operator-controller.svc", + Err: fmt.Sprintf("read udp 10.244.0.8:%d->10.96.0.10:53: i/o timeout", sourcePort), + }, + } + } + + // Different source ports should produce the same sanitized message + msg1 := SanitizeNetworkError(makeError(46753)) + msg2 := SanitizeNetworkError(makeError(51234)) + msg3 := SanitizeNetworkError(makeError(60000)) + + assert.Equal(t, msg1, msg2, "sanitized messages with different source ports should be equal") + assert.Equal(t, msg2, msg3, "sanitized messages with different source ports should be equal") + assert.Equal(t, "dial tcp: lookup docker-registry.operator-controller.svc on 10.96.0.10:53: i/o timeout", msg1) +} + +func TestSanitizeNetworkErrorDNSConsistency(t *testing.T) { + // Simulate DNS errors with different ephemeral ports wrapped in context + makeError := func(sourcePort int) error { + return fmt.Errorf("source catalog content: error creating image source: %w", &net.OpError{ + Op: "dial", + Net: "tcp", + Err: &net.DNSError{ + IsTemporary: true, + IsTimeout: true, + Server: "10.96.0.10:53", + Name: "registry.example.com", + Err: fmt.Sprintf("read udp 10.244.0.8:%d->10.96.0.10:53: i/o timeout", sourcePort), + }, + }) + } + + msg1 := SanitizeNetworkError(makeError(46753)) + msg2 := SanitizeNetworkError(makeError(51234)) + msg3 := SanitizeNetworkError(makeError(60000)) + + assert.Equal(t, msg1, msg2, "sanitized DNS messages with different source ports should be equal") + assert.Equal(t, msg2, msg3, "sanitized DNS messages with different source ports should be equal") + assert.Equal(t, "source catalog content: error creating image source: dial tcp: lookup registry.example.com on 10.96.0.10:53: i/o timeout", msg1) +}