Skip to content

Commit 2890347

Browse files
author
Per Goncalves da Silva
committed
Remove ephemeral aspects ofnetwork errors on Progressing condition
Signed-off-by: Per Goncalves da Silva <pegoncal@redhat.com>
1 parent 10c2841 commit 2890347

File tree

5 files changed

+332
-2
lines changed

5 files changed

+332
-2
lines changed

internal/catalogd/controllers/core/clustercatalog_controller.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ import (
4040

4141
ocv1 "github.com/operator-framework/operator-controller/api/v1"
4242
"github.com/operator-framework/operator-controller/internal/catalogd/storage"
43+
errorutil "github.com/operator-framework/operator-controller/internal/shared/util/error"
4344
imageutil "github.com/operator-framework/operator-controller/internal/shared/util/image"
4445
k8sutil "github.com/operator-framework/operator-controller/internal/shared/util/k8s"
4546
)
@@ -336,7 +337,7 @@ func updateStatusProgressing(status *ocv1.ClusterCatalogStatus, generation int64
336337
if err != nil {
337338
progressingCond.Status = metav1.ConditionTrue
338339
progressingCond.Reason = ocv1.ReasonRetrying
339-
progressingCond.Message = err.Error()
340+
progressingCond.Message = errorutil.SanitizeNetworkError(err)
340341
}
341342

342343
if errors.Is(err, reconcile.TerminalError(nil)) {

internal/operator-controller/controllers/common_controller.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -157,7 +157,7 @@ func setStatusProgressing(ext *ocv1.ClusterExtension, err error) {
157157
if err != nil {
158158
progressingCond.Reason = ocv1.ReasonRetrying
159159
// Unwrap TerminalError to avoid "terminal error:" prefix in message
160-
progressingCond.Message = errorutil.UnwrapTerminal(err).Error()
160+
progressingCond.Message = errorutil.SanitizeNetworkError(errorutil.UnwrapTerminal(err))
161161
}
162162

163163
if errors.Is(err, reconcile.TerminalError(nil)) {

internal/operator-controller/controllers/common_controller_test.go

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ package controllers
33
import (
44
"errors"
55
"fmt"
6+
"net"
67
"strings"
78
"testing"
89

@@ -68,6 +69,73 @@ func TestSetStatusProgressing(t *testing.T) {
6869
Message: "missing required field",
6970
},
7071
},
72+
{
73+
name: "non-nil ClusterExtension, non-terminal network error with source port, Progressing condition message has source port stripped",
74+
err: fmt.Errorf("source catalog content: %w", &net.OpError{
75+
Op: "dial",
76+
Net: "tcp",
77+
Source: &net.TCPAddr{
78+
IP: net.ParseIP("10.0.0.1"),
79+
Port: 52341,
80+
},
81+
Addr: &net.TCPAddr{
82+
IP: net.ParseIP("192.168.1.100"),
83+
Port: 443,
84+
},
85+
Err: fmt.Errorf("connect: connection refused"),
86+
}),
87+
clusterExtension: &ocv1.ClusterExtension{},
88+
expected: metav1.Condition{
89+
Type: ocv1.TypeProgressing,
90+
Status: metav1.ConditionTrue,
91+
Reason: ocv1.ReasonRetrying,
92+
Message: "dial tcp: connect: connection refused",
93+
},
94+
},
95+
{
96+
name: "non-nil ClusterExtension, non-terminal DNS error, Progressing condition message is sanitized",
97+
err: fmt.Errorf("source catalog content: %w", &net.OpError{
98+
Op: "dial",
99+
Net: "tcp",
100+
Err: &net.DNSError{
101+
IsTemporary: true,
102+
IsTimeout: true,
103+
Server: "10.96.0.10:53",
104+
Name: "registry.example.com",
105+
Err: "read udp 10.244.0.8:46753->10.96.0.10:53",
106+
},
107+
}),
108+
clusterExtension: &ocv1.ClusterExtension{},
109+
expected: metav1.Condition{
110+
Type: ocv1.TypeProgressing,
111+
Status: metav1.ConditionTrue,
112+
Reason: ocv1.ReasonRetrying,
113+
Message: "lookup registry.example.com on 10.96.0.10:53: i/o timeout",
114+
},
115+
},
116+
{
117+
name: "non-nil ClusterExtension, terminal network error, Progressing condition message has source port stripped",
118+
err: reconcile.TerminalError(fmt.Errorf("source catalog content: %w", &net.OpError{
119+
Op: "dial",
120+
Net: "tcp",
121+
Source: &net.TCPAddr{
122+
IP: net.ParseIP("10.0.0.1"),
123+
Port: 52341,
124+
},
125+
Addr: &net.TCPAddr{
126+
IP: net.ParseIP("192.168.1.100"),
127+
Port: 443,
128+
},
129+
Err: fmt.Errorf("connect: connection refused"),
130+
})),
131+
clusterExtension: &ocv1.ClusterExtension{},
132+
expected: metav1.Condition{
133+
Type: ocv1.TypeProgressing,
134+
Status: metav1.ConditionFalse,
135+
Reason: ocv1.ReasonBlocked,
136+
Message: "dial tcp: connect: connection refused",
137+
},
138+
},
71139
} {
72140
t.Run(tc.name, func(t *testing.T) {
73141
setStatusProgressing(tc.clusterExtension, tc.err)
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
package error
2+
3+
import (
4+
"regexp"
5+
)
6+
7+
var ephemeralNetworkErrorPattern = regexp.MustCompile(`(read|write) (tcp|udp) ((?:[0-9]{1,3}(?:\.[0-9]{1,3}){3}|\[[0-9a-fA-F:]+\])(?::\d+)?)->((?:[0-9]{1,3}(?:\.[0-9]{1,3}){3}|\[[0-9a-fA-F:]+\])(?::\d+)?)(: )?`)
8+
9+
// SanitizeNetworkError returns a stable, deterministic error message for network
10+
// errors by stripping ephemeral details (such as source and destination addresses
11+
// and ports) from low-level socket operations. This ensures consistent error
12+
// messages across retries, preventing unnecessary status condition updates when
13+
// the only change is an ephemeral source port.
14+
//
15+
// The function uses a regex to remove substrings matching the pattern
16+
// "read/write tcp/udp <src>-><dst>: " (with IPv4 or IPv6 addresses), which
17+
// commonly appear inside [net.DNSError] Err fields (e.g.,
18+
// "read udp 10.244.0.8:46753->10.96.0.10:53: i/o timeout").
19+
//
20+
// Returns "" for nil errors, or the sanitized error string otherwise.
21+
func SanitizeNetworkError(err error) string {
22+
if err == nil {
23+
return ""
24+
}
25+
return ephemeralNetworkErrorPattern.ReplaceAllString(err.Error(), "")
26+
}
Lines changed: 235 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,235 @@
1+
package error
2+
3+
import (
4+
"fmt"
5+
"net"
6+
"testing"
7+
8+
"github.com/stretchr/testify/assert"
9+
)
10+
11+
func TestSanitizeNetworkError(t *testing.T) {
12+
for _, tc := range []struct {
13+
name string
14+
err error
15+
expected string
16+
}{
17+
{
18+
name: "nil error returns empty string",
19+
err: nil,
20+
expected: "",
21+
},
22+
{
23+
name: "non-network error returns original message",
24+
err: fmt.Errorf("some random error"),
25+
expected: "some random error",
26+
},
27+
{
28+
name: "strips read udp address pattern from DNS inner error",
29+
err: &net.OpError{
30+
Op: "dial",
31+
Net: "tcp",
32+
Err: &net.DNSError{
33+
IsNotFound: false,
34+
IsTemporary: true,
35+
IsTimeout: true,
36+
Server: "10.96.0.10:53",
37+
Name: "docker-registry.operator-controller.svc",
38+
Err: "read udp 10.244.0.8:46753->10.96.0.10:53: i/o timeout",
39+
},
40+
},
41+
expected: "dial tcp: lookup docker-registry.operator-controller.svc on 10.96.0.10:53: i/o timeout",
42+
},
43+
{
44+
name: "strips read udp pattern from wrapped error preserving outer context",
45+
err: fmt.Errorf("source catalog content: error creating image source: %w", &net.OpError{
46+
Op: "dial",
47+
Net: "tcp",
48+
Err: &net.DNSError{
49+
IsNotFound: false,
50+
IsTemporary: true,
51+
IsTimeout: true,
52+
Server: "10.96.0.10:53",
53+
Name: "docker-registry.operator-controller.svc",
54+
Err: "read udp 10.244.0.8:46753->10.96.0.10:53: i/o timeout",
55+
},
56+
}),
57+
expected: "source catalog content: error creating image source: dial tcp: lookup docker-registry.operator-controller.svc on 10.96.0.10:53: i/o timeout",
58+
},
59+
{
60+
name: "net.DNSError with IsNotFound unchanged",
61+
err: &net.DNSError{
62+
IsNotFound: true,
63+
Server: "10.96.0.10:53",
64+
Name: "registry.example.com",
65+
Err: "no such host",
66+
},
67+
expected: "lookup registry.example.com on 10.96.0.10:53: no such host",
68+
},
69+
{
70+
name: "net.DNSError without server unchanged",
71+
err: &net.DNSError{
72+
IsNotFound: true,
73+
Name: "registry.example.com",
74+
Err: "no such host",
75+
},
76+
expected: "lookup registry.example.com: no such host",
77+
},
78+
{
79+
name: "net.DNSError without read/write pattern unchanged",
80+
err: &net.DNSError{
81+
Server: "10.96.0.10:53",
82+
Name: "registry.example.com",
83+
Err: "server misbehaving",
84+
},
85+
expected: "lookup registry.example.com on 10.96.0.10:53: server misbehaving",
86+
},
87+
{
88+
name: "wrapped net.DNSError without read/write pattern preserves wrapping",
89+
err: fmt.Errorf("source catalog content: %w", &net.DNSError{
90+
Server: "10.96.0.10:53",
91+
Name: "registry.example.com",
92+
Err: "server misbehaving",
93+
}),
94+
expected: "source catalog content: lookup registry.example.com on 10.96.0.10:53: server misbehaving",
95+
},
96+
{
97+
name: "net.DNSError with both IsTimeout and IsNotFound unchanged",
98+
err: &net.DNSError{
99+
IsTimeout: true,
100+
IsNotFound: true,
101+
Server: "10.96.0.10:53",
102+
Name: "registry.example.com",
103+
Err: "i/o timeout",
104+
},
105+
expected: "lookup registry.example.com on 10.96.0.10:53: i/o timeout",
106+
},
107+
{
108+
name: "net.DNSError without server unchanged",
109+
err: &net.DNSError{
110+
IsTimeout: true,
111+
Name: "registry.example.com",
112+
Err: "i/o timeout",
113+
},
114+
expected: "lookup registry.example.com: i/o timeout",
115+
},
116+
{
117+
name: "net.OpError dial with source and addr unchanged",
118+
err: &net.OpError{
119+
Op: "dial",
120+
Net: "tcp",
121+
Source: &net.TCPAddr{
122+
IP: net.ParseIP("10.0.0.1"),
123+
Port: 52341,
124+
},
125+
Addr: &net.TCPAddr{
126+
IP: net.ParseIP("192.168.1.100"),
127+
Port: 443,
128+
},
129+
Err: fmt.Errorf("connect: connection refused"),
130+
},
131+
expected: "dial tcp 10.0.0.1:52341->192.168.1.100:443: connect: connection refused",
132+
},
133+
{
134+
name: "net.OpError dial without source unchanged",
135+
err: &net.OpError{
136+
Op: "dial",
137+
Net: "tcp",
138+
Addr: &net.TCPAddr{
139+
IP: net.ParseIP("192.168.1.100"),
140+
Port: 443,
141+
},
142+
Err: fmt.Errorf("connect: connection refused"),
143+
},
144+
expected: "dial tcp 192.168.1.100:443: connect: connection refused",
145+
},
146+
{
147+
name: "wrapped net.OpError dial preserves full error string",
148+
err: fmt.Errorf("source catalog content: error creating image source: %w", &net.OpError{
149+
Op: "dial",
150+
Net: "tcp",
151+
Source: &net.TCPAddr{
152+
IP: net.ParseIP("10.0.0.1"),
153+
Port: 52341,
154+
},
155+
Addr: &net.TCPAddr{
156+
IP: net.ParseIP("192.168.1.100"),
157+
Port: 443,
158+
},
159+
Err: fmt.Errorf("connect: connection refused"),
160+
}),
161+
expected: "source catalog content: error creating image source: dial tcp 10.0.0.1:52341->192.168.1.100:443: connect: connection refused",
162+
},
163+
{
164+
name: "strips read tcp pattern",
165+
err: fmt.Errorf("read tcp 10.0.0.1:52341->192.168.1.100:443: connection reset by peer"),
166+
expected: "connection reset by peer",
167+
},
168+
{
169+
name: "strips write tcp pattern",
170+
err: fmt.Errorf("write tcp 10.0.0.1:52341->192.168.1.100:443: broken pipe"),
171+
expected: "broken pipe",
172+
},
173+
{
174+
name: "strips read udp pattern with IPv6 addresses",
175+
err: fmt.Errorf("read udp [::1]:52341->[fd00::1]:53: i/o timeout"),
176+
expected: "i/o timeout",
177+
},
178+
} {
179+
t.Run(tc.name, func(t *testing.T) {
180+
result := SanitizeNetworkError(tc.err)
181+
assert.Equal(t, tc.expected, result)
182+
})
183+
}
184+
}
185+
186+
func TestSanitizeNetworkErrorConsistency(t *testing.T) {
187+
// Simulate DNS errors with different ephemeral ports in the read udp pattern
188+
makeError := func(sourcePort int) error {
189+
return &net.OpError{
190+
Op: "dial",
191+
Net: "tcp",
192+
Err: &net.DNSError{
193+
IsTemporary: true,
194+
IsTimeout: true,
195+
Server: "10.96.0.10:53",
196+
Name: "docker-registry.operator-controller.svc",
197+
Err: fmt.Sprintf("read udp 10.244.0.8:%d->10.96.0.10:53: i/o timeout", sourcePort),
198+
},
199+
}
200+
}
201+
202+
// Different source ports should produce the same sanitized message
203+
msg1 := SanitizeNetworkError(makeError(46753))
204+
msg2 := SanitizeNetworkError(makeError(51234))
205+
msg3 := SanitizeNetworkError(makeError(60000))
206+
207+
assert.Equal(t, msg1, msg2, "sanitized messages with different source ports should be equal")
208+
assert.Equal(t, msg2, msg3, "sanitized messages with different source ports should be equal")
209+
assert.Equal(t, "dial tcp: lookup docker-registry.operator-controller.svc on 10.96.0.10:53: i/o timeout", msg1)
210+
}
211+
212+
func TestSanitizeNetworkErrorDNSConsistency(t *testing.T) {
213+
// Simulate DNS errors with different ephemeral ports wrapped in context
214+
makeError := func(sourcePort int) error {
215+
return fmt.Errorf("source catalog content: error creating image source: %w", &net.OpError{
216+
Op: "dial",
217+
Net: "tcp",
218+
Err: &net.DNSError{
219+
IsTemporary: true,
220+
IsTimeout: true,
221+
Server: "10.96.0.10:53",
222+
Name: "registry.example.com",
223+
Err: fmt.Sprintf("read udp 10.244.0.8:%d->10.96.0.10:53: i/o timeout", sourcePort),
224+
},
225+
})
226+
}
227+
228+
msg1 := SanitizeNetworkError(makeError(46753))
229+
msg2 := SanitizeNetworkError(makeError(51234))
230+
msg3 := SanitizeNetworkError(makeError(60000))
231+
232+
assert.Equal(t, msg1, msg2, "sanitized DNS messages with different source ports should be equal")
233+
assert.Equal(t, msg2, msg3, "sanitized DNS messages with different source ports should be equal")
234+
assert.Equal(t, "source catalog content: error creating image source: dial tcp: lookup registry.example.com on 10.96.0.10:53: i/o timeout", msg1)
235+
}

0 commit comments

Comments
 (0)