Skip to content

Commit f8eb32a

Browse files
csiaddonsnode: Add retry with exponential backoff for connections
This patch adds the ability to retry connecting to the sidecar, up to a maximum of `maxRetries` attempts. If the connection still cannot be established after the final attempt, the object is considered obsolete and is deleted. The retry count is tracked inside an annotation (`connRetryAnnotation`) and is also reflected in the object's status. These transient artifacts are cleaned up once a connection is established. Signed-off-by: Niraj Yadav <niryadav@redhat.com>
1 parent b4ac322 commit f8eb32a

File tree

3 files changed

+148
-5
lines changed

3 files changed

+148
-5
lines changed

api/csiaddons/v1alpha1/csiaddonsnode_types.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,11 @@ const (
2929

3030
// Failed represents the Connection Failed state.
3131
CSIAddonsNodeStateFailed CSIAddonsNodeState = "Failed"
32+
33+
// Retrying represents the Connection Retrying state.
34+
CSIAddonsNodeStateRetrying CSIAddonsNodeState = "Retrying"
35+
36+
CSIAddonsNodeStateRetryingFmtStr = "currentAttempt: %d"
3237
)
3338

3439
type CSIAddonsNodeDriver struct {

internal/controller/csiaddons/csiaddonsnode_controller.go

Lines changed: 80 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -21,9 +21,12 @@ import (
2121
"encoding/json"
2222
"errors"
2323
"fmt"
24+
"math"
2425
"net/url"
2526
"slices"
27+
"strconv"
2628
"strings"
29+
"time"
2730

2831
csiaddonsv1alpha1 "github.com/csi-addons/kubernetes-csi-addons/api/csiaddons/v1alpha1"
2932
"github.com/csi-addons/kubernetes-csi-addons/internal/connection"
@@ -42,6 +45,19 @@ import (
4245
"sigs.k8s.io/controller-runtime/pkg/predicate"
4346
)
4447

48+
const (
49+
// The base duration to use for exponential backoff while retrying connection
50+
baseRetryDelay = 2 * time.Second
51+
52+
// Maximum attempts to make while retrying the connection
53+
maxRetries = 3
54+
55+
// The duration after which a new reconcile should be triggered
56+
// to validate the cluster state. Used only when reconciliation
57+
// completes without any errors.
58+
baseRequeueAfter = 1 * time.Hour
59+
)
60+
4561
var (
4662
csiAddonsNodeFinalizer = csiaddonsv1alpha1.GroupVersion.Group + "/csiaddonsnode"
4763
)
@@ -128,20 +144,50 @@ func (r *CSIAddonsNodeReconciler) Reconcile(ctx context.Context, req ctrl.Reques
128144

129145
logger.Info("Connecting to sidecar")
130146
newConn, err := connection.NewConnection(ctx, endPoint, nodeID, driverName, csiAddonsNode.Namespace, csiAddonsNode.Name, r.EnableAuth)
147+
148+
// If error occurs, we retry with exponential backoff until we reach `maxRetries`
149+
// Store the transient state in annotation and also update the status to reflect the same
131150
if err != nil {
132-
logger.Error(err, "Failed to establish connection with sidecar")
151+
// Only continue if we get a valid/initial retry count
152+
currentRetries, e := getRetryCountFromReason(csiAddonsNode.Status.Reason)
153+
if e != nil {
154+
logger.Error(e, "failed to get the retry count from csiAddonsNode status", "csiAddonsNodeStatus", csiAddonsNode.Status)
155+
156+
return ctrl.Result{}, e
157+
}
158+
logger.Error(err, "Failed to establish connection with sidecar", "attempt", currentRetries+1)
159+
160+
// If reached max retries, abort and cleanup
161+
if currentRetries >= maxRetries {
162+
logger.Info(fmt.Sprintf("Failed to establish connection with sidecar after %d attempts, deleting the object", maxRetries))
163+
164+
if delErr := r.Delete(ctx, csiAddonsNode); client.IgnoreNotFound(delErr) != nil {
165+
logger.Error(delErr, "failed to delete CSIAddonsNode object after max retries")
166+
167+
return ctrl.Result{}, delErr
168+
}
169+
170+
// Object is deleted, stop the reconcile phase
171+
logger.Info("successfully deleted CSIAddonsNode object due to reaching max reconnection attempts")
172+
return ctrl.Result{}, nil
173+
}
133174

134175
errMessage := util.GetErrorMessage(err)
135-
csiAddonsNode.Status.State = csiaddonsv1alpha1.CSIAddonsNodeStateFailed
136-
csiAddonsNode.Status.Message = fmt.Sprintf("Failed to establish connection with sidecar: %v", errMessage)
176+
csiAddonsNode.Status.State = csiaddonsv1alpha1.CSIAddonsNodeStateRetrying
177+
csiAddonsNode.Status.Message = fmt.Sprintf("Connection failed: %v. Retrying attempt %d/%d.", errMessage, currentRetries+1, maxRetries)
178+
csiAddonsNode.Status.Reason = fmt.Sprintf(csiaddonsv1alpha1.CSIAddonsNodeStateRetryingFmtStr, currentRetries+1)
137179
statusErr := r.Status().Update(ctx, csiAddonsNode)
138180
if statusErr != nil {
139181
logger.Error(statusErr, "Failed to update status")
140182

141183
return ctrl.Result{}, statusErr
142184
}
143185

144-
return ctrl.Result{}, err
186+
// Calculate backoff; baseRetryDelay * 1, 2, 4....
187+
backoff := baseRetryDelay * time.Duration(math.Pow(2, float64(currentRetries)))
188+
logger.Info("Requeuing request for attempting the connection again", "backoff", backoff)
189+
190+
return ctrl.Result{RequeueAfter: backoff}, nil
145191
}
146192

147193
nfsc, err := r.getNetworkFenceClientStatus(ctx, &logger, newConn, csiAddonsNode)
@@ -157,6 +203,7 @@ func (r *CSIAddonsNodeReconciler) Reconcile(ctx context.Context, req ctrl.Reques
157203

158204
csiAddonsNode.Status.State = csiaddonsv1alpha1.CSIAddonsNodeStateConnected
159205
csiAddonsNode.Status.Message = "Successfully established connection with sidecar"
206+
csiAddonsNode.Status.Reason = ""
160207
csiAddonsNode.Status.Capabilities = parseCapabilities(newConn.Capabilities)
161208
err = r.Status().Update(ctx, csiAddonsNode)
162209
if err != nil {
@@ -165,7 +212,8 @@ func (r *CSIAddonsNodeReconciler) Reconcile(ctx context.Context, req ctrl.Reques
165212
return ctrl.Result{}, err
166213
}
167214

168-
return ctrl.Result{}, nil
215+
// Reconciled successfully, requeue to validate state periodically
216+
return ctrl.Result{RequeueAfter: baseRequeueAfter}, nil
169217
}
170218

171219
// getNetworkFenceClassesForDriver gets the networkfenceclasses for the driver.
@@ -341,6 +389,10 @@ func (r *CSIAddonsNodeReconciler) resolveEndpoint(ctx context.Context, rawURL st
341389
Name: podname,
342390
}, pod)
343391
if err != nil {
392+
// do not return podname if the pod does not exist
393+
if apierrors.IsNotFound(err) {
394+
podname = ""
395+
}
344396
return podname, "", fmt.Errorf("failed to get pod %s/%s: %w", namespace, podname, err)
345397
} else if pod.Status.PodIP == "" {
346398
return podname, "", fmt.Errorf("pod %s/%s does not have an IP-address", namespace, podname)
@@ -416,3 +468,26 @@ func parseCapabilities(caps []*identity.Capability) []string {
416468

417469
return capabilities
418470
}
471+
472+
// getRetryCountFromReason extracts the retry count from a status reason
// string of the form "<label>: <count>".
//
// An empty reason means no attempt has been recorded yet, so 0 is returned.
// An error is returned when the reason contains no ":" separator, when the
// text after the first ":" is not an integer, or when the parsed count is
// negative. On error the returned count is always 0.
func getRetryCountFromReason(reason string) (int, error) {
	// Might not be updated yet, likely the 1st attempt
	if reason == "" {
		return 0, nil
	}

	// Only the first ":" separates the label from the count; anything after
	// it (including further colons) must parse as the integer.
	_, rawCount, found := strings.Cut(reason, ":")
	if !found {
		return 0, errors.New("got an unexpected length after splitting the reason string")
	}

	count, err := strconv.Atoi(strings.TrimSpace(rawCount))
	if err != nil {
		return 0, fmt.Errorf("failed to parse the reason string to an integer: %w", err)
	}

	// A retry count can never be negative; reject it here instead of letting
	// the caller derive a backoff duration from it.
	if count < 0 {
		return 0, fmt.Errorf("got a negative retry count %d in the reason string", count)
	}

	return count, nil
}

internal/controller/csiaddons/csiaddonsnode_controller_test.go

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -124,3 +124,66 @@ func TestParseCapabilities(t *testing.T) {
124124
})
125125
}
126126
}
127+
func TestGetRetryCountFromReason(t *testing.T) {
128+
tests := []struct {
129+
name string
130+
reason string
131+
want int
132+
wantErr bool
133+
}{
134+
{
135+
name: "empty reason",
136+
reason: "",
137+
want: 0,
138+
wantErr: false,
139+
},
140+
{
141+
name: "valid reason",
142+
reason: "retry: 2",
143+
want: 2,
144+
wantErr: false,
145+
},
146+
{
147+
name: "valid with extra spaces",
148+
reason: "retry: 5",
149+
want: 5,
150+
wantErr: false,
151+
},
152+
{
153+
name: "valid with trailing spaces",
154+
reason: "something: 10 ",
155+
want: 10,
156+
wantErr: false,
157+
},
158+
{
159+
name: "no colon",
160+
reason: "retry 3",
161+
want: 0,
162+
wantErr: true,
163+
},
164+
{
165+
name: "non-integer value",
166+
reason: "retry: abc",
167+
want: 0,
168+
wantErr: true,
169+
},
170+
{
171+
name: "multiple colons",
172+
reason: "prefix: 7:extra",
173+
want: 0,
174+
wantErr: true,
175+
},
176+
}
177+
178+
for _, tt := range tests {
179+
t.Run(tt.name, func(t *testing.T) {
180+
got, err := getRetryCountFromReason(tt.reason)
181+
if tt.wantErr {
182+
assert.Error(t, err)
183+
} else {
184+
assert.NoError(t, err)
185+
assert.Equal(t, tt.want, got)
186+
}
187+
})
188+
}
189+
}

0 commit comments

Comments
 (0)