Skip to content

Commit 714a22d

Browse files
csiaddonsnode: Add retry with exponential backoff for connections
This patch adds retries, up to a maximum of `maxRetries` attempts, when connecting to the sidecar. If the connection still cannot be established after those attempts, the object is considered obsolete and is deleted. The retry count is tracked in an annotation (`connRetryAnnotation`) and is also reflected in the object's status. These transient artifacts are cleaned up once a connection is established. Signed-off-by: Niraj Yadav <niryadav@redhat.com>
1 parent b4ac322 commit 714a22d

File tree

2 files changed

+89
-5
lines changed

2 files changed

+89
-5
lines changed

api/csiaddons/v1alpha1/csiaddonsnode_types.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,9 @@ const (
2929

3030
// Failed represents the Connection Failed state.
3131
CSIAddonsNodeStateFailed CSIAddonsNodeState = "Failed"
32+
33+
// Retrying represents the Connection Retrying state.
34+
CSIAddonsNodeStateRetrying CSIAddonsNodeState = "Retrying"
3235
)
3336

3437
type CSIAddonsNodeDriver struct {

internal/controller/csiaddons/csiaddonsnode_controller.go

Lines changed: 86 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -21,9 +21,12 @@ import (
2121
"encoding/json"
2222
"errors"
2323
"fmt"
24+
"math"
2425
"net/url"
2526
"slices"
27+
"strconv"
2628
"strings"
29+
"time"
2730

2831
csiaddonsv1alpha1 "github.com/csi-addons/kubernetes-csi-addons/api/csiaddons/v1alpha1"
2932
"github.com/csi-addons/kubernetes-csi-addons/internal/connection"
@@ -42,7 +45,21 @@ import (
4245
"sigs.k8s.io/controller-runtime/pkg/predicate"
4346
)
4447

48+
const (
49+
// The base duration to use for exponential backoff while retrying connection
50+
baseRetryDelay = 2 * time.Second
51+
52+
// Maximum attempts to make while retrying the connection
53+
maxRetries = 3
54+
55+
// The duration after which a new reconcile should be triggered
56+
// to validate the cluster state. Used only when reconciliation
57+
// completes without any errors.
58+
baseRequeueAfter = 1 * time.Hour
59+
)
60+
4561
var (
62+
connRetryAnnotation = "csiaddonsnode" + csiaddonsv1alpha1.GroupVersion.Group + "/connection-retries"
4663
csiAddonsNodeFinalizer = csiaddonsv1alpha1.GroupVersion.Group + "/csiaddonsnode"
4764
)
4865

@@ -128,20 +145,64 @@ func (r *CSIAddonsNodeReconciler) Reconcile(ctx context.Context, req ctrl.Reques
128145

129146
logger.Info("Connecting to sidecar")
130147
newConn, err := connection.NewConnection(ctx, endPoint, nodeID, driverName, csiAddonsNode.Namespace, csiAddonsNode.Name, r.EnableAuth)
148+
149+
// If an error occurs, we retry with exponential backoff
150+
// Store the retry transient state in annotation and also update the status
151+
// of the object with State = Retrying and Message = Retrying currAttempt / maxAttempts
152+
// Requeue with the calculated backoff and if we reach max retries, delete the object for a clean slate.
153+
// Ensure we clean rest of the transient annotations and statuses on success
131154
if err != nil {
132-
logger.Error(err, "Failed to establish connection with sidecar")
155+
currentRetries := getRetryCountFromAnnotation(csiAddonsNode)
156+
logger.Error(err, "Failed to establish connection with sidecar", "attempt", currentRetries+1)
157+
158+
// If reached max retries, abort and cleanup
159+
if currentRetries >= maxRetries {
160+
logger.Info(fmt.Sprintf("Failed to establish connection with sidecar after %d attempts, deleting the object", maxRetries))
161+
162+
if delErr := r.Delete(ctx, csiAddonsNode); client.IgnoreNotFound(delErr) != nil {
163+
logger.Error(delErr, "failed to delete CSIAddonsNode object after max retries")
164+
165+
return ctrl.Result{}, delErr
166+
}
167+
168+
// Object is deleted, stop the reconcile phase
169+
logger.Info("successfully deleted CSIAddonsNode object due to reaching max reconnection attempts")
170+
return ctrl.Result{}, nil
171+
}
133172

134173
errMessage := util.GetErrorMessage(err)
135-
csiAddonsNode.Status.State = csiaddonsv1alpha1.CSIAddonsNodeStateFailed
136-
csiAddonsNode.Status.Message = fmt.Sprintf("Failed to establish connection with sidecar: %v", errMessage)
174+
csiAddonsNode.Status.State = csiaddonsv1alpha1.CSIAddonsNodeStateRetrying
175+
csiAddonsNode.Status.Message = fmt.Sprintf("Connection failed: %v. Retrying attempt %d/%d.", errMessage, currentRetries+1, maxRetries)
137176
statusErr := r.Status().Update(ctx, csiAddonsNode)
138177
if statusErr != nil {
139178
logger.Error(statusErr, "Failed to update status")
140179

141180
return ctrl.Result{}, statusErr
142181
}
143182

144-
return ctrl.Result{}, err
183+
// Internal counter for connection attempts
184+
if csiAddonsNode.Annotations == nil {
185+
csiAddonsNode.Annotations = make(map[string]string)
186+
}
187+
csiAddonsNode.Annotations[connRetryAnnotation] = strconv.Itoa(currentRetries + 1)
188+
189+
if updErr := r.Update(ctx, csiAddonsNode); updErr != nil {
190+
logger.Error(updErr, "failed to update retry annotation")
191+
192+
return ctrl.Result{}, updErr
193+
}
194+
195+
// Calculate backoff; baseRetryDelay * 1, 2, 4....
196+
backoff := baseRetryDelay * time.Duration(math.Pow(2, float64(currentRetries)))
197+
logger.Info("Requeuing request for attempting the connection again", "backoff", backoff)
198+
199+
return ctrl.Result{RequeueAfter: backoff}, nil
200+
}
201+
202+
// Success path, cleanup the retry artifacts
203+
needsCleanup := false
204+
if _, ok := csiAddonsNode.GetAnnotations()[connRetryAnnotation]; ok {
205+
needsCleanup = true
145206
}
146207

147208
nfsc, err := r.getNetworkFenceClientStatus(ctx, &logger, newConn, csiAddonsNode)
@@ -165,7 +226,18 @@ func (r *CSIAddonsNodeReconciler) Reconcile(ctx context.Context, req ctrl.Reques
165226
return ctrl.Result{}, err
166227
}
167228

168-
return ctrl.Result{}, nil
229+
if needsCleanup {
230+
// Clean annotations
231+
delete(csiAddonsNode.Annotations, connRetryAnnotation)
232+
if updErr := r.Update(ctx, csiAddonsNode); updErr != nil {
233+
logger.Error(updErr, "failed to clean the retry annotation")
234+
235+
return ctrl.Result{}, updErr
236+
}
237+
}
238+
239+
// Reconciled successfully, requeue to validate state periodically
240+
return ctrl.Result{RequeueAfter: baseRequeueAfter}, nil
169241
}
170242

171243
// getNetworkFenceClassesForDriver gets the networkfenceclasses for the driver.
@@ -416,3 +488,12 @@ func parseCapabilities(caps []*identity.Capability) []string {
416488

417489
return capabilities
418490
}
491+
492+
func getRetryCountFromAnnotation(node *csiaddonsv1alpha1.CSIAddonsNode) int {
493+
if val, ok := node.GetAnnotations()[connRetryAnnotation]; ok {
494+
if count, err := strconv.Atoi(val); err == nil {
495+
return count
496+
}
497+
}
498+
return 0
499+
}

0 commit comments

Comments
 (0)