Skip to content

Commit 31bdf4c

Browse files
csiaddonsnode: Add retry with exponential backoff for connections
This patch adds retry logic for connecting to the sidecar: a failed connection is retried up to `maxRetries` times with exponential backoff. If the connection still cannot be established after the retries are exhausted, the object is considered obsolete and is deleted. The retry count is tracked in an annotation (`connRetryAnnotation`) and is also reflected in the object's status. These transient artifacts are cleaned up once a connection is established. Signed-off-by: Niraj Yadav <niryadav@redhat.com>
1 parent b4ac322 commit 31bdf4c

File tree

2 files changed

+86
-5
lines changed

2 files changed

+86
-5
lines changed

api/csiaddons/v1alpha1/csiaddonsnode_types.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,9 @@ const (
2929

3030
// Failed represents the Connection Failed state.
3131
CSIAddonsNodeStateFailed CSIAddonsNodeState = "Failed"
32+
33+
// Retrying represents the Connection Retrying state.
34+
CSIAddonsNodeStateRetrying CSIAddonsNodeState = "Retrying"
3235
)
3336

3437
type CSIAddonsNodeDriver struct {

internal/controller/csiaddons/csiaddonsnode_controller.go

Lines changed: 83 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -21,9 +21,12 @@ import (
2121
"encoding/json"
2222
"errors"
2323
"fmt"
24+
"math"
2425
"net/url"
2526
"slices"
27+
"strconv"
2628
"strings"
29+
"time"
2730

2831
csiaddonsv1alpha1 "github.com/csi-addons/kubernetes-csi-addons/api/csiaddons/v1alpha1"
2932
"github.com/csi-addons/kubernetes-csi-addons/internal/connection"
@@ -42,7 +45,21 @@ import (
4245
"sigs.k8s.io/controller-runtime/pkg/predicate"
4346
)
4447

48+
// Tunables for the sidecar connection retry and periodic requeue behaviour.
const (
	// baseRetryDelay is the starting delay of the exponential backoff that is
	// applied between connection retries.
	baseRetryDelay = time.Second * 2

	// maxRetries caps the number of attempts made while retrying the connection.
	maxRetries = 3

	// baseRequeueAfter is the interval after which a fresh reconcile is
	// triggered to re-validate the cluster state. It is used only when a
	// reconciliation completes without any errors.
	baseRequeueAfter = time.Hour
)
60+
4561
var (
62+
connRetryAnnotation = "csiaddonsnode" + csiaddonsv1alpha1.GroupVersion.Group + "/connection-retries"
4663
csiAddonsNodeFinalizer = csiaddonsv1alpha1.GroupVersion.Group + "/csiaddonsnode"
4764
)
4865

@@ -128,20 +145,61 @@ func (r *CSIAddonsNodeReconciler) Reconcile(ctx context.Context, req ctrl.Reques
128145

129146
logger.Info("Connecting to sidecar")
130147
newConn, err := connection.NewConnection(ctx, endPoint, nodeID, driverName, csiAddonsNode.Namespace, csiAddonsNode.Name, r.EnableAuth)
148+
149+
// If error occurs, we retry with exponential backoff until we reach `maxRetries`
150+
// Store the transient state in annotation and also update the status to reflect the same
131151
if err != nil {
132-
logger.Error(err, "Failed to establish connection with sidecar")
152+
currentRetries := getRetryCountFromAnnotation(csiAddonsNode)
153+
logger.Error(err, "Failed to establish connection with sidecar", "attempt", currentRetries+1)
154+
155+
// If reached max retries, abort and cleanup
156+
if currentRetries >= maxRetries {
157+
logger.Info(fmt.Sprintf("Failed to establish connection with sidecar after %d attempts, deleting the object", maxRetries))
158+
159+
if delErr := r.Delete(ctx, csiAddonsNode); client.IgnoreNotFound(delErr) != nil {
160+
logger.Error(delErr, "failed to delete CSIAddonsNode object after max retries")
161+
162+
return ctrl.Result{}, delErr
163+
}
164+
165+
// Object is deleted, stop the reconcile phase
166+
logger.Info("successfully deleted CSIAddonsNode object due to reaching max reconnection attempts")
167+
return ctrl.Result{}, nil
168+
}
133169

134170
errMessage := util.GetErrorMessage(err)
135-
csiAddonsNode.Status.State = csiaddonsv1alpha1.CSIAddonsNodeStateFailed
136-
csiAddonsNode.Status.Message = fmt.Sprintf("Failed to establish connection with sidecar: %v", errMessage)
171+
csiAddonsNode.Status.State = csiaddonsv1alpha1.CSIAddonsNodeStateRetrying
172+
csiAddonsNode.Status.Message = fmt.Sprintf("Connection failed: %v. Retrying attempt %d/%d.", errMessage, currentRetries+1, maxRetries)
137173
statusErr := r.Status().Update(ctx, csiAddonsNode)
138174
if statusErr != nil {
139175
logger.Error(statusErr, "Failed to update status")
140176

141177
return ctrl.Result{}, statusErr
142178
}
143179

144-
return ctrl.Result{}, err
180+
// Internal counter for connection attempts
181+
if csiAddonsNode.Annotations == nil {
182+
csiAddonsNode.Annotations = make(map[string]string)
183+
}
184+
csiAddonsNode.Annotations[connRetryAnnotation] = strconv.Itoa(currentRetries + 1)
185+
186+
if updErr := r.Update(ctx, csiAddonsNode); updErr != nil {
187+
logger.Error(updErr, "failed to update retry annotation")
188+
189+
return ctrl.Result{}, updErr
190+
}
191+
192+
// Calculate backoff; baseRetryDelay * 1, 2, 4....
193+
backoff := baseRetryDelay * time.Duration(math.Pow(2, float64(currentRetries)))
194+
logger.Info("Requeuing request for attempting the connection again", "backoff", backoff)
195+
196+
return ctrl.Result{RequeueAfter: backoff}, nil
197+
}
198+
199+
// Success path, cleanup the retry artifacts
200+
needsCleanup := false
201+
if _, ok := csiAddonsNode.GetAnnotations()[connRetryAnnotation]; ok {
202+
needsCleanup = true
145203
}
146204

147205
nfsc, err := r.getNetworkFenceClientStatus(ctx, &logger, newConn, csiAddonsNode)
@@ -165,7 +223,18 @@ func (r *CSIAddonsNodeReconciler) Reconcile(ctx context.Context, req ctrl.Reques
165223
return ctrl.Result{}, err
166224
}
167225

168-
return ctrl.Result{}, nil
226+
if needsCleanup {
227+
// Clean annotations
228+
delete(csiAddonsNode.Annotations, connRetryAnnotation)
229+
if updErr := r.Update(ctx, csiAddonsNode); updErr != nil {
230+
logger.Error(updErr, "failed to clean the retry annotation")
231+
232+
return ctrl.Result{}, updErr
233+
}
234+
}
235+
236+
// Reconciled successfully, requeue to validate state periodically
237+
return ctrl.Result{RequeueAfter: baseRequeueAfter}, nil
169238
}
170239

171240
// getNetworkFenceClassesForDriver gets the networkfenceclasses for the driver.
@@ -416,3 +485,12 @@ func parseCapabilities(caps []*identity.Capability) []string {
416485

417486
return capabilities
418487
}
488+
489+
func getRetryCountFromAnnotation(node *csiaddonsv1alpha1.CSIAddonsNode) int {
490+
if val, ok := node.GetAnnotations()[connRetryAnnotation]; ok {
491+
if count, err := strconv.Atoi(val); err == nil {
492+
return count
493+
}
494+
}
495+
return 0
496+
}

0 commit comments

Comments
 (0)