@@ -21,9 +21,12 @@ import (
2121 "encoding/json"
2222 "errors"
2323 "fmt"
24+ "math"
2425 "net/url"
2526 "slices"
27+ "strconv"
2628 "strings"
29+ "time"
2730
2831 csiaddonsv1alpha1 "github.com/csi-addons/kubernetes-csi-addons/api/csiaddons/v1alpha1"
2932 "github.com/csi-addons/kubernetes-csi-addons/internal/connection"
@@ -42,7 +45,21 @@ import (
4245 "sigs.k8s.io/controller-runtime/pkg/predicate"
4346)
4447
const (
	// baseRetryDelay is the starting delay for the exponential backoff that
	// is applied between attempts to reconnect to the sidecar.
	baseRetryDelay = 2 * time.Second

	// maxRetries is the maximum number of connection attempts to make
	// before giving up on the connection.
	maxRetries = 3

	// baseRequeueAfter is the interval after which a new reconcile is
	// triggered to validate the cluster state. It is used only when a
	// reconciliation completes without any errors.
	baseRequeueAfter = time.Hour
)
60+
4561var (
62+ connRetryAnnotation = "csiaddonsnode" + csiaddonsv1alpha1 .GroupVersion .Group + "/connection-retries"
4663 csiAddonsNodeFinalizer = csiaddonsv1alpha1 .GroupVersion .Group + "/csiaddonsnode"
4764)
4865
@@ -128,20 +145,64 @@ func (r *CSIAddonsNodeReconciler) Reconcile(ctx context.Context, req ctrl.Reques
128145
129146 logger .Info ("Connecting to sidecar" )
130147 newConn , err := connection .NewConnection (ctx , endPoint , nodeID , driverName , csiAddonsNode .Namespace , csiAddonsNode .Name , r .EnableAuth )
148+
149+ // If an error occurs, we retry with exponential backoff.
150+ // Store the transient retry state in an annotation and also update the status
151+ // of the object with State = Retrying and Message = Retrying currAttempt / maxAttempts.
152+ // Requeue with the calculated backoff and, if we reach max retries, delete the object for a clean slate.
153+ // Ensure we clean up the rest of the transient annotations and statuses on success.
131154 if err != nil {
132- logger .Error (err , "Failed to establish connection with sidecar" )
155+ currentRetries := getRetryCountFromAnnotation (csiAddonsNode )
156+ logger .Error (err , "Failed to establish connection with sidecar" , "attempt" , currentRetries + 1 )
157+
158+ // If reached max retries, abort and cleanup
159+ if currentRetries >= maxRetries {
160+ logger .Info (fmt .Sprintf ("Failed to establish connection with sidecar after %d attempts, deleting the object" , maxRetries ))
161+
162+ if delErr := r .Delete (ctx , csiAddonsNode ); client .IgnoreNotFound (delErr ) != nil {
163+ logger .Error (delErr , "failed to delete CSIAddonsNode object after max retries" )
164+
165+ return ctrl.Result {}, delErr
166+ }
167+
168+ // Object is deleted, stop the reconcile phase
169+ logger .Info ("successfully deleted CSIAddonsNode object due to reaching max reconnection attempts" )
170+ return ctrl.Result {}, nil
171+ }
133172
134173 errMessage := util .GetErrorMessage (err )
135- csiAddonsNode .Status .State = csiaddonsv1alpha1 .CSIAddonsNodeStateFailed
136- csiAddonsNode .Status .Message = fmt .Sprintf ("Failed to establish connection with sidecar: %v " , errMessage )
174+ csiAddonsNode .Status .State = csiaddonsv1alpha1 .CSIAddonsNodeStateRetrying
175+ csiAddonsNode .Status .Message = fmt .Sprintf ("Connection failed: %v. Retrying attempt %d/%d. " , errMessage , currentRetries + 1 , maxRetries )
137176 statusErr := r .Status ().Update (ctx , csiAddonsNode )
138177 if statusErr != nil {
139178 logger .Error (statusErr , "Failed to update status" )
140179
141180 return ctrl.Result {}, statusErr
142181 }
143182
144- return ctrl.Result {}, err
183+ // Internal counter for connection attempts
184+ if csiAddonsNode .Annotations == nil {
185+ csiAddonsNode .Annotations = make (map [string ]string )
186+ }
187+ csiAddonsNode .Annotations [connRetryAnnotation ] = strconv .Itoa (currentRetries + 1 )
188+
189+ if updErr := r .Update (ctx , csiAddonsNode ); updErr != nil {
190+ logger .Error (updErr , "failed to update retry annotation" )
191+
192+ return ctrl.Result {}, updErr
193+ }
194+
195+ // Calculate backoff; baseRetryDelay * 1, 2, 4....
196+ backoff := baseRetryDelay * time .Duration (math .Pow (2 , float64 (currentRetries )))
197+ logger .Info ("Requeuing request for attempting the connection again" , "backoff" , backoff )
198+
199+ return ctrl.Result {RequeueAfter : backoff }, nil
200+ }
201+
202+ // Success path, cleanup the retry artifacts
203+ needsCleanup := false
204+ if _ , ok := csiAddonsNode .GetAnnotations ()[connRetryAnnotation ]; ok {
205+ needsCleanup = true
145206 }
146207
147208 nfsc , err := r .getNetworkFenceClientStatus (ctx , & logger , newConn , csiAddonsNode )
@@ -165,7 +226,18 @@ func (r *CSIAddonsNodeReconciler) Reconcile(ctx context.Context, req ctrl.Reques
165226 return ctrl.Result {}, err
166227 }
167228
168- return ctrl.Result {}, nil
229+ if needsCleanup {
230+ // Clean annotations
231+ delete (csiAddonsNode .Annotations , connRetryAnnotation )
232+ if updErr := r .Update (ctx , csiAddonsNode ); updErr != nil {
233+ logger .Error (updErr , "failed to clean the retry annotation" )
234+
235+ return ctrl.Result {}, updErr
236+ }
237+ }
238+
239+ // Reconciled successfully, requeue to validate state periodically
240+ return ctrl.Result {RequeueAfter : baseRequeueAfter }, nil
169241}
170242
171243// getNetworkFenceClassesForDriver gets the networkfenceclasses for the driver.
@@ -416,3 +488,12 @@ func parseCapabilities(caps []*identity.Capability) []string {
416488
417489 return capabilities
418490}
491+
492+ func getRetryCountFromAnnotation (node * csiaddonsv1alpha1.CSIAddonsNode ) int {
493+ if val , ok := node .GetAnnotations ()[connRetryAnnotation ]; ok {
494+ if count , err := strconv .Atoi (val ); err == nil {
495+ return count
496+ }
497+ }
498+ return 0
499+ }
0 commit comments