@@ -21,9 +21,12 @@ import (
2121 "encoding/json"
2222 "errors"
2323 "fmt"
24+ "math"
2425 "net/url"
2526 "slices"
27+ "strconv"
2628 "strings"
29+ "time"
2730
2831 csiaddonsv1alpha1 "github.com/csi-addons/kubernetes-csi-addons/api/csiaddons/v1alpha1"
2932 "github.com/csi-addons/kubernetes-csi-addons/internal/connection"
@@ -42,6 +45,19 @@ import (
4245 "sigs.k8s.io/controller-runtime/pkg/predicate"
4346)
4447
48+ const (
49+ // baseRetryDelay is the backoff before the first connection retry; Reconcile multiplies it by 2^(previous retry count), giving 2s, 4s, 8s, ...
50+ baseRetryDelay = 2 * time .Second
51+
52+ // maxRetries is the recorded retry count at which Reconcile stops retrying the sidecar connection and deletes the CSIAddonsNode object instead.
53+ maxRetries = 3
54+
55+ // baseRequeueAfter is the RequeueAfter interval returned by Reconcile
56+ // once the status has been updated to Connected, so the state is
57+ // re-validated periodically. It is not used on the error/retry paths.
58+ baseRequeueAfter = 1 * time .Hour
59+ )
60+
4561var (
// csiAddonsNodeFinalizer is the csiaddons API group name followed by
// "/csiaddonsnode". NOTE(review): presumably the finalizer placed on
// CSIAddonsNode objects; its use is outside the visible hunks, confirm.
4662 csiAddonsNodeFinalizer = csiaddonsv1alpha1 .GroupVersion .Group + "/csiaddonsnode"
4763)
@@ -128,20 +144,50 @@ func (r *CSIAddonsNodeReconciler) Reconcile(ctx context.Context, req ctrl.Reques
128144
129145 logger .Info ("Connecting to sidecar" )
130146 newConn , err := connection .NewConnection (ctx , endPoint , nodeID , driverName , csiAddonsNode .Namespace , csiAddonsNode .Name , r .EnableAuth )
147+
148+ // If error occurs, we retry with exponential backoff until we reach `maxRetries`
149+ // Store the transient state in annotation and also update the status to reflect the same
131150 if err != nil {
132- logger .Error (err , "Failed to establish connection with sidecar" )
151+ // Only continue if we get a valid/initial retry count
152+ currentRetries , e := getRetryCountFromReason (csiAddonsNode .Status .Reason )
153+ if e != nil {
154+ logger .Error (e , "failed to get the retry count from csiAddonsNode status" , "csiAddonsNodeStatus" , csiAddonsNode .Status )
155+
156+ return ctrl.Result {}, e
157+ }
158+ logger .Error (err , "Failed to establish connection with sidecar" , "attempt" , currentRetries + 1 )
159+
160+ // If reached max retries, abort and cleanup
161+ if currentRetries >= maxRetries {
162+ logger .Info (fmt .Sprintf ("Failed to establish connection with sidecar after %d attempts, deleting the object" , maxRetries ))
163+
164+ if delErr := r .Delete (ctx , csiAddonsNode ); client .IgnoreNotFound (delErr ) != nil {
165+ logger .Error (delErr , "failed to delete CSIAddonsNode object after max retries" )
166+
167+ return ctrl.Result {}, delErr
168+ }
169+
170+ // Object is deleted, stop the reconcile phase
171+ logger .Info ("successfully deleted CSIAddonsNode object due to reaching max reconnection attempts" )
172+ return ctrl.Result {}, nil
173+ }
133174
134175 errMessage := util .GetErrorMessage (err )
135- csiAddonsNode .Status .State = csiaddonsv1alpha1 .CSIAddonsNodeStateFailed
136- csiAddonsNode .Status .Message = fmt .Sprintf ("Failed to establish connection with sidecar: %v" , errMessage )
176+ csiAddonsNode .Status .State = csiaddonsv1alpha1 .CSIAddonsNodeStateRetrying
177+ csiAddonsNode .Status .Message = fmt .Sprintf ("Connection failed: %v. Retrying attempt %d/%d." , errMessage , currentRetries + 1 , maxRetries )
178+ csiAddonsNode .Status .Reason = fmt .Sprintf (csiaddonsv1alpha1 .CSIAddonsNodeStateRetryingFmtStr , currentRetries + 1 )
137179 statusErr := r .Status ().Update (ctx , csiAddonsNode )
138180 if statusErr != nil {
139181 logger .Error (statusErr , "Failed to update status" )
140182
141183 return ctrl.Result {}, statusErr
142184 }
143185
144- return ctrl.Result {}, err
186+ // Calculate backoff; baseRetryDelay * 1, 2, 4....
187+ backoff := baseRetryDelay * time .Duration (math .Pow (2 , float64 (currentRetries )))
188+ logger .Info ("Requeuing request for attempting the connection again" , "backoff" , backoff )
189+
190+ return ctrl.Result {RequeueAfter : backoff }, nil
145191 }
146192
147193 nfsc , err := r .getNetworkFenceClientStatus (ctx , & logger , newConn , csiAddonsNode )
@@ -157,6 +203,7 @@ func (r *CSIAddonsNodeReconciler) Reconcile(ctx context.Context, req ctrl.Reques
157203
158204 csiAddonsNode .Status .State = csiaddonsv1alpha1 .CSIAddonsNodeStateConnected
159205 csiAddonsNode .Status .Message = "Successfully established connection with sidecar"
206+ csiAddonsNode .Status .Reason = ""
160207 csiAddonsNode .Status .Capabilities = parseCapabilities (newConn .Capabilities )
161208 err = r .Status ().Update (ctx , csiAddonsNode )
162209 if err != nil {
@@ -165,7 +212,8 @@ func (r *CSIAddonsNodeReconciler) Reconcile(ctx context.Context, req ctrl.Reques
165212 return ctrl.Result {}, err
166213 }
167214
168- return ctrl.Result {}, nil
215+ // Reconciled successfully, requeue to validate state periodically
216+ return ctrl.Result {RequeueAfter : baseRequeueAfter }, nil
169217}
170218
171219// getNetworkFenceClassesForDriver gets the networkfenceclasses for the driver.
@@ -341,6 +389,10 @@ func (r *CSIAddonsNodeReconciler) resolveEndpoint(ctx context.Context, rawURL st
341389 Name : podname ,
342390 }, pod )
343391 if err != nil {
392+ // do not return podname if the pod does not exist
393+ if apierrors .IsNotFound (err ) {
394+ podname = ""
395+ }
344396 return podname , "" , fmt .Errorf ("failed to get pod %s/%s: %w" , namespace , podname , err )
345397 } else if pod .Status .PodIP == "" {
346398 return podname , "" , fmt .Errorf ("pod %s/%s does not have an IP-address" , namespace , podname )
@@ -416,3 +468,26 @@ func parseCapabilities(caps []*identity.Capability) []string {
416468
417469 return capabilities
418470}
471+
// getRetryCountFromReason extracts the connection retry count stored in a
// status reason string.
//
// The reason is expected to have the shape "<text>:<count>": arbitrary text,
// a colon, then a base-10 integer. Only the first colon acts as the separator
// and whitespace around the count is ignored.
//
// An empty reason means no retry has been recorded yet and yields (0, nil).
// An error is returned when the reason contains no colon, when the text after
// the first colon is not an integer, or when the parsed count is negative.
// On error the returned count is always 0.
func getRetryCountFromReason(reason string) (int, error) {
	// Might not be updated yet, likely the 1st attempt
	if reason == "" {
		return 0, nil
	}

	_, rawCount, found := strings.Cut(reason, ":")
	if !found {
		return 0, fmt.Errorf("reason %q has no ':' separator before the retry count", reason)
	}

	count, err := strconv.Atoi(strings.TrimSpace(rawCount))
	if err != nil {
		// Keep the strconv error in the chain so the log shows why parsing failed.
		return 0, fmt.Errorf("failed to parse the retry count from reason %q: %w", reason, err)
	}

	// The count feeds an exponential backoff (base * 2^count) and a
	// "count >= max" cut-off. A negative value would collapse the backoff to
	// zero and postpone the cut-off, so treat it as a corrupted reason.
	if count < 0 {
		return 0, fmt.Errorf("invalid negative retry count %d in reason %q", count, reason)
	}

	return count, nil
}
0 commit comments