@@ -21,9 +21,12 @@ import (
2121 "encoding/json"
2222 "errors"
2323 "fmt"
24+ "math"
2425 "net/url"
2526 "slices"
27+ "strconv"
2628 "strings"
29+ "time"
2730
2831 csiaddonsv1alpha1 "github.com/csi-addons/kubernetes-csi-addons/api/csiaddons/v1alpha1"
2932 "github.com/csi-addons/kubernetes-csi-addons/internal/connection"
@@ -42,7 +45,21 @@ import (
4245 "sigs.k8s.io/controller-runtime/pkg/predicate"
4346)
4447
// Timing and retry limits used by the CSIAddonsNode reconciler.
const (
	// baseRetryDelay is the initial wait before the sidecar connection is
	// attempted again; the exponential backoff scales this value.
	baseRetryDelay = time.Second * 2

	// maxRetries is the retry count at which the reconciler stops retrying
	// the connection and deletes the CSIAddonsNode object instead.
	maxRetries = 3

	// baseRequeueAfter is the interval after which a reconcile that finished
	// without errors is requeued, so the cluster state is validated
	// periodically.
	baseRequeueAfter = time.Hour
)
60+
4561var (
62+ connRetryAnnotation = "csiaddonsnode" + csiaddonsv1alpha1 .GroupVersion .Group + "/connection-retries"
4663 csiAddonsNodeFinalizer = csiaddonsv1alpha1 .GroupVersion .Group + "/csiaddonsnode"
4764)
4865
@@ -128,20 +145,61 @@ func (r *CSIAddonsNodeReconciler) Reconcile(ctx context.Context, req ctrl.Reques
128145
129146 logger .Info ("Connecting to sidecar" )
130147 newConn , err := connection .NewConnection (ctx , endPoint , nodeID , driverName , csiAddonsNode .Namespace , csiAddonsNode .Name , r .EnableAuth )
148+
149+ // If error occurs, we retry with exponential backoff until we reach `maxRetries`
150+ // Store the transient state in annotation and also update the status to reflect the same
131151 if err != nil {
132- logger .Error (err , "Failed to establish connection with sidecar" )
152+ currentRetries := getRetryCountFromAnnotation (csiAddonsNode )
153+ logger .Error (err , "Failed to establish connection with sidecar" , "attempt" , currentRetries + 1 )
154+
155+ // If reached max retries, abort and cleanup
156+ if currentRetries >= maxRetries {
157+ logger .Info (fmt .Sprintf ("Failed to establish connection with sidecar after %d attempts, deleting the object" , maxRetries ))
158+
159+ if delErr := r .Delete (ctx , csiAddonsNode ); client .IgnoreNotFound (delErr ) != nil {
160+ logger .Error (delErr , "failed to delete CSIAddonsNode object after max retries" )
161+
162+ return ctrl.Result {}, delErr
163+ }
164+
165+ // Object is deleted, stop the reconcile phase
166+ logger .Info ("successfully deleted CSIAddonsNode object due to reaching max reconnection attempts" )
167+ return ctrl.Result {}, nil
168+ }
133169
134170 errMessage := util .GetErrorMessage (err )
135- csiAddonsNode .Status .State = csiaddonsv1alpha1 .CSIAddonsNodeStateFailed
136- csiAddonsNode .Status .Message = fmt .Sprintf ("Failed to establish connection with sidecar: %v " , errMessage )
171+ csiAddonsNode .Status .State = csiaddonsv1alpha1 .CSIAddonsNodeStateRetrying
172+ csiAddonsNode .Status .Message = fmt .Sprintf ("Connection failed: %v. Retrying attempt %d/%d. " , errMessage , currentRetries + 1 , maxRetries )
137173 statusErr := r .Status ().Update (ctx , csiAddonsNode )
138174 if statusErr != nil {
139175 logger .Error (statusErr , "Failed to update status" )
140176
141177 return ctrl.Result {}, statusErr
142178 }
143179
144- return ctrl.Result {}, err
180+ // Internal counter for connection attempts
181+ if csiAddonsNode .Annotations == nil {
182+ csiAddonsNode .Annotations = make (map [string ]string )
183+ }
184+ csiAddonsNode .Annotations [connRetryAnnotation ] = strconv .Itoa (currentRetries + 1 )
185+
186+ if updErr := r .Update (ctx , csiAddonsNode ); updErr != nil {
187+ logger .Error (updErr , "failed to update retry annotation" )
188+
189+ return ctrl.Result {}, updErr
190+ }
191+
192+ // Calculate backoff; baseRetryDelay * 1, 2, 4....
193+ backoff := baseRetryDelay * time .Duration (math .Pow (2 , float64 (currentRetries )))
194+ logger .Info ("Requeuing request for attempting the connection again" , "backoff" , backoff )
195+
196+ return ctrl.Result {RequeueAfter : backoff }, nil
197+ }
198+
199+ // Success path, cleanup the retry artifacts
200+ needsCleanup := false
201+ if _ , ok := csiAddonsNode .GetAnnotations ()[connRetryAnnotation ]; ok {
202+ needsCleanup = true
145203 }
146204
147205 nfsc , err := r .getNetworkFenceClientStatus (ctx , & logger , newConn , csiAddonsNode )
@@ -165,7 +223,18 @@ func (r *CSIAddonsNodeReconciler) Reconcile(ctx context.Context, req ctrl.Reques
165223 return ctrl.Result {}, err
166224 }
167225
168- return ctrl.Result {}, nil
226+ if needsCleanup {
227+ // Clean annotations
228+ delete (csiAddonsNode .Annotations , connRetryAnnotation )
229+ if updErr := r .Update (ctx , csiAddonsNode ); updErr != nil {
230+ logger .Error (updErr , "failed to clean the retry annotation" )
231+
232+ return ctrl.Result {}, updErr
233+ }
234+ }
235+
236+ // Reconciled successfully, requeue to validate state periodically
237+ return ctrl.Result {RequeueAfter : baseRequeueAfter }, nil
169238}
170239
171240// getNetworkFenceClassesForDriver gets the networkfenceclasses for the driver.
@@ -416,3 +485,12 @@ func parseCapabilities(caps []*identity.Capability) []string {
416485
417486 return capabilities
418487}
488+
489+ func getRetryCountFromAnnotation (node * csiaddonsv1alpha1.CSIAddonsNode ) int {
490+ if val , ok := node .GetAnnotations ()[connRetryAnnotation ]; ok {
491+ if count , err := strconv .Atoi (val ); err == nil {
492+ return count
493+ }
494+ }
495+ return 0
496+ }
0 commit comments