From 0bfb1d2a9de5df9f6ecfaee212d94b47dd997b43 Mon Sep 17 00:00:00 2001 From: greatsharp Date: Sun, 12 Oct 2025 11:02:31 +0800 Subject: [PATCH] Improve the failover efficiency --- controller/cluster.go | 24 ++++++++++++++---------- store/cluster_node.go | 2 +- 2 files changed, 15 insertions(+), 11 deletions(-) mode change 100644 => 100755 store/cluster_node.go diff --git a/controller/cluster.go b/controller/cluster.go index 752ec60..8d4e70d 100755 --- a/controller/cluster.go +++ b/controller/cluster.go @@ -138,27 +138,30 @@ func (c *ClusterChecker) increaseFailureCount(shardIndex int, node store.Node) i } log := logger.Get().With( + zap.String("cluster_name", c.clusterName), zap.String("id", node.ID()), zap.Bool("is_master", node.IsMaster()), zap.String("addr", node.Addr())) - if count%c.options.maxFailureCount == 0 { + if count%c.options.maxFailureCount == 0 || count > c.options.maxFailureCount { cluster, err := c.clusterStore.GetCluster(c.ctx, c.namespace, c.clusterName) if err != nil { - log.Error("Failed to get the clusterName info", zap.Error(err)) + log.Error("Failed to get the cluster info", zap.Error(err)) return count } newMasterID, err := cluster.PromoteNewMaster(c.ctx, shardIndex, node.ID(), "") - if err == nil { - // the node is normal if it can be elected as the new master, - // because it requires the node is healthy. - c.resetFailureCount(newMasterID) - err = c.clusterStore.UpdateCluster(c.ctx, c.namespace, cluster) - } if err != nil { log.Error("Failed to promote the new master", zap.Error(err)) - } else { - log.With(zap.String("new_master_id", newMasterID)).Info("Promote the new master") + return count + } + err = c.clusterStore.UpdateCluster(c.ctx, c.namespace, cluster) + if err != nil { + log.Error("Failed to update the cluster", zap.Error(err)) + return count } + // the node is normal if it can be elected as the new master, + // because it requires the node is healthy. + c.resetFailureCount(newMasterID) + log.With(zap.String("new_master_id", newMasterID)).Info("Promote the new master") } return count } @@ -216,6 +219,7 @@ func (c *ClusterChecker) parallelProbeNodes(ctx context.Context, cluster *store. go func(shardIdx int, n store.Node) { defer wg.Done() log := logger.Get().With( + zap.String("cluster_name", c.clusterName), zap.String("id", n.ID()), zap.Bool("is_master", n.IsMaster()), zap.String("addr", n.Addr()), diff --git a/store/cluster_node.go b/store/cluster_node.go old mode 100644 new mode 100755 index 5882dff..8fcd106 --- a/store/cluster_node.go +++ b/store/cluster_node.go @@ -46,7 +46,7 @@ const ( dialTimeout = 3200 * time.Millisecond readTimeout = 3 * time.Second writeTimeout = 3 * time.Second - minIdleConns = 3 + minIdleConns = 10 ) var (