Skip to content

Commit 0d36098

Browse files
committed
Merge remote-tracking branch 'origin/4.18' into 4.19
2 parents 3762439 + 69e8ebc commit 0d36098

File tree

4 files changed

+59
-57
lines changed

4 files changed

+59
-57
lines changed

plugins/integrations/kubernetes-service/src/main/java/com/cloud/kubernetes/cluster/KubernetesClusterManagerImpl.java

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1956,6 +1956,7 @@ public ConfigKey<?>[] getConfigKeys() {
19561956
KubernetesClusterStartTimeout,
19571957
KubernetesClusterScaleTimeout,
19581958
KubernetesClusterUpgradeTimeout,
1959+
KubernetesClusterUpgradeRetries,
19591960
KubernetesClusterExperimentalFeaturesEnabled,
19601961
KubernetesMaxClusterSize
19611962
};

plugins/integrations/kubernetes-service/src/main/java/com/cloud/kubernetes/cluster/KubernetesClusterService.java

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -72,6 +72,12 @@ public interface KubernetesClusterService extends PluggableService, Configurable
7272
"Timeout interval (in seconds) in which upgrade operation for a Kubernetes cluster should be completed. Not strictly obeyed while upgrade is in progress on a node",
7373
true,
7474
KubernetesServiceEnabled.key());
75+
static final ConfigKey<Integer> KubernetesClusterUpgradeRetries = new ConfigKey<Integer>("Advanced", Integer.class,
76+
"cloud.kubernetes.cluster.upgrade.retries",
77+
"3",
78+
"The number of retries if fail to upgrade kubernetes cluster due to some reasons (e.g. drain node, etcdserver leader changed)",
79+
true,
80+
KubernetesServiceEnabled.key());
7581
static final ConfigKey<Boolean> KubernetesClusterExperimentalFeaturesEnabled = new ConfigKey<Boolean>("Advanced", Boolean.class,
7682
"cloud.kubernetes.cluster.experimental.features.enabled",
7783
"false",

plugins/integrations/kubernetes-service/src/main/java/com/cloud/kubernetes/cluster/actionworkers/KubernetesClusterUpgradeWorker.java

Lines changed: 42 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -77,39 +77,62 @@ private Pair<Boolean, String> runInstallScriptOnVM(final UserVm vm, final int in
7777
}
7878

7979
private void upgradeKubernetesClusterNodes() {
80-
Pair<Boolean, String> result = null;
8180
for (int i = 0; i < clusterVMs.size(); ++i) {
8281
UserVm vm = clusterVMs.get(i);
8382
String hostName = vm.getHostName();
8483
if (StringUtils.isNotEmpty(hostName)) {
8584
hostName = hostName.toLowerCase();
8685
}
87-
result = null;
86+
Pair<Boolean, String> result;
8887
if (LOGGER.isInfoEnabled()) {
8988
LOGGER.info(String.format("Upgrading node on VM %s in Kubernetes cluster %s with Kubernetes version(%s) ID: %s",
9089
vm.getDisplayName(), kubernetesCluster.getName(), upgradeVersion.getSemanticVersion(), upgradeVersion.getUuid()));
9190
}
92-
try {
93-
result = SshHelper.sshExecute(publicIpAddress, sshPort, getControlNodeLoginUser(), sshKeyFile, null,
94-
String.format("sudo /opt/bin/kubectl drain %s --ignore-daemonsets --delete-emptydir-data", hostName),
95-
10000, 10000, 60000);
96-
} catch (Exception e) {
97-
logTransitStateDetachIsoAndThrow(Level.ERROR, String.format("Failed to upgrade Kubernetes cluster : %s, unable to drain Kubernetes node on VM : %s", kubernetesCluster.getName(), vm.getDisplayName()), kubernetesCluster, clusterVMs, KubernetesCluster.Event.OperationFailed, e);
98-
}
99-
if (!result.first()) {
100-
logTransitStateDetachIsoAndThrow(Level.ERROR, String.format("Failed to upgrade Kubernetes cluster : %s, unable to drain Kubernetes node on VM : %s", kubernetesCluster.getName(), vm.getDisplayName()), kubernetesCluster, clusterVMs, KubernetesCluster.Event.OperationFailed, null);
91+
String errorMessage = String.format("Failed to upgrade Kubernetes cluster : %s, unable to drain Kubernetes node on VM : %s", kubernetesCluster.getName(), vm.getDisplayName());
92+
for (int retry = KubernetesClusterService.KubernetesClusterUpgradeRetries.value(); retry >= 0; retry--) {
93+
try {
94+
result = SshHelper.sshExecute(publicIpAddress, sshPort, getControlNodeLoginUser(), sshKeyFile, null,
95+
String.format("sudo /opt/bin/kubectl drain %s --ignore-daemonsets --delete-emptydir-data", hostName),
96+
10000, 10000, 60000);
97+
if (result.first()) {
98+
break;
99+
}
100+
if (retry > 0) {
101+
LOGGER.error(String.format("%s, retries left: %s", errorMessage, retry));
102+
} else {
103+
logTransitStateDetachIsoAndThrow(Level.ERROR, errorMessage, kubernetesCluster, clusterVMs, KubernetesCluster.Event.OperationFailed, null);
104+
}
105+
} catch (Exception e) {
106+
if (retry > 0) {
107+
LOGGER.error(String.format("%s due to %s, retries left: %s", errorMessage, e, retry));
108+
} else {
109+
logTransitStateDetachIsoAndThrow(Level.ERROR, errorMessage, kubernetesCluster, clusterVMs, KubernetesCluster.Event.OperationFailed, e);
110+
}
111+
}
101112
}
102113
if (System.currentTimeMillis() > upgradeTimeoutTime) {
103114
logTransitStateDetachIsoAndThrow(Level.ERROR, String.format("Failed to upgrade Kubernetes cluster : %s, upgrade action timed out", kubernetesCluster.getName()), kubernetesCluster, clusterVMs, KubernetesCluster.Event.OperationFailed, null);
104115
}
105-
try {
106-
deployProvider();
107-
result = runInstallScriptOnVM(vm, i);
108-
} catch (Exception e) {
109-
logTransitStateDetachIsoAndThrow(Level.ERROR, String.format("Failed to upgrade Kubernetes cluster : %s, unable to upgrade Kubernetes node on VM : %s", kubernetesCluster.getName(), vm.getDisplayName()), kubernetesCluster, clusterVMs, KubernetesCluster.Event.OperationFailed, e);
110-
}
111-
if (!result.first()) {
112-
logTransitStateDetachIsoAndThrow(Level.ERROR, String.format("Failed to upgrade Kubernetes cluster : %s, unable to upgrade Kubernetes node on VM : %s", kubernetesCluster.getName(), vm.getDisplayName()), kubernetesCluster, clusterVMs, KubernetesCluster.Event.OperationFailed, null);
116+
errorMessage = String.format("Failed to upgrade Kubernetes cluster : %s, unable to upgrade Kubernetes node on VM : %s", kubernetesCluster.getName(), vm.getDisplayName());
117+
for (int retry = KubernetesClusterService.KubernetesClusterUpgradeRetries.value(); retry >= 0; retry--) {
118+
try {
119+
deployProvider();
120+
result = runInstallScriptOnVM(vm, i);
121+
if (result.first()) {
122+
break;
123+
}
124+
if (retry > 0) {
125+
LOGGER.error(String.format("%s, retries left: %s", errorMessage, retry));
126+
} else {
127+
logTransitStateDetachIsoAndThrow(Level.ERROR, errorMessage, kubernetesCluster, clusterVMs, KubernetesCluster.Event.OperationFailed, null);
128+
}
129+
} catch (Exception e) {
130+
if (retry > 0) {
131+
LOGGER.error(String.format("%s due to %s, retries left: %s", errorMessage, e, retry));
132+
} else {
133+
logTransitStateDetachIsoAndThrow(Level.ERROR, errorMessage, kubernetesCluster, clusterVMs, KubernetesCluster.Event.OperationFailed, e);
134+
}
135+
}
113136
}
114137
if (System.currentTimeMillis() > upgradeTimeoutTime) {
115138
logTransitStateDetachIsoAndThrow(Level.ERROR, String.format("Failed to upgrade Kubernetes cluster : %s, upgrade action timed out", kubernetesCluster.getName()), kubernetesCluster, clusterVMs, KubernetesCluster.Event.OperationFailed, null);

test/integration/smoke/test_kubernetes_clusters.py

Lines changed: 10 additions & 38 deletions
Original file line number | Diff line number | Diff line change
@@ -283,13 +283,15 @@ def deleteKubernetesSupportedVersion(cls, version_id):
283283
cls.apiclient.deleteKubernetesSupportedVersion(deleteKubernetesSupportedVersionCmd)
284284

285285
@classmethod
286-
def listKubernetesCluster(cls, cluster_id = None):
286+
def listKubernetesCluster(cls, cluster_id = None, cluster_name = None):
287287
listKubernetesClustersCmd = listKubernetesClusters.listKubernetesClustersCmd()
288288
listKubernetesClustersCmd.listall = True
289289
if cluster_id != None:
290290
listKubernetesClustersCmd.id = cluster_id
291+
if cluster_name != None:
292+
listKubernetesClustersCmd.name = cluster_name
291293
clusterResponse = cls.apiclient.listKubernetesClusters(listKubernetesClustersCmd)
292-
if cluster_id != None and clusterResponse != None:
294+
if (cluster_id != None or cluster_name != None) and clusterResponse != None:
293295
return clusterResponse[0]
294296
return clusterResponse
295297

@@ -528,24 +530,6 @@ def test_06_delete_kubernetes_cluster(self):
528530

529531
return
530532

531-
@attr(tags=["advanced", "smoke"], required_hardware="true")
532-
@skipTestIf("hypervisorNotSupported")
533-
def test_07_deploy_kubernetes_ha_cluster(self):
534-
"""Test to deploy a new HA Kubernetes cluster
535-
536-
# Validate the following:
537-
# 1. createKubernetesCluster should return valid info for new cluster
538-
# 2. The Cloud Database contains the valid information
539-
"""
540-
if self.setup_failed == True:
541-
self.fail("Setup incomplete")
542-
if self.default_network:
543-
self.skipTest("HA cluster on shared network requires external ip address, skipping it")
544-
global k8s_cluster
545-
k8s_cluster = self.getValidKubernetesCluster(1, 3)
546-
self.debug("HA Kubernetes cluster with ID: %s successfully deployed" % k8s_cluster.id)
547-
return
548-
549533
@attr(tags=["advanced", "smoke"], required_hardware="true")
550534
@skipTestIf("hypervisorNotSupported")
551535
def test_08_upgrade_kubernetes_ha_cluster(self):
@@ -573,24 +557,6 @@ def test_08_upgrade_kubernetes_ha_cluster(self):
573557
self.debug("Kubernetes cluster with ID: %s successfully upgraded" % k8s_cluster.id)
574558
return
575559

576-
@attr(tags=["advanced", "smoke"], required_hardware="true")
577-
@skipTestIf("hypervisorNotSupported")
578-
def test_09_delete_kubernetes_ha_cluster(self):
579-
"""Test to delete a HA Kubernetes cluster
580-
581-
# Validate the following:
582-
# 1. deleteKubernetesCluster should delete an existing HA Kubernetes cluster
583-
"""
584-
if self.setup_failed == True:
585-
self.fail("Setup incomplete")
586-
if self.default_network:
587-
self.skipTest("HA cluster on shared network requires external ip address, skipping it")
588-
global k8s_cluster
589-
k8s_cluster = self.getValidKubernetesCluster(1, 3)
590-
591-
self.debug("Deleting Kubernetes cluster with ID: %s" % k8s_cluster.id)
592-
return
593-
594560
@attr(tags=["advanced", "smoke"], required_hardware="true")
595561
@skipTestIf("hypervisorNotSupported")
596562
def test_10_vpc_tier_kubernetes_cluster(self):
@@ -818,8 +784,14 @@ def createNewKubernetesCluster(self, version, size, control_nodes) :
818784
cluster = self.createKubernetesCluster(name, version.id, size, control_nodes)
819785
self.verifyKubernetesCluster(cluster, name, version.id, size, control_nodes)
820786
except Exception as ex:
787+
cluster = self.listKubernetesCluster(cluster_name = name)
788+
if cluster != None:
789+
self.deleteKubernetesClusterAndVerify(cluster.id, False, True)
821790
self.fail("Kubernetes cluster deployment failed: %s" % ex)
822791
except AssertionError as err:
792+
cluster = self.listKubernetesCluster(cluster_name = name)
793+
if cluster != None:
794+
self.deleteKubernetesClusterAndVerify(cluster.id, False, True)
823795
self.fail("Kubernetes cluster deployment failed during cluster verification: %s" % err)
824796
return cluster
825797

0 commit comments

Comments
 (0)