Skip to content

Commit 5501126

Browse files
ustcweizhou authored and Rakesh Venkatesh committed
kvm: Handle storage issue on NFS/KVM in multiple ways
1 parent cb75e8f commit 5501126

File tree

8 files changed

+352
-97
lines changed

8 files changed

+352
-97
lines changed

engine/orchestration/src/main/java/com/cloud/agent/manager/AgentManagerImpl.java

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,11 @@
122122
import com.cloud.utils.time.InaccurateClock;
123123
import org.apache.commons.lang3.StringUtils;
124124

125+
import static com.cloud.configuration.ConfigurationManagerImpl.KVM_HEARTBEAT_FAILURE_ACTION;
126+
import static com.cloud.configuration.ConfigurationManagerImpl.KVM_HEARTBEAT_UPDATE_MAX_RETRIES;
127+
import static com.cloud.configuration.ConfigurationManagerImpl.KVM_HEARTBEAT_UPDATE_RETRY_SLEEP;
128+
import static com.cloud.configuration.ConfigurationManagerImpl.KVM_HEARTBEAT_UPDATE_TIMEOUT;
129+
125130
/**
126131
* Implementation of the Agent Manager. This class controls the connection to the agents.
127132
**/
@@ -1762,6 +1767,11 @@ public void processConnect(final Host host, final StartupCommand cmd, final bool
17621767
params.put(Config.RouterAggregationCommandEachTimeout.toString(), _configDao.getValue(Config.RouterAggregationCommandEachTimeout.toString()));
17631768
params.put(Config.MigrateWait.toString(), _configDao.getValue(Config.MigrateWait.toString()));
17641769

1770+
Arrays.asList(KVM_HEARTBEAT_UPDATE_MAX_RETRIES,
1771+
KVM_HEARTBEAT_UPDATE_RETRY_SLEEP,
1772+
KVM_HEARTBEAT_UPDATE_TIMEOUT,
1773+
KVM_HEARTBEAT_FAILURE_ACTION)
1774+
.forEach(c -> params.put(c, _configDao.getValue(c)));
17651775
try {
17661776
SetHostParamsCommand cmds = new SetHostParamsCommand(params);
17671777
Commands c = new Commands(cmds);

plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/resource/KVMHABase.java

Lines changed: 25 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -32,15 +32,36 @@ public class KVMHABase {
3232
private static final Logger s_logger = Logger.getLogger(KVMHABase.class);
3333
private long _timeout = 60000; /* 1 minutes */
3434
protected static String s_heartBeatPath;
35-
protected long _heartBeatUpdateTimeout = 60000;
36-
protected long _heartBeatUpdateFreq = 60000;
37-
protected long _heartBeatUpdateMaxTries = 5;
38-
protected long _heartBeatUpdateRetrySleep = 10000;
35+
protected static long _heartBeatUpdateTimeout = 60000;
36+
protected static long _heartBeatUpdateFreq = 60000;
37+
protected static long _heartBeatUpdateMaxRetries = 5;
38+
protected static long _heartBeatUpdateRetrySleep = 10000;
39+
protected static HeartBeatAction _heartBeatFailureAction = HeartBeatAction.HARDRESET;
3940

4041
public static enum PoolType {
4142
PrimaryStorage, SecondaryStorage
4243
}
4344

45+
/**
 * Action to take on a KVM host when the heartbeat cannot be written to storage.
 *
 * <p>Each constant carries two strings: the lower-case action name returned by
 * {@link #toString()} and a short option flag returned by {@link #getFlag()}.
 * NOTE(review): the flags look like the {@code -c}/{@code -d}/{@code -s} options of
 * kvmheartbeat.sh; confirm against the caller that builds the script command line.
 */
public enum HeartBeatAction {
    HARDRESET("hardreset", "-c"),
    DESTROYVMS("destroyvms", "-d"),
    STOPAGENT("stopagent", "-s");

    // Enum constants are shared singletons, so their state must never change after
    // construction. Visibility is left package-private so that any existing
    // same-package reader of these fields keeps compiling.
    final String _action;
    final String _flag;

    /**
     * @param action lower-case action name exposed through {@link #toString()}
     * @param flag   option flag exposed through {@link #getFlag()}
     */
    HeartBeatAction(String action, String flag) {
        _action = action;
        _flag = flag;
    }

    /**
     * @return the lower-case action name (for example {@code "hardreset"}), not the constant name
     */
    @Override
    public String toString() {
        return _action;
    }

    /**
     * @return the option flag associated with this action (for example {@code "-c"})
     */
    public String getFlag() {
        return _flag;
    }
}
64+
4465
public static class NfsStoragePool {
4566
String _poolUUID;
4667
String _poolIp;

plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/resource/KVMHAMonitor.java

Lines changed: 201 additions & 70 deletions
Large diffs are not rendered by default.

plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/resource/LibvirtComputingResource.java

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -188,6 +188,11 @@
188188
import org.apache.cloudstack.utils.bytescale.ByteScaleUtils;
189189
import org.libvirt.VcpuInfo;
190190

191+
import static com.cloud.configuration.ConfigurationManagerImpl.KVM_HEARTBEAT_UPDATE_MAX_RETRIES;
192+
import static com.cloud.configuration.ConfigurationManagerImpl.KVM_HEARTBEAT_UPDATE_RETRY_SLEEP;
193+
import static com.cloud.configuration.ConfigurationManagerImpl.KVM_HEARTBEAT_UPDATE_TIMEOUT;
194+
import static com.cloud.configuration.ConfigurationManagerImpl.KVM_HEARTBEAT_FAILURE_ACTION;
195+
191196
/**
192197
* LibvirtComputingResource execute requests on the computing/routing host using
193198
* the libvirt API
@@ -1311,6 +1316,26 @@ public boolean configureHostParams(final Map<String, String> params) {
13111316
return true;
13121317
}
13131318

1319+
public void configureHeartBeatParams(final Map<String, String> params) {
1320+
Long heartBeatUpdateMaxRetries = null;
1321+
Long heartBeatUpdateRetrySleep = null;
1322+
Long heartBeatUpdateTimeout = null;
1323+
KVMHAMonitor.HeartBeatAction heartBeatFailureAction = null;
1324+
if (params.get(KVM_HEARTBEAT_UPDATE_MAX_RETRIES) != null) {
1325+
heartBeatUpdateMaxRetries = Long.parseLong(params.get(KVM_HEARTBEAT_UPDATE_MAX_RETRIES));
1326+
}
1327+
if (params.get(KVM_HEARTBEAT_UPDATE_RETRY_SLEEP) != null) {
1328+
heartBeatUpdateRetrySleep = Long.parseLong(params.get(KVM_HEARTBEAT_UPDATE_RETRY_SLEEP));
1329+
}
1330+
if (params.get(KVM_HEARTBEAT_UPDATE_TIMEOUT) != null) {
1331+
heartBeatUpdateTimeout = Long.parseLong(params.get(KVM_HEARTBEAT_UPDATE_TIMEOUT));
1332+
}
1333+
if (params.get(KVM_HEARTBEAT_FAILURE_ACTION) != null) {
1334+
heartBeatFailureAction = KVMHAMonitor.HeartBeatAction.valueOf(params.get(KVM_HEARTBEAT_FAILURE_ACTION).toUpperCase());
1335+
}
1336+
KVMHAMonitor.configureHeartBeatParams(heartBeatUpdateMaxRetries, heartBeatUpdateRetrySleep, heartBeatUpdateTimeout, heartBeatFailureAction);
1337+
}
1338+
13141339
private void configureAgentHooks(final Map<String, Object> params) {
13151340
String value = (String) params.get("agent.hooks.basedir");
13161341
if (null != value) {

plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/resource/LibvirtVMDef.java

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,8 @@ public class LibvirtVMDef {
3838
private String _platformEmulator;
3939
private final Map<String, Object> components = new HashMap<String, Object>();
4040

41+
public static final String MANUFACTURER_APACHE = "Apache Software Foundation";
42+
4143
public static class GuestDef {
4244
enum GuestType {
4345
KVM, XEN, EXE, LXC
@@ -170,7 +172,7 @@ public String toString() {
170172

171173
guestDef.append("<sysinfo type='smbios'>\n");
172174
guestDef.append("<system>\n");
173-
guestDef.append("<entry name='manufacturer'>Apache Software Foundation</entry>\n");
175+
guestDef.append("<entry name='manufacturer'>" + MANUFACTURER_APACHE + "</entry>\n");
174176
guestDef.append("<entry name='product'>CloudStack " + _type.toString() + " Hypervisor</entry>\n");
175177
guestDef.append("<entry name='uuid'>" + _uuid + "</entry>\n");
176178
guestDef.append("</system>\n");

plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/resource/wrapper/LibvirtSetHostParamsCommandWrapper.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ public Answer execute(final SetHostParamsCommand command, final LibvirtComputing
3636
final Map<String, String> params = command.getParams();
3737
boolean success = libvirtComputingResource.getVirtRouterResource().configureHostParams(params);
3838
success = success && libvirtComputingResource.configureHostParams(params);
39+
libvirtComputingResource.configureHeartBeatParams(params);
3940

4041
if (!success) {
4142
return new Answer(command, false, "Failed to set host parameters");

scripts/vm/hypervisor/kvm/kvmheartbeat.sh

Lines changed: 29 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,8 @@ help() {
2424
-h host
2525
-r write/read hb log
2626
-c cleanup
27+
-d destroy vms on mount point
28+
-s stop cloudstack-agent
2729
-t interval between read hb log\n"
2830
exit 1
2931
}
@@ -35,8 +37,10 @@ HostIP=
3537
interval=
3638
rflag=0
3739
cflag=0
40+
dflag=0
41+
sflag=0
3842

39-
while getopts 'i:p:m:h:t:rc' OPTION
43+
while getopts 'i:p:m:h:t:rcds' OPTION
4044
do
4145
case $OPTION in
4246
i)
@@ -58,7 +62,13 @@ do
5862
interval="$OPTARG"
5963
;;
6064
c)
61-
cflag=1
65+
cflag=1
66+
;;
67+
d)
68+
dflag=1
69+
;;
70+
s)
71+
sflag=1
6272
;;
6373
*)
6474
help
@@ -88,13 +98,15 @@ deleteVMs() {
8898

8999
for pid in $vmPids
90100
do
101+
vmname=$(ps a -q $pid |tail -n1 |awk '{print $8}')
91102
kill -9 $pid &> /dev/null
103+
/usr/bin/logger -t heartbeat "Killed vm $vmname with pid $pid"
92104
done
93105
}
94106

95107
#checking is there the same nfs server mounted under $MountPoint?
96108
mounts=$(cat /proc/mounts |grep nfs|grep $MountPoint)
97-
if [ $? -gt 0 ]
109+
if [ $? -gt 0 ] && [ "$cflag" == "0" ] && [ "$dflag" == "0" ] && [ "$sflag" == "0" ]
98110
then
99111
# remount it
100112
mount $NfsSvrIP:$NfsSvrPath $MountPoint -o sync,soft,proto=tcp,acregmin=0,acregmax=0,acdirmin=0,acdirmax=0,noac,timeo=133,retrans=10 &> /dev/null
@@ -161,6 +173,20 @@ then
161173
sleep 5
162174
echo b > /proc/sysrq-trigger
163175
exit $?
176+
elif [ "$dflag" == "1" ]
177+
then
178+
/usr/bin/logger -t heartbeat "kvmheartbeat.sh will destroy vms on mount point $MountPoint because it was unable to write the heartbeat to the storage."
179+
sync &
180+
sleep 5
181+
deleteVMs $MountPoint
182+
exit $?
183+
elif [ "$sflag" == "1" ]
184+
then
185+
/usr/bin/logger -t heartbeat "kvmheartbeat.sh will stop cloudstack-agent because it was unable to write the heartbeat to the storage."
186+
sync &
187+
sleep 5
188+
service cloudstack-agent stop
189+
exit $?
164190
else
165191
write_hbLog
166192
exit $?

server/src/main/java/com/cloud/configuration/ConfigurationManagerImpl.java

Lines changed: 58 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@
4141
import javax.inject.Inject;
4242
import javax.naming.ConfigurationException;
4343

44+
import com.google.common.base.Strings;
4445
import com.googlecode.ipv6.IPv6Address;
4546
import org.apache.cloudstack.acl.SecurityChecker;
4647
import org.apache.cloudstack.affinity.AffinityGroup;
@@ -90,7 +91,6 @@
9091
import org.apache.cloudstack.framework.config.dao.ConfigurationDao;
9192
import org.apache.cloudstack.framework.config.impl.ConfigurationVO;
9293
import org.apache.cloudstack.framework.messagebus.MessageBus;
93-
import org.apache.cloudstack.framework.messagebus.MessageSubscriber;
9494
import org.apache.cloudstack.framework.messagebus.PublishScope;
9595
import org.apache.cloudstack.query.QueryService;
9696
import org.apache.cloudstack.region.PortableIp;
@@ -425,6 +425,11 @@ public class ConfigurationManagerImpl extends ManagerBase implements Configurati
425425
private Set<String> overprovisioningFactorsForValidation;
426426
public static final String VM_USERDATA_MAX_LENGTH_STRING = "vm.userdata.max.length";
427427

428+
public static final String KVM_HEARTBEAT_UPDATE_MAX_RETRIES = "kvm.heartbeat.update.max.retries";
429+
public static final String KVM_HEARTBEAT_UPDATE_RETRY_SLEEP = "kvm.heartbeat.update.retry.sleep";
430+
public static final String KVM_HEARTBEAT_UPDATE_TIMEOUT = "kvm.heartbeat.update.timeout";
431+
public static final String KVM_HEARTBEAT_FAILURE_ACTION = "kvm.heartbeat.failure.action";
432+
428433
public static final ConfigKey<Boolean> SystemVMUseLocalStorage = new ConfigKey<Boolean>(Boolean.class, "system.vm.use.local.storage", "Advanced", "false",
429434
"Indicates whether to use local storage pools or shared storage pools for system VMs.", false, ConfigKey.Scope.Zone, null);
430435

@@ -459,6 +464,20 @@ public class ConfigurationManagerImpl extends ManagerBase implements Configurati
459464
public static final ConfigKey<Boolean> MIGRATE_VM_ACROSS_CLUSTERS = new ConfigKey<Boolean>(Boolean.class, "migrate.vm.across.clusters", "Advanced", "false",
460465
"Indicates whether the VM can be migrated to different cluster if no host is found in same cluster",true, ConfigKey.Scope.Zone, null);
461466

467+
public static final ConfigKey<Long> KVM_HEARTBEAT_UPDATE_MAX_RETRIES_CK = new ConfigKey<>("Advanced", Long.class, KVM_HEARTBEAT_UPDATE_MAX_RETRIES, "5",
468+
"The maximum retries of kvm heartbeat to write to storage",
469+
true, ConfigKey.Scope.Global);
470+
471+
public static final ConfigKey<Long> KVM_HEARTBEAT_UPDATE_RETRY_SLEEP_CK = new ConfigKey<>("Advanced", Long.class, KVM_HEARTBEAT_UPDATE_RETRY_SLEEP, "10000",
472+
"The sleep time, in milliseconds, between two kvm heartbeats to write to storage",
473+
true, ConfigKey.Scope.Global);
474+
public static final ConfigKey<Long> KVM_HEARTBEAT_UPDATE_TIMEOUT_CK = new ConfigKey<>("Advanced", Long.class, KVM_HEARTBEAT_UPDATE_TIMEOUT, "60000",
475+
"Timeout(in milliseconds) that kvm heartbeat to write to storage",
476+
true, ConfigKey.Scope.Global);
477+
public static final ConfigKey<String> KVM_HEARTBEAT_FAILURE_ACTION_CK = new ConfigKey<>("Advanced", String.class, KVM_HEARTBEAT_FAILURE_ACTION, "hardreset",
478+
"The action for heartbeat write failures on KVM host. The valid value are 'hardreset' (default), 'stopagent', 'destroyvms'",
479+
true, ConfigKey.Scope.Global);
480+
462481
private static final String IOPS_READ_RATE = "IOPS Read";
463482
private static final String IOPS_WRITE_RATE = "IOPS Write";
464483
private static final String BYTES_READ_RATE = "Bytes Read";
@@ -514,6 +533,9 @@ private void populateConfigValuesForValidationSet() {
514533
configValuesForValidation.add(StorageManager.STORAGE_POOL_CLIENT_TIMEOUT.key());
515534
configValuesForValidation.add(StorageManager.STORAGE_POOL_CLIENT_MAX_CONNECTIONS.key());
516535
configValuesForValidation.add(VM_USERDATA_MAX_LENGTH_STRING);
536+
configValuesForValidation.add(KVM_HEARTBEAT_UPDATE_MAX_RETRIES);
537+
configValuesForValidation.add(KVM_HEARTBEAT_UPDATE_RETRY_SLEEP);
538+
configValuesForValidation.add(KVM_HEARTBEAT_UPDATE_TIMEOUT);
517539
}
518540

519541
private void weightBasedParametersForValidation() {
@@ -546,23 +568,30 @@ private void overProvisioningFactorsForValidation() {
546568
}
547569

548570
private void initMessageBusListener() {
549-
messageBus.subscribe(EventTypes.EVENT_CONFIGURATION_VALUE_EDIT, new MessageSubscriber() {
550-
@Override
551-
public void onPublishMessage(String serderAddress, String subject, Object args) {
552-
String globalSettingUpdated = (String) args;
553-
if (StringUtils.isEmpty(globalSettingUpdated)) {
554-
return;
555-
}
556-
if (globalSettingUpdated.equals(ApiServiceConfiguration.ManagementServerAddresses.key()) ||
557-
globalSettingUpdated.equals(IndirectAgentLBServiceImpl.IndirectAgentLBAlgorithm.key())) {
558-
_indirectAgentLB.propagateMSListToAgents();
559-
} else if (globalSettingUpdated.equals(Config.RouterAggregationCommandEachTimeout.toString())
560-
|| globalSettingUpdated.equals(Config.MigrateWait.toString())) {
561-
Map<String, String> params = new HashMap<String, String>();
562-
params.put(Config.RouterAggregationCommandEachTimeout.toString(), _configDao.getValue(Config.RouterAggregationCommandEachTimeout.toString()));
563-
params.put(Config.MigrateWait.toString(), _configDao.getValue(Config.MigrateWait.toString()));
564-
_agentManager.propagateChangeToAgents(params);
565-
}
571+
Map<String, ConfigKey> configKeyMap = new HashMap<>();
572+
configKeyMap.put(KVM_HEARTBEAT_UPDATE_MAX_RETRIES, KVM_HEARTBEAT_UPDATE_MAX_RETRIES_CK);
573+
configKeyMap.put(KVM_HEARTBEAT_UPDATE_RETRY_SLEEP, KVM_HEARTBEAT_UPDATE_RETRY_SLEEP_CK);
574+
configKeyMap.put(KVM_HEARTBEAT_UPDATE_TIMEOUT, KVM_HEARTBEAT_UPDATE_TIMEOUT_CK);
575+
configKeyMap.put(KVM_HEARTBEAT_FAILURE_ACTION, KVM_HEARTBEAT_FAILURE_ACTION_CK);
576+
messageBus.subscribe(EventTypes.EVENT_CONFIGURATION_VALUE_EDIT, (serverAddress, subject, args) -> {
577+
String globalSettingUpdated = (String) args;
578+
if (Strings.isNullOrEmpty(globalSettingUpdated)) {
579+
return;
580+
}
581+
if (globalSettingUpdated.equals(ApiServiceConfiguration.ManagementServerAddresses.key()) ||
582+
globalSettingUpdated.equals(IndirectAgentLBServiceImpl.IndirectAgentLBAlgorithm.key())) {
583+
_indirectAgentLB.propagateMSListToAgents();
584+
} else if (globalSettingUpdated.equals(Config.RouterAggregationCommandEachTimeout.toString())
585+
|| globalSettingUpdated.equals(Config.MigrateWait.toString())) {
586+
Map<String, String> params = new HashMap<String, String>();
587+
params.put(Config.RouterAggregationCommandEachTimeout.toString(), _configDao.getValue(Config.RouterAggregationCommandEachTimeout.toString()));
588+
params.put(Config.MigrateWait.toString(), _configDao.getValue(Config.MigrateWait.toString()));
589+
_agentManager.propagateChangeToAgents(params);
590+
} else if (configKeyMap.keySet().contains(globalSettingUpdated)) {
591+
ConfigKey configKey = configKeyMap.get(globalSettingUpdated);
592+
Map<String, String> params = new HashMap<String, String>();
593+
params.put(configKey.key(), configKey.value().toString());
594+
_agentManager.propagateChangeToAgents(params);
566595
}
567596
});
568597
}
@@ -928,6 +957,15 @@ private String validateConfigurationValue(final String name, String value, final
928957
return errMsg;
929958
}
930959

960+
if (KVM_HEARTBEAT_FAILURE_ACTION.equalsIgnoreCase(name)) {
961+
List<String> kvmHeartBeatFailureActions = Arrays.asList("hardreset", "destroyvms", "stopagent");
962+
if (value == null || ! kvmHeartBeatFailureActions.contains(value.toLowerCase())) {
963+
final String msg = "Possible values for " + name + " are - " + Arrays.toString(kvmHeartBeatFailureActions.toArray());
964+
s_logger.error(msg);
965+
throw new InvalidParameterValueException(msg);
966+
}
967+
}
968+
931969
if (value == null) {
932970
if (type.equals(Boolean.class)) {
933971
return "Please enter either 'true' or 'false'.";
@@ -7047,7 +7085,8 @@ public ConfigKey<?>[] getConfigKeys() {
70477085
return new ConfigKey<?>[] {SystemVMUseLocalStorage, IOPS_MAX_READ_LENGTH, IOPS_MAX_WRITE_LENGTH,
70487086
BYTES_MAX_READ_LENGTH, BYTES_MAX_WRITE_LENGTH, ADD_HOST_ON_SERVICE_RESTART_KVM, SET_HOST_DOWN_TO_MAINTENANCE, VM_SERVICE_OFFERING_MAX_CPU_CORES,
70497087
VM_SERVICE_OFFERING_MAX_RAM_SIZE, VM_USERDATA_MAX_LENGTH, MIGRATE_VM_ACROSS_CLUSTERS,
7050-
ENABLE_ACCOUNT_SETTINGS_FOR_DOMAIN, ENABLE_DOMAIN_SETTINGS_FOR_CHILD_DOMAIN
7088+
ENABLE_ACCOUNT_SETTINGS_FOR_DOMAIN, ENABLE_DOMAIN_SETTINGS_FOR_CHILD_DOMAIN,
7089+
KVM_HEARTBEAT_UPDATE_MAX_RETRIES_CK, KVM_HEARTBEAT_UPDATE_RETRY_SLEEP_CK, KVM_HEARTBEAT_UPDATE_TIMEOUT_CK, KVM_HEARTBEAT_FAILURE_ACTION_CK
70517090
};
70527091
}
70537092
}

0 commit comments

Comments
 (0)