Skip to content

Commit 023dcec

Browse files
Slair1, DaanHoogland
authored and committed
CLOUDSTACK-10310 Fix KVM reboot on storage issue (#2722)
1 parent 9b772db commit 023dcec

File tree

3 files changed

+15
-6
lines changed

3 files changed

+15
-6
lines changed

plugins/hypervisors/kvm/src/com/cloud/hypervisor/kvm/resource/KVMHABase.java

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -34,7 +34,8 @@ public class KVMHABase {
3434
protected static String s_heartBeatPath;
3535
protected long _heartBeatUpdateTimeout = 60000;
3636
protected long _heartBeatUpdateFreq = 60000;
37-
protected long _heartBeatUpdateMaxRetry = 3;
37+
protected long _heartBeatUpdateMaxTries = 5;
38+
protected long _heartBeatUpdateRetrySleep = 15000;
3839

3940
public static enum PoolType {
4041
PrimaryStorage, SecondaryStorage

plugins/hypervisors/kvm/src/com/cloud/hypervisor/kvm/resource/KVMHAMonitor.java

Lines changed: 11 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -119,22 +119,30 @@ protected void runInContext() {
119119
}
120120

121121
String result = null;
122-
for (int i = 0; i < 5; i++) {
122+
// Try multiple times, but sleep in between tries to ensure it isn't a short lived transient error
123+
for (int i = 1; i <= _heartBeatUpdateMaxTries; i++) {
123124
Script cmd = new Script(s_heartBeatPath, _heartBeatUpdateTimeout, s_logger);
124125
cmd.add("-i", primaryStoragePool._poolIp);
125126
cmd.add("-p", primaryStoragePool._poolMountSourcePath);
126127
cmd.add("-m", primaryStoragePool._mountDestPath);
127128
cmd.add("-h", _hostIP);
128129
result = cmd.execute();
129130
if (result != null) {
130-
s_logger.warn("write heartbeat failed: " + result + ", retry: " + i);
131+
s_logger.warn("write heartbeat failed: " + result + ", try: " + i + " of " + _heartBeatUpdateMaxTries);
132+
try {
133+
Thread.sleep(_heartBeatUpdateRetrySleep);
134+
} catch (InterruptedException e) {
135+
s_logger.debug("[ignored] interupted between heartbeat retries.");
136+
}
131137
} else {
132138
break;
133139
}
134140
}
135141

136142
if (result != null) {
137-
s_logger.warn("write heartbeat failed: " + result + "; reboot the host");
143+
// Stop cloudstack-agent if can't write to heartbeat file.
144+
// This will raise an alert on the mgmt server
145+
s_logger.warn("write heartbeat failed: " + result + "; stopping cloudstack-agent");
138146
Script cmd = new Script(s_heartBeatPath, _heartBeatUpdateTimeout, s_logger);
139147
cmd.add("-i", primaryStoragePool._poolIp);
140148
cmd.add("-p", primaryStoragePool._poolMountSourcePath);

scripts/vm/hypervisor/kvm/kvmheartbeat.sh

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -155,10 +155,10 @@ then
155155
exit 0
156156
elif [ "$cflag" == "1" ]
157157
then
158-
/usr/bin/logger -t heartbeat "kvmheartbeat.sh rebooted system because it was unable to write the heartbeat to the storage."
158+
/usr/bin/logger -t heartbeat "kvmheartbeat.sh stopped cloudstack-agent because it was unable to write the heartbeat to the storage."
159159
sync &
160160
sleep 5
161-
echo b > /proc/sysrq-trigger
161+
service cloudstack-agent stop
162162
exit $?
163163
else
164164
write_hbLog

0 commit comments

Comments
 (0)