Skip to content

Commit 27fc217

Browse files
committed
CLOUDSTACK-10310 Fix KVM reboot on storage issue
1 parent 8daf634 commit 27fc217

File tree

3 files changed

+15
-6
lines changed

3 files changed

+15
-6
lines changed

plugins/hypervisors/kvm/src/com/cloud/hypervisor/kvm/resource/KVMHABase.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,8 @@ public class KVMHABase {
3434
protected static String s_heartBeatPath;
3535
protected long _heartBeatUpdateTimeout = 60000;
3636
protected long _heartBeatUpdateFreq = 60000;
37-
protected long _heartBeatUpdateMaxRetry = 3;
37+
protected long _heartBeatUpdateMaxTries = 5;
38+
protected long _heartBeatUpdateRetrySleep = 15000;
3839

3940
public static enum PoolType {
4041
PrimaryStorage, SecondaryStorage

plugins/hypervisors/kvm/src/com/cloud/hypervisor/kvm/resource/KVMHAMonitor.java

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -115,22 +115,30 @@ protected void runInContext() {
115115
}
116116

117117
String result = null;
118-
for (int i = 0; i < 5; i++) {
118+
// Try multiple times, sleeping between tries, to rule out a short-lived transient error
119+
for (int i = 1; i <= _heartBeatUpdateMaxTries; i++) {
119120
Script cmd = new Script(s_heartBeatPath, _heartBeatUpdateTimeout, s_logger);
120121
cmd.add("-i", primaryStoragePool._poolIp);
121122
cmd.add("-p", primaryStoragePool._poolMountSourcePath);
122123
cmd.add("-m", primaryStoragePool._mountDestPath);
123124
cmd.add("-h", _hostIP);
124125
result = cmd.execute();
125126
if (result != null) {
126-
s_logger.warn("write heartbeat failed: " + result + ", retry: " + i);
127+
s_logger.warn("write heartbeat failed: " + result + ", try: " + i + " of " + _heartBeatUpdateMaxTries);
128+
try {
129+
Thread.sleep(_heartBeatUpdateRetrySleep);
130+
} catch (InterruptedException e) {
131+
s_logger.debug("[ignored] interupted between heartbeat retries.");
132+
}
127133
} else {
128134
break;
129135
}
130136
}
131137

132138
if (result != null) {
133-
s_logger.warn("write heartbeat failed: " + result + "; reboot the host");
139+
// Stop cloudstack-agent if the heartbeat file cannot be written.
140+
// This will raise an alert on the management server
141+
s_logger.warn("write heartbeat failed: " + result + "; stopping cloudstack-agent");
134142
Script cmd = new Script(s_heartBeatPath, _heartBeatUpdateTimeout, s_logger);
135143
cmd.add("-i", primaryStoragePool._poolIp);
136144
cmd.add("-p", primaryStoragePool._poolMountSourcePath);

scripts/vm/hypervisor/kvm/kvmheartbeat.sh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -155,10 +155,10 @@ then
155155
exit 0
156156
elif [ "$cflag" == "1" ]
157157
then
158-
/usr/bin/logger -t heartbeat "kvmheartbeat.sh rebooted system because it was unable to write the heartbeat to the storage."
158+
/usr/bin/logger -t heartbeat "kvmheartbeat.sh stopped cloudstack-agent because it was unable to write the heartbeat to the storage."
159159
sync &
160160
sleep 5
161-
echo b > /proc/sysrq-trigger
161+
service cloudstack-agent stop
162162
exit $?
163163
else
164164
write_hbLog

0 commit comments

Comments
 (0)