@@ -43,10 +43,10 @@ public class KVMHAMonitor extends KVMHABase implements Runnable {
4343 private final Map <String , NfsStoragePool > storagePool = new ConcurrentHashMap <>();
4444 private Set <String > removedPools = new HashSet <>();
4545 private final boolean rebootHostAndAlertManagementOnHeartbeatTimeout ;
46- private final Map <String , CheckPoolThread > _storagePoolCheckThreads = new HashMap <String , CheckPoolThread >();
47- private final Map <String , String > _storagePoolCheckStatus = new HashMap <String , String >();
48- private final static String STATUS_RUNNING = "Running" ;
49- private final static String STATUS_TERMINATED = "Terminated" ;
46+ private final Map <String , CheckPoolThread > storagePoolCheckThreads = new HashMap <>();
47+ private final Map <String , String > storagePoolCheckStatus = new HashMap <>();
48+ private static final String STATUS_RUNNING = "Running" ;
49+ private static final String STATUS_TERMINATED = "Terminated" ;
5050
5151 private final String hostPrivateIp ;
5252
@@ -57,7 +57,7 @@ public KVMHAMonitor(NfsStoragePool pool, String host, String scriptPath) {
5757 hostPrivateIp = host ;
5858 configureHeartBeatPath (scriptPath );
5959
60- _heartBeatUpdateTimeout = AgentPropertiesFileHandler .getPropertyValue (AgentProperties .HEARTBEAT_UPDATE_TIMEOUT );
60+ s_heartBeatUpdateTimeout = AgentPropertiesFileHandler .getPropertyValue (AgentProperties .HEARTBEAT_UPDATE_TIMEOUT );
6161 rebootHostAndAlertManagementOnHeartbeatTimeout = AgentPropertiesFileHandler .getPropertyValue (AgentProperties .REBOOT_HOST_AND_ALERT_MANAGEMENT_ON_HEARTBEAT_TIMEOUT );
6262 }
6363
@@ -72,16 +72,16 @@ public static synchronized void configureHeartBeatParams(Long heartBeatUpdateMax
7272 s_logger .debug (String .format ("Configuring heartbeat params: max retries = %s, retry sleep = %s, timeout = %s, action = %s" ,
7373 heartBeatUpdateMaxTries , heartBeatUpdateRetrySleep , heartBeatUpdateTimeout , heartBeatFailureAction ));
7474 if (heartBeatUpdateMaxTries != null ) {
75- KVMHABase ._heartBeatUpdateMaxRetries = heartBeatUpdateMaxTries ;
75+ KVMHABase .s_heartBeatUpdateMaxRetries = heartBeatUpdateMaxTries ;
7676 }
7777 if (heartBeatUpdateRetrySleep != null ) {
78- KVMHABase ._heartBeatUpdateRetrySleep = heartBeatUpdateRetrySleep ;
78+ KVMHABase .s_heartBeatUpdateRetrySleep = heartBeatUpdateRetrySleep ;
7979 }
8080 if (heartBeatUpdateTimeout != null ) {
81- KVMHABase ._heartBeatUpdateTimeout = heartBeatUpdateTimeout ;
81+ KVMHABase .s_heartBeatUpdateTimeout = heartBeatUpdateTimeout ;
8282 }
8383 if (heartBeatFailureAction != null ) {
84- KVMHABase ._heartBeatFailureAction = heartBeatFailureAction ;
84+ KVMHABase .s_heartBeatFailureAction = heartBeatFailureAction ;
8585 }
8686 }
8787
@@ -128,13 +128,13 @@ public void runInContext() {
128128 private void check () {
129129 if (! storagePool .containsKey (primaryStoragePool ._poolUUID )) {
130130 s_logger .info ("Removing check on storage pool as it has been removed: " + primaryStoragePool ._poolUUID );
131- _storagePoolCheckStatus .remove (primaryStoragePool ._poolUUID );
132- _storagePoolCheckThreads .remove (primaryStoragePool ._poolUUID );
131+ storagePoolCheckStatus .remove (primaryStoragePool ._poolUUID );
132+ storagePoolCheckThreads .remove (primaryStoragePool ._poolUUID );
133133 Thread .currentThread ().interrupt ();
134134 return ;
135135 }
136136
137- if (_storagePoolCheckStatus .containsKey (primaryStoragePool ._poolUUID )) {
137+ if (storagePoolCheckStatus .containsKey (primaryStoragePool ._poolUUID )) {
138138 s_logger .info ("Ignoring check on storage pool: " + primaryStoragePool ._poolUUID );
139139 return ;
140140 }
@@ -143,67 +143,58 @@ private void check() {
143143
144144 String result = null ;
145145 // Try multiple times, but sleep in between tries to ensure it isn't a short lived transient error
146- for (int i = 1 ; i <= _heartBeatUpdateMaxRetries ; i ++) {
147- s_logger .info (String .format ("Trying to write heartbeat to pool %s %s of %s times" , primaryStoragePool ._mountDestPath , i , _heartBeatUpdateMaxRetries ));
146+ for (int i = 1 ; i <= s_heartBeatUpdateMaxRetries ; i ++) {
147+ s_logger .info (String .format ("Trying to write heartbeat to pool %s %s of %s times" , primaryStoragePool ._mountDestPath , i , s_heartBeatUpdateMaxRetries ));
148148 Script cmd = createHeartBeatCommand (primaryStoragePool , hostPrivateIp , true );
149149 result = cmd .execute ();
150150 s_logger .debug (String .format ("The command (%s), to the pool [%s], has the result [%s]." , cmd .toString (), primaryStoragePool ._poolUUID , result ));
151151 if (result != null ) {
152- s_logger .warn (String .format ("Write heartbeat for pool [%s] failed: %s; try: %s of %s." , primaryStoragePool ._poolUUID , result , i , _heartBeatUpdateMaxRetries ));
153- _storagePoolCheckStatus .put (primaryStoragePool ._poolUUID , STATUS_RUNNING );
154- if (i < _heartBeatUpdateMaxRetries ) {
155- while (true ) {
156- try {
157- Thread .currentThread ().sleep (_heartBeatUpdateRetrySleep );
158- break ;
159- } catch (InterruptedException e ) {
160- s_logger .debug ("[ignored] interupted between heartbeat retries with error message: " + e .getMessage ());
161- }
152+ s_logger .warn (String .format ("Write heartbeat for pool [%s] failed: %s; try: %s of %s." , primaryStoragePool ._poolUUID , result , i , s_heartBeatUpdateMaxRetries ));
153+ storagePoolCheckStatus .put (primaryStoragePool ._poolUUID , STATUS_RUNNING );
154+ if (i < s_heartBeatUpdateMaxRetries ) {
155+ try {
156+ Thread .sleep (s_heartBeatUpdateRetrySleep );
157+ break ;
158+ } catch (InterruptedException e ) {
159+ s_logger .debug ("[ignored] interrupted between heartbeat retries with error message: " + e .getMessage ());
162160 }
163161 }
164162 } else {
165- _storagePoolCheckStatus .remove (primaryStoragePool ._poolUUID );
163+ storagePoolCheckStatus .remove (primaryStoragePool ._poolUUID );
166164 break ;
167165 }
168166 }
169167
170168 if (result != null ) {
171- // Perform action if can't write to heartbeat file.
169+ // Perform action if it can't write to heartbeat file.
172170 // This will raise an alert on the mgmt server
173- s_logger .warn ("write heartbeat failed: " + result );
174- if (HeartBeatAction .NOACTION .equals (s_heartBeatFailureAction )) {
175- s_logger .warn ("No action will be performed on storage pool: " + primaryStoragePool ._poolUUID );
176- _storagePoolCheckStatus .remove (primaryStoragePool ._poolUUID );
177- return true ;
178- }
171+ s_logger .warn (String .format ("write heartbeat for pool [%s] failed: %s" , primaryStoragePool ._poolUUID , result ));
179172
180173 performAction (primaryStoragePool );
181- _storagePoolCheckStatus .put (primaryStoragePool ._poolUUID , STATUS_TERMINATED );
182- s_logger .debug ("End performing action " + _heartBeatFailureAction + " on storage pool: " + primaryStoragePool ._poolUUID );
174+ storagePoolCheckStatus .put (primaryStoragePool ._poolUUID , STATUS_TERMINATED );
175+ s_logger .debug ("End performing action " + s_heartBeatFailureAction + " on storage pool: " + primaryStoragePool ._poolUUID );
183176 return ;
184177 }
185178
186179 s_logger .debug ("End checking on storage pool " + primaryStoragePool ._poolUUID );
187180 }
188181
189182 private void performAction (NfsStoragePool primaryStoragePool ) {
190- s_logger .warn ("Performing action " + _heartBeatFailureAction + " on storage pool: " + primaryStoragePool ._poolUUID );
191-
183+ s_logger .warn ("Performing action " + s_heartBeatFailureAction + " on storage pool: " + primaryStoragePool ._poolUUID );
192184 Script cmd = createHeartBeatCommand (primaryStoragePool , null , false );
193- // give priority to action defined in agent.properties file
194- if (rebootHostAndAlertManagementOnHeartbeatTimeout ) {
195- s_logger .warn (String . format ( "Write heartbeat for pool [%s] failed; stopping cloudstack-agent." , primaryStoragePool ._poolUUID ) );
196- cmd . execute ( );
185+
186+ if (HeartBeatAction . NOACTION . equals ( s_heartBeatFailureAction ) ) {
187+ s_logger .warn ("No action will be performed on storage pool: " + primaryStoragePool ._poolUUID );
188+ storagePoolCheckStatus . remove ( primaryStoragePool . _poolUUID );
197189 return ;
198190 }
199191
200- if (HeartBeatAction .DESTROYVMS .equals (_heartBeatFailureAction )
201- || HeartBeatAction .HARDRESET .equals (_heartBeatFailureAction )) {
202- String destroyvmsCmd = "ps aux | grep '" + LibvirtVMDef .MANUFACTURER_APACHE + "' | grep -v ' grep '" ;
203- if (HeartBeatAction .DESTROYVMS .equals (_heartBeatFailureAction )) {
204- destroyvmsCmd += " | grep " + primaryStoragePool ._mountDestPath ;
205- }
206- destroyvmsCmd += " | awk '{print $14}' | tr '\n ' ','" ;
192+ if (HeartBeatAction .DESTROYVMS .equals (s_heartBeatFailureAction )) {
193+ String destroyvmsCmd = "ps aux | grep '" + LibvirtVMDef .MANUFACTURER_APACHE +
194+ "' | grep -v ' grep '" + " | grep " + primaryStoragePool ._mountDestPath +
195+ " | awk '{print $14}' | tr '\n ' ','" ;
196+
197+ // display the vm names which are going to be destroyed
207198 String destroyvms = Script .runSimpleBashScript (destroyvmsCmd );
208199 if (destroyvms != null ) {
209200 s_logger .warn ("The following vms will be destroyed: " + destroyvms );
@@ -213,20 +204,18 @@ private void performAction(NfsStoragePool primaryStoragePool) {
213204 }
214205
215206 // take action according to global setting
216- cmd .add (_heartBeatFailureAction .getFlag ());
207+ cmd .add (s_heartBeatFailureAction .getFlag ());
217208 cmd .execute ();
218209 }
219210
220211 private Script createHeartBeatCommand (NfsStoragePool primaryStoragePool , String hostPrivateIp , boolean hostValidation ) {
221- Script cmd = new Script (s_heartBeatPath , _heartBeatUpdateTimeout , s_logger );
212+ Script cmd = new Script (s_heartBeatPath , s_heartBeatUpdateTimeout , s_logger );
222213 cmd .add ("-i" , primaryStoragePool ._poolIp );
223214 cmd .add ("-p" , primaryStoragePool ._poolMountSourcePath );
224215 cmd .add ("-m" , primaryStoragePool ._mountDestPath );
225216
226217 if (hostValidation ) {
227218 cmd .add ("-h" , hostPrivateIp );
228- } else {
229- cmd .add ("-c" );
230219 }
231220
232221 return cmd ;
@@ -268,7 +257,7 @@ private boolean checkPoolValidity(String uuid) {
268257 protected void runInContext () {
269258 synchronized (storagePool ) {
270259 for (String uuid : storagePool .keySet ()) {
271- if (_storagePoolCheckThreads .containsKey (uuid )) {
260+ if (storagePoolCheckThreads .containsKey (uuid )) {
272261 s_logger .trace ("Ignoring check on storage pool as there is already a thread: " + uuid );
273262 continue ;
274263 }
@@ -278,30 +267,12 @@ protected void runInContext() {
278267 uuid , primaryStoragePool ._poolIp , primaryStoragePool ._poolMountSourcePath , primaryStoragePool ._mountDestPath ));
279268
280269 CheckPoolThread checkPoolThread = new CheckPoolThread (primaryStoragePool );
281- _storagePoolCheckThreads .put (uuid , checkPoolThread );
270+ storagePoolCheckThreads .put (uuid , checkPoolThread );
282271 checkPoolThread .runInContext ();
283272 } else {
284273 removedPools .add (uuid );
285274 }
286275 }
287-
288- if (! _storagePoolCheckStatus .isEmpty ()) {
289- boolean isAllTerminated = true ;
290- for (Map .Entry <String , String > entry : _storagePoolCheckStatus .entrySet ()) {
291- String status = entry .getValue ();
292- s_logger .debug (String .format ("State of check thread for pool %s is %s" , entry .getKey (), status ));
293- if (!status .equals (STATUS_TERMINATED )) {
294- isAllTerminated = false ;
295- }
296- }
297- if (isAllTerminated ) {
298- s_logger .debug ("All heartbeat check threads on pools with issues are terminated, stopping cloudstack-agent" );
299- Script cmd = new Script ("/bin/systemctl" , s_logger );
300- cmd .add ("stop" );
301- cmd .add ("cloudstack-agent" );
302- cmd .execute ();
303- }
304- }
305276 }
306277
307278 if (!removedPools .isEmpty ()) {
@@ -316,7 +287,7 @@ protected void runInContext() {
316287 @ Override
317288 public void run () {
318289 ScheduledExecutorService haMonitor = Executors .newSingleThreadScheduledExecutor ();
319- haMonitor .scheduleAtFixedRate (new Monitor (), 0 , _heartBeatUpdateFreq , TimeUnit .SECONDS );
290+ haMonitor .scheduleAtFixedRate (new Monitor (), 0 , s_heartBeatUpdateFreq , TimeUnit .MILLISECONDS );
320291 }
321292
322293}
0 commit comments