From 1547da47e0e4b5e1179699cc3fe1e2f7a03403c8 Mon Sep 17 00:00:00 2001
From: Carlo Lobrano
Date: Mon, 4 Aug 2025 18:31:39 +0200
Subject: [PATCH] Add watch-pcs-and-etcd helper for monitoring cluster status

The script continuously displays both the Pacemaker (pcs) status and the
etcd member list, automatically switching to another cluster node when the
current one stops responding. Nodes are discovered from an Ansible
inventory file (inventory.ini) or, failing that, from the running virsh
domains.
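The helper forwards any extra command-line arguments to `etcdctl member
list`, so the output format can be adjusted (e.g. `-w table` with
etcdctl v3). Illustrative invocations:

    ./helpers/watch-pcs-and-etcd.sh
    ./helpers/watch-pcs-and-etcd.sh -w table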
---
 helpers/watch-pcs-and-etcd.sh | 198 ++++++++++++++++++++++++++++++++++
 1 file changed, 198 insertions(+)
 create mode 100755 helpers/watch-pcs-and-etcd.sh

diff --git a/helpers/watch-pcs-and-etcd.sh b/helpers/watch-pcs-and-etcd.sh
new file mode 100755
index 0000000..351b13c
--- /dev/null
+++ b/helpers/watch-pcs-and-etcd.sh
@@ -0,0 +1,198 @@
+#!/usr/bin/env bash
+# Continuously display the PCS status and the etcd member list, switching
+# between the available nodes when the current one stops responding.
+# Assumes the SSH user (core) has passwordless sudo on the cluster nodes.
+
+trap 'echo "Interrupted. Exiting..."; exit 0' SIGINT
+
+# Node discovery functions
+get_nodes_from_inventory() {
+    local inventory_file="inventory.ini"
+    if [ ! -f "$inventory_file" ]; then
+        return 1
+    fi
+
+    # Parse the ansible inventory for nodes in the [cluster_nodes] section.
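+    # An inventory with a [cluster_nodes] section such as the following
+    # (illustrative values) yields one "core@<IP>" entry per host:
+    #
+    #   [cluster_nodes]
+    #   core@192.168.111.20 ansible_ssh_extra_args=...
+    #   192.168.111.21 ansible_user=core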
+    awk '
+        /^\[cluster_nodes\]$/ { in_section=1; next }
+        /^\[/ { in_section=0 }
+        in_section && /^core@[0-9]/ {
+            # Lines like: core@192.168.111.20 ansible_ssh_extra_args=...
+            print $1
+        }
+        in_section && /^[0-9]/ {
+            # Lines like: 192.168.111.20 ansible_user=core
+            print "core@" $1
+        }
+    ' "$inventory_file"
+}
+
+get_nodes_from_virsh() {
+    # Get VM IPs from virsh - dynamically discover the running VMs
+    local nodes=()
+
+    # Get the list of running VMs. A virsh failure inside the process
+    # substitution is not visible to mapfile, so check for an empty list instead.
+    local vms
+    mapfile -t vms < <(virsh list --state-running --name 2>/dev/null)
+    if [ ${#vms[@]} -eq 0 ]; then
+        return 1
+    fi
+
+    # For each running VM, try to get its IP
+    for vm in "${vms[@]}"; do
+        [ -n "$vm" ] || continue # Skip empty lines
+
+        local ip mac
+        local vm_ip_found=false
+
+        # First try domifaddr (works with the default network DHCP)
+        if ip=$(virsh domifaddr "$vm" 2>/dev/null | awk '/ipv4/ {print $4}' | cut -d'/' -f1 | head -1); then
+            if [ -n "$ip" ]; then
+                nodes+=("core@$ip")
+                vm_ip_found=true
+            fi
+        fi
+
+        # If domifaddr found nothing, try the ARP table for custom bridge networks
+        if [ "$vm_ip_found" = false ]; then
+            # Get the MAC address on the tnfbm bridge network, which dev-scripts usually creates
+            if ! mac=$(virsh domiflist "$vm" 2>/dev/null | awk '/tnfbm/ {print $5}'); then
+                # Could not list the interfaces; move on to the next VM
+                continue
+            elif [ -z "$mac" ]; then
+                # No interface on the tnfbm bridge; move on to the next VM
+                continue
+            else
+                # Look up the IP in the ARP table using the MAC address
+                if ip=$(arp -a | grep -i "$mac" | awk '{print $2}' | tr -d '()' | head -1); then
+                    if [ -n "$ip" ] && [[ "$ip" =~ ^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$ ]]; then
+                        # Validate that this IP is actually reachable and belongs to a CoreOS node
+                        if timeout 3 ssh -o ConnectTimeout=2 -o BatchMode=yes "core@$ip" -- echo "ping" >/dev/null 2>&1; then
+                            nodes+=("core@$ip")
+                        fi
+                    fi
+                fi
+            fi
+        fi
+    done
+
+    if [ ${#nodes[@]} -gt 0 ]; then
+        printf '%s\n' "${nodes[@]}"
+    else
+        return 1
+    fi
+}
+
+discover_nodes() {
+    # Try the inventory file first, then virsh; give up if neither works
+    if get_nodes_from_inventory 2>/dev/null; then
+        echo "# Using nodes from inventory.ini" >&2
+    elif get_nodes_from_virsh 2>/dev/null; then
+        echo "# Using nodes discovered via virsh" >&2
+    else
+        echo "! Could not discover nodes" >&2
+        return 1
+    fi
+}
+
+# Initialize the nodes array. discover_nodes runs in a subshell here, so its
+# failure cannot stop the script by itself: check the result explicitly.
+echo "Discovering cluster nodes..."
+mapfile -t NODES < <(discover_nodes)
+if [ ${#NODES[@]} -eq 0 ]; then
+    echo "Could not discover any cluster node. Exiting." >&2
+    exit 1
+fi
+echo "Found ${#NODES[@]} nodes: ${NODES[*]}"
+working_node="${NODES[0]}"
+
+get_other_node() {
+    local current=$1
+    for i in "${!NODES[@]}"; do
+        if [ "${NODES[i]}" = "$current" ]; then
+            local next_index=$(( (i + 1) % ${#NODES[@]} ))
+            echo "${NODES[next_index]}"
+            return 0
+        fi
+    done
+    # Fallback if the current node is not found
+    echo "${NODES[0]}"
+}
+
+test_etcdctl_command() {
+    local node=$1
+    local timeout=3
+
+    # Test whether the etcdctl command works on this node
+    timeout "$timeout" ssh -o ConnectTimeout=2 "$node" -- \
+        sudo podman exec etcd etcdctl member list --command-timeout=1s >/dev/null 2>&1
+}
+
+watch_cluster() {
+    local retry_count=0
+    local max_retries=6
+
+    while true; do
+        echo "Getting Etcd member list from $working_node..."
+
+        # Use the node where etcdctl is able to respond
+        if test_etcdctl_command "$working_node"; then
+            echo "Starting watch on $working_node (press Ctrl+C to exit)..."
+
+            # We cannot use `watch` here: it would not notice some command failures
+            # and would leave us stuck instead of moving on to the other node.
+            while true; do
+                # Capture the output first to minimize screen flashing
+                local output_etcd
+                local header_etcd
+                local output_pcs
+                local header_pcs
+                local separator="========================================"
+
+                header_etcd="Etcd member list from $working_node ($(date)):"
+                header_pcs="Pacemaker status from $working_node ($(date)):"
+
+                if ! output_pcs=$(ssh -o ConnectTimeout=3 "$working_node" -- sudo pcs status 2>/dev/null); then
+                    clear
+                    echo "$header_pcs Command failed on $working_node, switching nodes..."
+                    break
+                fi
+
+                if ! output_etcd=$(ssh -o ConnectTimeout=3 "$working_node" -- \
+                    sudo podman exec etcd etcdctl member list "$@" --command-timeout=2s 2>/dev/null); then
+                    clear
+                    echo "$header_etcd Command failed on $working_node, switching nodes..."
+                    break
+                fi
+
+                # Success - clear quickly and display
+                clear
+
+                echo -e "\n$header_pcs"
+                echo "$separator"
+                echo -e "$output_pcs\n"
+
+                echo -e "\n$header_etcd"
+                echo "$separator"
+                echo "$output_etcd"
+
+                sleep 5
+            done
+        else
+            echo "Command test failed on $working_node, switching to the other node..."
+        fi
+
+        # Switch to the next node
+        working_node=$(get_other_node "$working_node")
+        retry_count=$((retry_count + 1))
+
+        if [ $retry_count -ge $max_retries ]; then
+            echo "All nodes failed after $max_retries attempts. Waiting 10 seconds before retrying..."
+            sleep 10
+            retry_count=0
+        else
+            sleep 1
+        fi
+    done
+}
+
+# Run the watch function, forwarding any extra arguments to `etcdctl member list`
+watch_cluster "$@"