From 4c4c67c5b4d480d71fde302fbdf1798d49851828 Mon Sep 17 00:00:00 2001
From: Misbah Anjum N <misanjum@linux.vnet.ibm.com>
Date: Fri, 14 Feb 2025 15:25:19 +0530
Subject: [PATCH 1/2] multivm-stress:Update script to test all edgecases

This patch captures multiple edge cases to test multivm scenarios. The following updates are added:

add stress_time parameter to run stress test for n seconds before starting stress_events
add debug_dir parameter to save the debug files
add dump_options parameter to specify virsh dump type
update guest on_crash value to preserve in case of crash
add function check_call_traces to check for any call trace in dmesg
during stress, check for guest state and call traces every ten minutes
if any crashed vms, dump the vm to the debug_dir for further analysis
run stress_events in the remaining stable vms if present, else skip
check for error messages and fail the test if found

Signed-off-by: Misbah Anjum N <misanjum@linux.vnet.ibm.com>
---
 .../src/multivm_stress/multivm_stress.py      | 211 ++++++++++++++++--
 1 file changed, 193 insertions(+), 18 deletions(-)

diff --git a/libvirt/tests/src/multivm_stress/multivm_stress.py b/libvirt/tests/src/multivm_stress/multivm_stress.py
index cd74b0c122f..9f7107fa78e 100644
--- a/libvirt/tests/src/multivm_stress/multivm_stress.py
+++ b/libvirt/tests/src/multivm_stress/multivm_stress.py
@@ -1,8 +1,11 @@
 import logging as log
+import time
 
 from virttest import utils_stress
 from virttest import error_context
 from virttest import utils_test
+from virttest import virsh
+from virttest.libvirt_xml import vm_xml
 
 
 # Using as lower capital is not the best way to do, but this is just a
@@ -20,38 +23,210 @@ def run(test, params, env):
 
     guest_stress = params.get("guest_stress", "no") == "yes"
     host_stress = params.get("host_stress", "no") == "yes"
-    stress_events = params.get("stress_events", "reboot")
+    stress_events = params.get("stress_events", "")
+    stress_time = params.get("stress_time", "30")
+    debug_dir = params.get("debug_dir", "/home/")
+    dump_options = params.get("dump_options", "--memory-only --bypass-cache")
     vms = env.get_all_vms()
     vms_uptime_init = {}
+
     if "reboot" not in stress_events:
         for vm in vms:
             vms_uptime_init[vm.name] = vm.uptime()
-    stress_event = utils_stress.VMStressEvents(params, env)
+
     if guest_stress:
+        # change the on_crash value to "preserve" when guest crashes
+        for vm in vms:
+            logging.debug("Setting on_crash to preserve in %s" % vm.name)
+            vmxml = vm_xml.VMXML.new_from_inactive_dumpxml(vm.name)
+            if vm.is_alive():
+                vm.destroy(gracefully=False)
+            vmxml.on_crash = "preserve"
+            vmxml.sync()
+            vm.start()
+
         try:
             utils_test.load_stress("stress_in_vms", params=params, vms=vms)
         except Exception as err:
-            test.fail("Error running stress in vms: %s" % err)
+            test.fail("Error running stress in vms: %s" % str(err))
+
     if host_stress:
         if params.get("host_stress_args", ""):
             params["stress_args"] = params.get("host_stress_args")
         try:
             utils_test.load_stress("stress_on_host", params=params)
         except Exception as err:
-            test.fail("Error running stress in host: %s" % err)
-    try:
-        stress_event.run_threads()
-    finally:
-        stress_event.wait_for_threads()
-        if guest_stress:
-            utils_test.unload_stress("stress_in_vms", params=params, vms=vms)
-        if host_stress:
-            utils_test.unload_stress("stress_on_host", params=params)
-        if "reboot" not in stress_events:
-            fail = False
+            test.fail("Error running stress in host: %s" % str(err))
+
+    stress_timer = int(stress_time)
+    fail = False
+    found_traces = False
+    failed_vms = []
+    login_error_vms = []
+    unexpected_reboot_vms = []
+    error_message = ""
+
+    if guest_stress:
+        # check for any call traces in guest dmesg while stress is running
+        def check_call_traces(vm):
+            nonlocal stress_timer
+            found_trace = False
+            try:
+                retry_login = True
+                retry_times = 0
+                while retry_login:
+                    try:
+                        retry_login = False
+                        session = vm.wait_for_login(timeout=100)
+                        if vm in login_error_vms:
+                            login_error_vms.remove(vm)
+
+                    except Exception:
+                        stress_timer -= 150
+                        if vm in login_error_vms:
+                            return False
+
+                        retry_login = True
+                        retry_times += 1
+                        if retry_times == 3:
+                            logging.debug("Error in logging into %s" % vm.name)
+                            if vm not in login_error_vms:
+                                login_error_vms.append(vm)
+                            return False
+
+                        time.sleep(30)
+                        stress_timer -= 30
+
+                dmesg = session.cmd("dmesg")
+                dmesg_level = session.cmd("dmesg -l emerg,alert,crit")
+                if "Call Trace" in dmesg or len(dmesg_level) >= 1:
+                    logging.debug("Call trace found in %s" % vm.name)
+                    if vm not in failed_vms:
+                        failed_vms.append(vm)
+                    found_trace = True
+                session.close()
+
+            except Exception as err:
+                test.error("Error getting dmesg of %s due to %s" % (vm.name, str(err)))
+            return found_trace
+
+        # run stress for stress_time seconds
+        logging.debug("Sleeping for %s seconds waiting for stress completion" % stress_time)
+        stress_time = int(stress_time)
+
+        # check domstate of vms after stress_time
+        if stress_time < 600:
+            time.sleep(stress_time)
             for vm in vms:
-                if vm.uptime() < vms_uptime_init[vm.name]:
-                    logging.error("Unexpected reboot of VM: %s between test", vm.name)
+                if vm.state() != "running":
+                    logging.debug("%s state is %s" % (vm.name, vm.state()))
+                    failed_vms.append(vm)
                     fail = True
-            if fail:
-                test.fail("Unexpected VM reboot detected")
+                else:
+                    found_traces = check_call_traces(vm)
+                    if found_traces:
+                        fail = True
+                    time.sleep(2)
+
+        # check domstate of vms every 10 minutes (600 seconds) during stress_time
+        else:
+            all_failed = False
+            number_of_checks = int(stress_time / 600)
+            delta_time = int(stress_time % 600)
+            for itr in range(number_of_checks):
+                if len(failed_vms) == len(vms) or len(login_error_vms) == len(vms):
+                    all_failed = True
+                    break
+                if stress_timer <= 0:
+                    break
+                time.sleep(600)
+                for vm in vms:
+                    if vm.state() != "running":
+                        logging.debug("%s state is %s" % (vm.name, vm.state()))
+                        if vm not in failed_vms:
+                            failed_vms.append(vm)
+                        fail = True
+                    else:
+                        found_traces = check_call_traces(vm)
+                        if found_traces:
+                            fail = True
+                        time.sleep(3)
+                        stress_timer -= 3
+
+            if delta_time > 0 and stress_timer > 0 and not all_failed:
+                time.sleep(delta_time)
+                for vm in vms:
+                    if vm.state() != "running":
+                        logging.debug("%s state is %s" % (vm.name, vm.state()))
+                        if vm not in failed_vms:
+                            failed_vms.append(vm)
+                        fail = True
+                    else:
+                        found_traces = check_call_traces(vm)
+                        if found_traces:
+                            fail = True
+                        time.sleep(3)
+                        stress_timer -= 3
+
+        # virsh dump the failed vms into debug_dir
+        if fail:
+            for vm in failed_vms:
+                if vm.state() != "shut off":
+                    logging.debug("Dumping %s to debug_dir %s" % (vm.name, debug_dir))
+                    virsh.dump(vm.name, debug_dir+vm.name+"-core", dump_options, ignore_status=False, debug=True)
+                    logging.debug("Successfully dumped %s as %s-core" % (vm.name, vm.name))
+                else:
+                    logging.debug("Cannot dump %s as it is in shut off state" % vm.name)
+            failed_vms_string = ", ".join(vm.name for vm in failed_vms)
+            error_message = "Failure in " + failed_vms_string + " while running stress. "
+
+        if login_error_vms:
+            login_error_vms_string = ", ".join(vm.name for vm in login_error_vms)
+            error_message += "Login error in " + login_error_vms_string + " while running stress. "
+
+        if len(failed_vms) == len(vms) or len(login_error_vms) == len(vms):
+            error_message += "All vms in unstable state while running stress. Couldn't run STRESS EVENTS"
+            test.fail(error_message)
+
+    # run STRESS EVENTS in the remaining stable guests
+    if len(failed_vms) < len(vms) and len(login_error_vms) < len(vms):
+        for vm in failed_vms:
+            if vm in vms:
+                vms.remove(vm)
+        for vm in login_error_vms:
+            if vm in vms:
+                vms.remove(vm)
+
+        if len(vms) == 0:
+            error_message += "All vms in unstable state while running stress. Couldn't run STRESS EVENTS"
+            test.fail(error_message)
+
+        new_vms = ", ".join(vm.name for vm in vms)
+        try:
+            if stress_events != "":
+                logging.debug("Running stress_events in %s" % new_vms)
+                stress_event = utils_stress.VMStressEvents(params, env, vms)
+                stress_event.run_threads()
+                stress_event.wait_for_threads()
+
+            if guest_stress:
+                utils_test.unload_stress("stress_in_vms", params=params, vms=vms)
+
+            if host_stress:
+                utils_test.unload_stress("stress_on_host", params=params)
+
+            if "reboot" not in stress_events:
+                for vm in vms:
+                    if vm.uptime() < vms_uptime_init[vm.name]:
+                        logging.debug("Unexpected reboot of VM: %s between test", vm.name)
+                        unexpected_reboot_vms.append(vm)
+                unexpected_reboot_vms_string = ", ".join(vm.name for vm in unexpected_reboot_vms)
+                if unexpected_reboot_vms:
+                    error_message += "Unexpected reboot of guest(s) " + unexpected_reboot_vms_string + ". "
+
+        except Exception as err:
+            error_message += "Failure running STRESS EVENTS in " + new_vms + " due to" + str(err)
+
+    # check the test status
+    if error_message:
+        test.fail(error_message)
\ No newline at end of file

From 71bde6ec415ebb1288a7a75d527703bf22bf9646 Mon Sep 17 00:00:00 2001
From: Tasmiya Nalatwad <tasmiya@linux.vnet.ibm.com>
Date: Wed, 5 Mar 2025 18:01:22 +0530
Subject: [PATCH 2/2] Tests perform PCI_PT of NIC devices and checks ping And
 Tests pci device persistent across Multiple Reboot of VM 1. Changes made to
 support ppc64 arch and perform pci PT of network devices 2. Perform ping to
 other server ip and check the device network connectivity 3. Added test to
 check the device availability after multiple reboots of the guest

Signed-off-by: Tasmiya Nalatwad <tasmiya@linux.vnet.ibm.com>
---
 .../pci/libvirt_pci_passthrough.cfg           |   4 +
 .../pci/libvirt_pci_passthrough.py            | 116 +++++++++++-------
 2 files changed, 74 insertions(+), 46 deletions(-)

diff --git a/libvirt/tests/cfg/passthrough/pci/libvirt_pci_passthrough.cfg b/libvirt/tests/cfg/passthrough/pci/libvirt_pci_passthrough.cfg
index f0354e0c17a..cba3a9a29cb 100644
--- a/libvirt/tests/cfg/passthrough/pci/libvirt_pci_passthrough.cfg
+++ b/libvirt/tests/cfg/passthrough/pci/libvirt_pci_passthrough.cfg
@@ -45,3 +45,7 @@
             operation = "suspend"
         - passthrough_shutdown_start:
             operation = "shutdown"
+        - passthrough_multiple_reboots:
+            number_of_reboots = 15
+            operation = "reboot"
+            supported_err = "not supported by the connection driver: virDomainReboot"
diff --git a/libvirt/tests/src/passthrough/pci/libvirt_pci_passthrough.py b/libvirt/tests/src/passthrough/pci/libvirt_pci_passthrough.py
index 3fc9d9e9e55..f95c5a120c7 100644
--- a/libvirt/tests/src/passthrough/pci/libvirt_pci_passthrough.py
+++ b/libvirt/tests/src/passthrough/pci/libvirt_pci_passthrough.py
@@ -1,5 +1,6 @@
 import logging as log
 import ipaddress
+import platform
 import time
 
 from virttest import virsh, virt_vm
@@ -39,6 +40,8 @@ def run(test, params, env):
             i) Reboot.
             ii) Suspend/Resume.
             iii) Start/Shutdown.
+    d). Multiple Reboots:
+        1. Checking PCI Device remains persistent across multiple reboots
     """
 
     def guest_lifecycle():
@@ -93,6 +96,8 @@ def guest_lifecycle():
     sriov = ('yes' == params.get("libvirt_pci_SRIOV", 'no'))
     device_type = params.get("libvirt_pci_device_type", "NIC")
     vm_vfs = int(params.get("number_vfs", 2))
+    number_of_reboots = int(params.get("number_of_reboots", "1"))
+    arch = platform.machine()
     pci_dev = None
     pci_address = None
     bus_info = []
@@ -174,59 +179,78 @@ def guest_lifecycle():
         pci_address = pci_xml.cap.get_address_dict()
         vmxml.add_hostdev(pci_address)
 
-    try:
-        for itr in range(iteration):
-            logging.info("Currently executing iteration number: '%s'", itr)
-            vmxml.sync()
-            vm.start()
-            session = vm.wait_for_login()
-            # The Network configuration is generic irrespective of PF or SRIOV VF
-            if device_type == "NIC":
-                if sorted(vm.get_pci_devices()) != sorted(nic_list_before):
-                    logging.debug("Adapter passthroughed to guest successfully")
-                else:
-                    test.fail("Passthrough adapter not found in guest.")
-                net_ip = ipaddress.ip_address(net_ip)
-                nic_list_after = vm.get_pci_devices()
-                nic_list = list(set(nic_list_after).difference(set(nic_list_before)))
-                for val in range(len(nic_list)):
-                    bus_info.append(str(nic_list[val]).split(' ', 1)[0])
-                    nic_list[val] = str(nic_list[val]).split(' ', 1)[0][:-2]
-                bus_info.sort()
-                if not sriov:
-                    # check all functions get same iommu group
+    def check_device_status(net_ip, server_ip, netmask):
+        logging.info("Currently executing iteration number: '%s'", itr)
+        vmxml.sync()
+        vm.start()
+        session = vm.wait_for_login()
+        # The Network configuration is generic irrespective of PF or SRIOV VF
+        if device_type == "NIC":
+            if sorted(vm.get_pci_devices()) != sorted(nic_list_before):
+                logging.debug("Adapter passthroughed to guest successfully")
+            else:
+                test.fail("Passthrough adapter not found in guest.")
+            net_ip = ipaddress.ip_address(net_ip)
+            nic_list_after = vm.get_pci_devices()
+            nic_list = list(set(nic_list_after).difference(set(nic_list_before)))
+            for val in range(len(nic_list)):
+                bus_info.append(str(nic_list[val]).split(' ', 1)[0])
+                nic_list[val] = str(nic_list[val]).split(' ', 1)[0][:-2]
+            bus_info.sort()
+            if not sriov:
+                # check all functions get same iommu group
+                # arch ppc64 gets different iommu group when attached to VM
+                if arch != "ppc64le":
                     if len(set(nic_list)) != 1:
                         test.fail("Multifunction Device passthroughed but "
                                   "functions are in different iommu group")
-                # ping to server from each function
-                for val in bus_info:
-                    nic_name = str(utils_misc.get_interface_from_pci_id(val, session))
-                    session.cmd("ip addr flush dev %s" % nic_name)
-                    session.cmd("ip addr add %s/%s dev %s"
-                                % (net_ip, netmask, nic_name))
-                    session.cmd("ip link set %s up" % nic_name)
-                    # Pinging using nic_name is having issue,
-                    # hence replaced with IPAddress
-                    s_ping, o_ping = utils_test.ping(server_ip, count=5,
-                                                     interface=net_ip, timeout=30,
-                                                     session=session)
-                    logging.info(o_ping)
-                    if s_ping != 0:
-                        err_msg = "Ping test fails, error info: '%s'"
-                        test.fail(err_msg % o_ping)
-                    # Each interface should have unique IP
+            # ping to server from each function
+            for val in bus_info:
+                nic_name = str(utils_misc.get_interface_from_pci_id(val, session))
+                session.cmd("ip addr flush dev %s" % nic_name)
+                session.cmd("ip addr add %s/%s dev %s"
+                            % (net_ip, netmask, nic_name))
+                session.cmd("ip link set %s up" % nic_name)
+                # Pinging using nic_name is having issue,
+                # hence replaced with IPAddress
+                s_ping, o_ping = utils_test.ping(server_ip, count=5,
+                                                 interface=net_ip, timeout=30,
+                                                 session=session)
+                logging.info(o_ping)
+                if s_ping != 0:
+                    err_msg = "Ping test fails, error info: '%s'"
+                    test.fail(err_msg % o_ping)
+                # Each interface should have unique IP
+                # For ppc64 arch let's test using one ip only
+                if arch != "ppc64le":
                     net_ip = net_ip + 1
 
-            elif device_type == "STORAGE":
-                # Get the result of "fdisk -l" in guest, and
-                # compare the result with fdisk_list_before.
-                output = session.cmd_output("fdisk -l|grep \"Disk identifier:\"")
-                fdisk_list_after = output.splitlines()
-                if fdisk_list_after == fdisk_list_before:
-                    test.fail("Didn't find the disk attached to guest.")
+        elif device_type == "STORAGE":
+            # Get the result of "fdisk -l" in guest, and
+            # compare the result with fdisk_list_before.
+            output = session.cmd_output("fdisk -l|grep \"Disk identifier:\"")
+            fdisk_list_after = output.splitlines()
+            if fdisk_list_after == fdisk_list_before:
+                test.fail("Didn't find the disk attached to guest.")
 
-            # Execute VM Life-cycle Operation with device pass-through
+    def multiple_reboot(number_of_reboots):
+        for reboot_count in range(number_of_reboots):
+            logging.info("Performing VM Reboot with device pass-through for reboot count : %s", \
+                         reboot_count)
             guest_lifecycle()
+            logging.info("Check device avialablity after VM Reboot for reboot count : %s", \
+                         reboot_count)
+            check_device_status(net_ip, server_ip, netmask)
+
+    try:
+        for itr in range(iteration):
+            check_device_status(net_ip, server_ip, netmask)
+
+        # Execute VM Life-cycle Operation with device pass-through
+        guest_lifecycle()
+
+        # Execute Multiple reboots on VM and check the device persistency
+        multiple_reboot(number_of_reboots)
 
     finally:
         backup_xml.sync()