lop-devops · TasmiyaNalatwad · Feb 14, 2025 · Mar 5, 2025
diff --git a/libvirt/tests/cfg/passthrough/pci/libvirt_pci_passthrough.cfg b/libvirt/tests/cfg/passthrough/pci/libvirt_pci_passthrough.cfg
@@ -45,3 +45,7 @@
             operation = "suspend"
         - passthrough_shutdown_start:
             operation = "shutdown"
+        - passthrough_multiple_reboots:
+            number_of_reboots = 15
+            operation = "reboot"
+            supported_err = "not supported by the connection driver: virDomainReboot"
diff --git a/libvirt/tests/src/multivm_stress/multivm_stress.py b/libvirt/tests/src/multivm_stress/multivm_stress.py
@@ -1,8 +1,11 @@
 import logging as log
+import time
 
 from virttest import utils_stress
 from virttest import error_context
 from virttest import utils_test
+from virttest import virsh
+from virttest.libvirt_xml import vm_xml
 
 
 # Using as lower capital is not the best way to do, but this is just a
@@ -20,38 +23,210 @@ def run(test, params, env):
 
     guest_stress = params.get("guest_stress", "no") == "yes"
     host_stress = params.get("host_stress", "no") == "yes"
-    stress_events = params.get("stress_events", "reboot")
+    stress_events = params.get("stress_events", "")
+    stress_time = params.get("stress_time", "30")
+    debug_dir = params.get("debug_dir", "/home/")
+    dump_options = params.get("dump_options", "--memory-only --bypass-cache")
     vms = env.get_all_vms()
     vms_uptime_init = {}
+
     if "reboot" not in stress_events:
         for vm in vms:
             vms_uptime_init[vm.name] = vm.uptime()
-    stress_event = utils_stress.VMStressEvents(params, env)
+
     if guest_stress:
+        # set on_crash to "preserve" up front so a crashed guest is kept around for the later virsh dump
+        for vm in vms:
+            logging.debug("Setting on_crash to preserve in %s" % vm.name)
+            vmxml = vm_xml.VMXML.new_from_inactive_dumpxml(vm.name)
+            if vm.is_alive():
+                vm.destroy(gracefully=False)
+            vmxml.on_crash = "preserve"
+            vmxml.sync()
+            vm.start()
+
         try:
             utils_test.load_stress("stress_in_vms", params=params, vms=vms)
         except Exception as err:
-            test.fail("Error running stress in vms: %s" % err)
+            test.fail("Error running stress in vms: %s" % str(err))
+
     if host_stress:
         if params.get("host_stress_args", ""):
             params["stress_args"] = params.get("host_stress_args")
         try:
             utils_test.load_stress("stress_on_host", params=params)
         except Exception as err:
-            test.fail("Error running stress in host: %s" % err)
-    try:
-        stress_event.run_threads()
-    finally:
-        stress_event.wait_for_threads()
-        if guest_stress:
-            utils_test.unload_stress("stress_in_vms", params=params, vms=vms)
-        if host_stress:
-            utils_test.unload_stress("stress_on_host", params=params)
-        if "reboot" not in stress_events:
-            fail = False
+            test.fail("Error running stress in host: %s" % str(err))
+
+    stress_timer = int(stress_time)
+    fail = False
+    found_traces = False
+    failed_vms = []
+    login_error_vms = []
+    unexpected_reboot_vms = []
+    error_message = ""
+
+    if guest_stress:
+        # check for any call traces in guest dmesg while stress is running
+        def check_call_traces(vm):
+            """Return True if the guest's dmesg reports a call trace or critical message.
+
+            Updates failed_vms / login_error_vms and charges the time spent on
+            failed login attempts to stress_timer; returns False on login failure.
+            """
+            nonlocal stress_timer
+            found_trace = False
+            session = None
+            try:
+                for attempt in range(1, 4):
+                    try:
+                        session = vm.wait_for_login(timeout=100)
+                        break
+                    except Exception:
+                        stress_timer -= 150
+                        # a guest already recorded as unreachable is not retried
+                        if vm in login_error_vms:
+                            return False
+                        if attempt == 3:
+                            logging.debug("Error in logging into %s", vm.name)
+                            login_error_vms.append(vm)
+                            return False
+                        time.sleep(30)
+                        stress_timer -= 30
+                if vm in login_error_vms:
+                    login_error_vms.remove(vm)
+                dmesg = session.cmd("dmesg")
+                dmesg_level = session.cmd("dmesg -l emerg,alert,crit")
+                if "Call Trace" in dmesg or dmesg_level:
+                    logging.debug("Call trace found in %s", vm.name)
+                    if vm not in failed_vms:
+                        failed_vms.append(vm)
+                    found_trace = True
+            except Exception as err:
+                test.error("Error getting dmesg of %s due to %s" % (vm.name, str(err)))
+            finally:
+                # close the session even when a dmesg command fails
+                if session is not None:
+                    session.close()
+            return found_trace
+
+        # run stress for stress_time seconds
+        logging.debug("Sleeping for %s seconds waiting for stress completion" % stress_time)
+        stress_time = int(stress_time)
+
+        # check domstate of vms after stress_time
+        if stress_time < 600:
+            time.sleep(stress_time)
             for vm in vms:
-                if vm.uptime() < vms_uptime_init[vm.name]:
-                    logging.error("Unexpected reboot of VM: %s between test", vm.name)
+                if vm.state() != "running":
+                    logging.debug("%s state is %s" % (vm.name, vm.state()))
+                    failed_vms.append(vm)
                     fail = True
-            if fail:
-                test.fail("Unexpected VM reboot detected")
+                else:
+                    found_traces = check_call_traces(vm)
+                    if found_traces:
+                        fail = True
+                    time.sleep(2)
+
+        # check domstate of vms every 10 minutes (600 s) during stress_time
+        else:
+            all_failed = False
+            number_of_checks = int(stress_time / 600)
+            delta_time = int(stress_time % 600)
+            for itr in range(number_of_checks):
+                if len(failed_vms) == len(vms) or len(login_error_vms) == len(vms):
+                    all_failed = True
+                    break
+                if stress_timer <= 0:
+                    break
+                time.sleep(600)
+                for vm in vms:
+                    if vm.state() != "running":
+                        logging.debug("%s state is %s" % (vm.name, vm.state()))
+                        if vm not in failed_vms:
+                            failed_vms.append(vm)
+                        fail = True
+                    else:
+                        found_traces = check_call_traces(vm)
+                        if found_traces:
+                            fail = True
+                        time.sleep(3)
+                        stress_timer -= 3
+
+            if delta_time > 0 and stress_timer > 0 and not all_failed:
+                time.sleep(delta_time)
+                for vm in vms:
+                    if vm.state() != "running":
+                        logging.debug("%s state is %s" % (vm.name, vm.state()))
+                        if vm not in failed_vms:
+                            failed_vms.append(vm)
+                        fail = True
+                    else:
+                        found_traces = check_call_traces(vm)
+                        if found_traces:
+                            fail = True
+                        time.sleep(3)
+                        stress_timer -= 3
+
+        # virsh dump the failed vms into debug_dir
+        if fail:
+            for vm in failed_vms:
+                if vm.state() != "shut off":
+                    logging.debug("Dumping %s to debug_dir %s" % (vm.name, debug_dir))
+                    virsh.dump(vm.name, debug_dir+vm.name+"-core", dump_options, ignore_status=False, debug=True)
+                    logging.debug("Successfully dumped %s as %s-core" % (vm.name, vm.name))
+                else:
+                    logging.debug("Cannot dump %s as it is in shut off state" % vm.name)
+            failed_vms_string = ", ".join(vm.name for vm in failed_vms)
+            error_message = "Failure in " + failed_vms_string + " while running stress. "
+
+        if login_error_vms:
+            login_error_vms_string = ", ".join(vm.name for vm in login_error_vms)
+            error_message += "Login error in " + login_error_vms_string + " while running stress. "
+
+        if len(failed_vms) == len(vms) or len(login_error_vms) == len(vms):
+            error_message += "All vms in unstable state while running stress. Couldn't run STRESS EVENTS"
+            test.fail(error_message)
+
+    # run STRESS EVENTS in the remaining stable guests
+    if len(failed_vms) < len(vms) and len(login_error_vms) < len(vms):
+        for vm in failed_vms:
+            if vm in vms:
+                vms.remove(vm)
+        for vm in login_error_vms:
+            if vm in vms:
+                vms.remove(vm)
+
+        if len(vms) == 0:
+            error_message += "All vms in unstable state while running stress. Couldn't run STRESS EVENTS"
+            test.fail(error_message)
+
+        new_vms = ", ".join(vm.name for vm in vms)
+        try:
+            if stress_events != "":
+                logging.debug("Running stress_events in %s" % new_vms)
+                stress_event = utils_stress.VMStressEvents(params, env, vms)
+                stress_event.run_threads()
+                stress_event.wait_for_threads()
+
+            if guest_stress:
+                utils_test.unload_stress("stress_in_vms", params=params, vms=vms)
+
+            if host_stress:
+                utils_test.unload_stress("stress_on_host", params=params)
+
+            if "reboot" not in stress_events:
+                for vm in vms:
+                    if vm.uptime() < vms_uptime_init[vm.name]:
+                        logging.debug("Unexpected reboot of VM: %s between test", vm.name)
+                        unexpected_reboot_vms.append(vm)
+                unexpected_reboot_vms_string = ", ".join(vm.name for vm in unexpected_reboot_vms)
+                if unexpected_reboot_vms:
+                    error_message += "Unexpected reboot of guest(s) " + unexpected_reboot_vms_string + ". "
+
+        except Exception as err:
+            error_message += "Failure running STRESS EVENTS in " + new_vms + " due to" + str(err)
+
+    # check the test status
+    if error_message:
+        test.fail(error_message)
diff --git a/libvirt/tests/src/passthrough/pci/libvirt_pci_passthrough.py b/libvirt/tests/src/passthrough/pci/libvirt_pci_passthrough.py
@@ -1,5 +1,6 @@
 import logging as log
 import ipaddress
+import platform
 import time
 
 from virttest import virsh, virt_vm
@@ -39,6 +40,8 @@ def run(test, params, env):
             i) Reboot.
             ii) Suspend/Resume.
             iii) Start/Shutdown.
+    d). Multiple Reboots:
+        1. Checking PCI Device remains persistent across multiple reboots
     """
 
     def guest_lifecycle():
@@ -93,6 +96,8 @@ def guest_lifecycle():
     sriov = ('yes' == params.get("libvirt_pci_SRIOV", 'no'))
     device_type = params.get("libvirt_pci_device_type", "NIC")
     vm_vfs = int(params.get("number_vfs", 2))
+    number_of_reboots = int(params.get("number_of_reboots", "1"))
+    arch = platform.machine()
     pci_dev = None
     pci_address = None
     bus_info = []
@@ -174,59 +179,78 @@ def guest_lifecycle():
         pci_address = pci_xml.cap.get_address_dict()
         vmxml.add_hostdev(pci_address)
 
-    try:
-        for itr in range(iteration):
-            logging.info("Currently executing iteration number: '%s'", itr)
-            vmxml.sync()
-            vm.start()
-            session = vm.wait_for_login()
-            # The Network configuration is generic irrespective of PF or SRIOV VF
-            if device_type == "NIC":
-                if sorted(vm.get_pci_devices()) != sorted(nic_list_before):
-                    logging.debug("Adapter passthroughed to guest successfully")
-                else:
-                    test.fail("Passthrough adapter not found in guest.")
-                net_ip = ipaddress.ip_address(net_ip)
-                nic_list_after = vm.get_pci_devices()
-                nic_list = list(set(nic_list_after).difference(set(nic_list_before)))
-                for val in range(len(nic_list)):
-                    bus_info.append(str(nic_list[val]).split(' ', 1)[0])
-                    nic_list[val] = str(nic_list[val]).split(' ', 1)[0][:-2]
-                bus_info.sort()
-                if not sriov:
-                    # check all functions get same iommu group
+    def check_device_status(net_ip, server_ip, netmask):
+        logging.info("Currently executing iteration number: '%s'", itr)
+        vmxml.sync()
+        vm.start()
+        session = vm.wait_for_login()
+        # The Network configuration is generic irrespective of PF or SRIOV VF
+        if device_type == "NIC":
+            if sorted(vm.get_pci_devices()) != sorted(nic_list_before):
+                logging.debug("Adapter passthroughed to guest successfully")
+            else:
+                test.fail("Passthrough adapter not found in guest.")
+            net_ip = ipaddress.ip_address(net_ip)
+            nic_list_after = vm.get_pci_devices()
+            nic_list = list(set(nic_list_after).difference(set(nic_list_before)))
+            for val in range(len(nic_list)):
+                bus_info.append(str(nic_list[val]).split(' ', 1)[0])
+                nic_list[val] = str(nic_list[val]).split(' ', 1)[0][:-2]
+            bus_info.sort()
+            if not sriov:
+                # check all functions get same iommu group
+                # on ppc64le the functions get different iommu groups when attached to a VM
+                if arch != "ppc64le":
                     if len(set(nic_list)) != 1:
                         test.fail("Multifunction Device passthroughed but "
                                   "functions are in different iommu group")
-                # ping to server from each function
-                for val in bus_info:
-                    nic_name = str(utils_misc.get_interface_from_pci_id(val, session))
-                    session.cmd("ip addr flush dev %s" % nic_name)
-                    session.cmd("ip addr add %s/%s dev %s"
-                                % (net_ip, netmask, nic_name))
-                    session.cmd("ip link set %s up" % nic_name)
-                    # Pinging using nic_name is having issue,
-                    # hence replaced with IPAddress
-                    s_ping, o_ping = utils_test.ping(server_ip, count=5,
-                                                     interface=net_ip, timeout=30,
-                                                     session=session)
-                    logging.info(o_ping)
-                    if s_ping != 0:
-                        err_msg = "Ping test fails, error info: '%s'"
-                        test.fail(err_msg % o_ping)
-                    # Each interface should have unique IP
+            # ping to server from each function
+            for val in bus_info:
+                nic_name = str(utils_misc.get_interface_from_pci_id(val, session))
+                session.cmd("ip addr flush dev %s" % nic_name)
+                session.cmd("ip addr add %s/%s dev %s"
+                            % (net_ip, netmask, nic_name))
+                session.cmd("ip link set %s up" % nic_name)
+                # Pinging using nic_name is having issue,
+                # hence replaced with IPAddress
+                s_ping, o_ping = utils_test.ping(server_ip, count=5,
+                                                 interface=net_ip, timeout=30,
+                                                 session=session)
+                logging.info(o_ping)
+                if s_ping != 0:
+                    err_msg = "Ping test fails, error info: '%s'"
+                    test.fail(err_msg % o_ping)
+                # Each interface should have unique IP
+                # For ppc64 arch let's test using one ip only
+                if arch != "ppc64le":
                     net_ip = net_ip + 1
 
-            elif device_type == "STORAGE":
-                # Get the result of "fdisk -l" in guest, and
-                # compare the result with fdisk_list_before.
-                output = session.cmd_output("fdisk -l|grep \"Disk identifier:\"")
-                fdisk_list_after = output.splitlines()
-                if fdisk_list_after == fdisk_list_before:
-                    test.fail("Didn't find the disk attached to guest.")
+        elif device_type == "STORAGE":
+            # Get the result of "fdisk -l" in guest, and
+            # compare the result with fdisk_list_before.
+            output = session.cmd_output("fdisk -l|grep \"Disk identifier:\"")
+            fdisk_list_after = output.splitlines()
+            if fdisk_list_after == fdisk_list_before:
+                test.fail("Didn't find the disk attached to guest.")
 
-            # Execute VM Life-cycle Operation with device pass-through
+    def multiple_reboot(number_of_reboots):
+        """Run guest_lifecycle() number_of_reboots times, re-checking the device after each run."""
+        for reboot_count in range(number_of_reboots):
+            logging.info("Performing VM Reboot with device pass-through "
+                         "for reboot count : %s", reboot_count)
             guest_lifecycle()
+            logging.info("Check device availability after VM Reboot for reboot count : %s", reboot_count)
+            check_device_status(net_ip, server_ip, netmask)
+
+    try:
+        for itr in range(iteration):
+            check_device_status(net_ip, server_ip, netmask)
+
+        # Execute VM Life-cycle Operation with device pass-through
+        guest_lifecycle()
+
+        # Execute Multiple reboots on VM and check the device persistency
+        multiple_reboot(number_of_reboots)
 
     finally:
         backup_xml.sync()