diff --git a/libvirt/tests/cfg/passthrough/pci/libvirt_pci_passthrough.cfg b/libvirt/tests/cfg/passthrough/pci/libvirt_pci_passthrough.cfg
index f0354e0c17a..cba3a9a29cb 100644
--- a/libvirt/tests/cfg/passthrough/pci/libvirt_pci_passthrough.cfg
+++ b/libvirt/tests/cfg/passthrough/pci/libvirt_pci_passthrough.cfg
@@ -45,3 +45,7 @@
             operation = "suspend"
         - passthrough_shutdown_start:
             operation = "shutdown"
+        - passthrough_multiple_reboots:
+            number_of_reboots = 15
+            operation = "reboot"
+            supported_err = "not supported by the connection driver: virDomainReboot"
diff --git a/libvirt/tests/src/multivm_stress/multivm_stress.py b/libvirt/tests/src/multivm_stress/multivm_stress.py
index cd74b0c122f..9f7107fa78e 100644
--- a/libvirt/tests/src/multivm_stress/multivm_stress.py
+++ b/libvirt/tests/src/multivm_stress/multivm_stress.py
@@ -1,8 +1,11 @@
 import logging as log
+import time

 from virttest import utils_stress
 from virttest import error_context
 from virttest import utils_test
+from virttest import virsh
+from virttest.libvirt_xml import vm_xml


 # Using as lower capital is not the best way to do, but this is just a
@@ -20,38 +23,217 @@ def run(test, params, env):

     guest_stress = params.get("guest_stress", "no") == "yes"
     host_stress = params.get("host_stress", "no") == "yes"
-    stress_events = params.get("stress_events", "reboot")
+    stress_events = params.get("stress_events", "")
+    stress_time = params.get("stress_time", "30")
+    debug_dir = params.get("debug_dir", "/home/")
+    dump_options = params.get("dump_options", "--memory-only --bypass-cache")
     vms = env.get_all_vms()
     vms_uptime_init = {}
+
     if "reboot" not in stress_events:
         for vm in vms:
             vms_uptime_init[vm.name] = vm.uptime()
-    stress_event = utils_stress.VMStressEvents(params, env)
+
     if guest_stress:
+        # change the on_crash value to "preserve" when guest crashes
+        for vm in vms:
+            logging.debug("Setting on_crash to preserve in %s" % vm.name)
+            vmxml = vm_xml.VMXML.new_from_inactive_dumpxml(vm.name)
+            if vm.is_alive():
+                vm.destroy(gracefully=False)
+            vmxml.on_crash = "preserve"
+            vmxml.sync()
+            vm.start()
+
         try:
             utils_test.load_stress("stress_in_vms", params=params, vms=vms)
         except Exception as err:
-            test.fail("Error running stress in vms: %s" % err)
+            test.fail("Error running stress in vms: %s" % str(err))
+
     if host_stress:
         if params.get("host_stress_args", ""):
             params["stress_args"] = params.get("host_stress_args")
         try:
             utils_test.load_stress("stress_on_host", params=params)
         except Exception as err:
-            test.fail("Error running stress in host: %s" % err)
-    try:
-        stress_event.run_threads()
-    finally:
-        stress_event.wait_for_threads()
-        if guest_stress:
-            utils_test.unload_stress("stress_in_vms", params=params, vms=vms)
-        if host_stress:
-            utils_test.unload_stress("stress_on_host", params=params)
-        if "reboot" not in stress_events:
-            fail = False
+            test.fail("Error running stress in host: %s" % str(err))
+
+    stress_timer = int(stress_time)
+    fail = False
+    found_traces = False
+    failed_vms = []
+    login_error_vms = []
+    unexpected_reboot_vms = []
+    error_message = ""
+
+    if guest_stress:
+        # check for any call traces in guest dmesg while stress is running
+        def check_call_traces(vm):
+            """
+            Check the guest's dmesg for call traces.
+
+            :param vm: VM object whose dmesg is inspected
+            :return: True if "Call Trace" or any emerg/alert/crit message
+                     is found; False otherwise or when login fails
+            """
+            nonlocal stress_timer
+            found_trace = False
+            try:
+                retry_login = True
+                retry_times = 0
+                while retry_login:
+                    try:
+                        retry_login = False
+                        session = vm.wait_for_login(timeout=100)
+                        if vm in login_error_vms:
+                            login_error_vms.remove(vm)
+
+                    except Exception:
+                        stress_timer -= 150
+                        if vm in login_error_vms:
+                            return False
+
+                        retry_login = True
+                        retry_times += 1
+                        if retry_times == 3:
+                            logging.debug("Error in logging into %s" % vm.name)
+                            if vm not in login_error_vms:
+                                login_error_vms.append(vm)
+                            return False
+
+                        time.sleep(30)
+                        stress_timer -= 30
+
+                dmesg = session.cmd("dmesg")
+                dmesg_level = session.cmd("dmesg -l emerg,alert,crit")
+                if "Call Trace" in dmesg or len(dmesg_level) >= 1:
+                    logging.debug("Call trace found in %s" % vm.name)
+                    if vm not in failed_vms:
+                        failed_vms.append(vm)
+                    found_trace = True
+                session.close()
+
+            except Exception as err:
+                test.error("Error getting dmesg of %s due to %s" % (vm.name, str(err)))
+            return found_trace
+
+        # run stress for stress_time seconds
+        logging.debug("Sleeping for %s seconds waiting for stress completion" % stress_time)
+        stress_time = int(stress_time)
+
+        # check domstate of vms after stress_time
+        if stress_time < 600:
+            time.sleep(stress_time)
             for vm in vms:
-                if vm.uptime() < vms_uptime_init[vm.name]:
-                    logging.error("Unexpected reboot of VM: %s between test", vm.name)
+                if vm.state() != "running":
+                    logging.debug("%s state is %s" % (vm.name, vm.state()))
+                    failed_vms.append(vm)
                     fail = True
-            if fail:
-                test.fail("Unexpected VM reboot detected")
+                else:
+                    found_traces = check_call_traces(vm)
+                    if found_traces:
+                        fail = True
+                time.sleep(2)
+
+        # check domstate of vms every 10 minutes (600 seconds) during stress_time
+        else:
+            all_failed = False
+            number_of_checks = int(stress_time / 600)
+            delta_time = int(stress_time % 600)
+            for itr in range(number_of_checks):
+                if len(failed_vms) == len(vms) or len(login_error_vms) == len(vms):
+                    all_failed = True
+                    break
+                if stress_timer <= 0:
+                    break
+                time.sleep(600)
+                for vm in vms:
+                    if vm.state() != "running":
+                        logging.debug("%s state is %s" % (vm.name, vm.state()))
+                        if vm not in failed_vms:
+                            failed_vms.append(vm)
+                        fail = True
+                    else:
+                        found_traces = check_call_traces(vm)
+                        if found_traces:
+                            fail = True
+                    time.sleep(3)
+                    stress_timer -= 3
+
+            if delta_time > 0 and stress_timer > 0 and not all_failed:
+                time.sleep(delta_time)
+                for vm in vms:
+                    if vm.state() != "running":
+                        logging.debug("%s state is %s" % (vm.name, vm.state()))
+                        if vm not in failed_vms:
+                            failed_vms.append(vm)
+                        fail = True
+                    else:
+                        found_traces = check_call_traces(vm)
+                        if found_traces:
+                            fail = True
+                    time.sleep(3)
+                    stress_timer -= 3
+
+        # virsh dump the failed vms into debug_dir
+        if fail:
+            for vm in failed_vms:
+                if vm.state() != "shut off":
+                    logging.debug("Dumping %s to debug_dir %s" % (vm.name, debug_dir))
+                    virsh.dump(vm.name, debug_dir+vm.name+"-core", dump_options, ignore_status=False, debug=True)
+                    logging.debug("Successfully dumped %s as %s-core" % (vm.name, vm.name))
+                else:
+                    logging.debug("Cannot dump %s as it is in shut off state" % vm.name)
+            failed_vms_string = ", ".join(vm.name for vm in failed_vms)
+            error_message = "Failure in " + failed_vms_string + " while running stress. "
+
+        if login_error_vms:
+            login_error_vms_string = ", ".join(vm.name for vm in login_error_vms)
+            error_message += "Login error in " + login_error_vms_string + " while running stress. "
+
+        if len(failed_vms) == len(vms) or len(login_error_vms) == len(vms):
+            error_message += "All vms in unstable state while running stress. Couldn't run STRESS EVENTS"
+            test.fail(error_message)
+
+    # run STRESS EVENTS in the remaining stable guests
+    if len(failed_vms) < len(vms) and len(login_error_vms) < len(vms):
+        for vm in failed_vms:
+            if vm in vms:
+                vms.remove(vm)
+        for vm in login_error_vms:
+            if vm in vms:
+                vms.remove(vm)
+
+        if len(vms) == 0:
+            error_message += "All vms in unstable state while running stress. Couldn't run STRESS EVENTS"
+            test.fail(error_message)
+
+        new_vms = ", ".join(vm.name for vm in vms)
+        try:
+            if stress_events != "":
+                logging.debug("Running stress_events in %s" % new_vms)
+                stress_event = utils_stress.VMStressEvents(params, env, vms)
+                stress_event.run_threads()
+                stress_event.wait_for_threads()
+
+            if guest_stress:
+                utils_test.unload_stress("stress_in_vms", params=params, vms=vms)
+
+            if host_stress:
+                utils_test.unload_stress("stress_on_host", params=params)
+
+            if "reboot" not in stress_events:
+                for vm in vms:
+                    if vm.uptime() < vms_uptime_init[vm.name]:
+                        logging.debug("Unexpected reboot of VM: %s between test", vm.name)
+                        unexpected_reboot_vms.append(vm)
+                unexpected_reboot_vms_string = ", ".join(vm.name for vm in unexpected_reboot_vms)
+                if unexpected_reboot_vms:
+                    error_message += "Unexpected reboot of guest(s) " + unexpected_reboot_vms_string + ". "
+
+        except Exception as err:
+            error_message += "Failure running STRESS EVENTS in " + new_vms + " due to " + str(err)
+
+    # check the test status
+    if error_message:
+        test.fail(error_message)
diff --git a/libvirt/tests/src/passthrough/pci/libvirt_pci_passthrough.py b/libvirt/tests/src/passthrough/pci/libvirt_pci_passthrough.py
index 3fc9d9e9e55..f95c5a120c7 100644
--- a/libvirt/tests/src/passthrough/pci/libvirt_pci_passthrough.py
+++ b/libvirt/tests/src/passthrough/pci/libvirt_pci_passthrough.py
@@ -1,5 +1,6 @@
 import logging as log
 import ipaddress
+import platform
 import time

 from virttest import virsh, virt_vm
@@ -39,6 +40,8 @@ def run(test, params, env):
         i) Reboot.
         ii) Suspend/Resume.
         iii) Start/Shutdown.
+    d). Multiple Reboots:
+        1. Checking PCI Device remains persistent across multiple reboots
     """

     def guest_lifecycle():
@@ -93,6 +96,8 @@ def guest_lifecycle():
     sriov = ('yes' == params.get("libvirt_pci_SRIOV", 'no'))
     device_type = params.get("libvirt_pci_device_type", "NIC")
     vm_vfs = int(params.get("number_vfs", 2))
+    number_of_reboots = int(params.get("number_of_reboots", "1"))
+    arch = platform.machine()
     pci_dev = None
     pci_address = None
     bus_info = []
@@ -174,59 +179,96 @@ def guest_lifecycle():
         pci_address = pci_xml.cap.get_address_dict()
         vmxml.add_hostdev(pci_address)

-    try:
-        for itr in range(iteration):
-            logging.info("Currently executing iteration number: '%s'", itr)
-            vmxml.sync()
-            vm.start()
-            session = vm.wait_for_login()
-            # The Network configuration is generic irrespective of PF or SRIOV VF
-            if device_type == "NIC":
-                if sorted(vm.get_pci_devices()) != sorted(nic_list_before):
-                    logging.debug("Adapter passthroughed to guest successfully")
-                else:
-                    test.fail("Passthrough adapter not found in guest.")
-                net_ip = ipaddress.ip_address(net_ip)
-                nic_list_after = vm.get_pci_devices()
-                nic_list = list(set(nic_list_after).difference(set(nic_list_before)))
-                for val in range(len(nic_list)):
-                    bus_info.append(str(nic_list[val]).split(' ', 1)[0])
-                    nic_list[val] = str(nic_list[val]).split(' ', 1)[0][:-2]
-                bus_info.sort()
-                if not sriov:
-                    # check all functions get same iommu group
+    def check_device_status(net_ip, server_ip, netmask):
+        """
+        Start the guest and verify the passed-through device inside it.
+
+        :param net_ip: first IP address assigned to the guest NIC(s)
+        :param server_ip: address pinged from inside the guest
+        :param netmask: netmask applied together with net_ip
+        """
+        logging.info("Currently executing iteration number: '%s'", itr)
+        vmxml.sync()
+        vm.start()
+        session = vm.wait_for_login()
+        # The Network configuration is generic irrespective of PF or SRIOV VF
+        if device_type == "NIC":
+            if sorted(vm.get_pci_devices()) != sorted(nic_list_before):
+                logging.debug("Adapter passthroughed to guest successfully")
+            else:
+                test.fail("Passthrough adapter not found in guest.")
+            net_ip = ipaddress.ip_address(net_ip)
+            nic_list_after = vm.get_pci_devices()
+            nic_list = list(set(nic_list_after).difference(set(nic_list_before)))
+            for val in range(len(nic_list)):
+                bus_info.append(str(nic_list[val]).split(' ', 1)[0])
+                nic_list[val] = str(nic_list[val]).split(' ', 1)[0][:-2]
+            bus_info.sort()
+            if not sriov:
+                # check all functions get same iommu group
+                # arch ppc64 gets different iommu group when attached to VM
+                if arch != "ppc64le":
                     if len(set(nic_list)) != 1:
                         test.fail("Multifunction Device passthroughed but "
                                   "functions are in different iommu group")
-                # ping to server from each function
-                for val in bus_info:
-                    nic_name = str(utils_misc.get_interface_from_pci_id(val, session))
-                    session.cmd("ip addr flush dev %s" % nic_name)
-                    session.cmd("ip addr add %s/%s dev %s"
-                                % (net_ip, netmask, nic_name))
-                    session.cmd("ip link set %s up" % nic_name)
-                    # Pinging using nic_name is having issue,
-                    # hence replaced with IPAddress
-                    s_ping, o_ping = utils_test.ping(server_ip, count=5,
-                                                     interface=net_ip, timeout=30,
-                                                     session=session)
-                    logging.info(o_ping)
-                    if s_ping != 0:
-                        err_msg = "Ping test fails, error info: '%s'"
-                        test.fail(err_msg % o_ping)
-                    # Each interface should have unique IP
+            # ping to server from each function
+            for val in bus_info:
+                nic_name = str(utils_misc.get_interface_from_pci_id(val, session))
+                session.cmd("ip addr flush dev %s" % nic_name)
+                session.cmd("ip addr add %s/%s dev %s"
+                            % (net_ip, netmask, nic_name))
+                session.cmd("ip link set %s up" % nic_name)
+                # Pinging using nic_name is having issue,
+                # hence replaced with IPAddress
+                s_ping, o_ping = utils_test.ping(server_ip, count=5,
+                                                 interface=net_ip, timeout=30,
+                                                 session=session)
+                logging.info(o_ping)
+                if s_ping != 0:
+                    err_msg = "Ping test fails, error info: '%s'"
+                    test.fail(err_msg % o_ping)
+                # Each interface should have unique IP
+                # For ppc64 arch let's test using one ip only
+                if arch != "ppc64le":
                     net_ip = net_ip + 1
-            elif device_type == "STORAGE":
-                # Get the result of "fdisk -l" in guest, and
-                # compare the result with fdisk_list_before.
-                output = session.cmd_output("fdisk -l|grep \"Disk identifier:\"")
-                fdisk_list_after = output.splitlines()
-                if fdisk_list_after == fdisk_list_before:
-                    test.fail("Didn't find the disk attached to guest.")
+        elif device_type == "STORAGE":
+            # Get the result of "fdisk -l" in guest, and
+            # compare the result with fdisk_list_before.
+            output = session.cmd_output("fdisk -l|grep \"Disk identifier:\"")
+            fdisk_list_after = output.splitlines()
+            if fdisk_list_after == fdisk_list_before:
+                test.fail("Didn't find the disk attached to guest.")
+        # The session is only needed for the checks above; close it so that
+        # repeated calls do not leave one guest session open per call.
+        session.close()

-            # Execute VM Life-cycle Operation with device pass-through
+    def multiple_reboot(number_of_reboots):
+        """
+        Repeat the life-cycle operation and the device check.
+
+        :param number_of_reboots: number of operation/check cycles to run
+        """
+        for reboot_count in range(number_of_reboots):
+            logging.info("Performing VM Reboot with device pass-through for reboot count : %s",
+                         reboot_count)
             guest_lifecycle()
+            # NOTE(review): check_device_status() runs vmxml.sync() and
+            # vm.start() again - confirm this is safe if the guest is still
+            # running after guest_lifecycle().
+            logging.info("Check device availability after VM Reboot for reboot count : %s",
+                         reboot_count)
+            check_device_status(net_ip, server_ip, netmask)
+
+    try:
+        for itr in range(iteration):
+            check_device_status(net_ip, server_ip, netmask)
+
+            # Execute VM Life-cycle Operation with device pass-through
+            guest_lifecycle()
+
+            # Execute Multiple reboots on VM and check the device persistency
+            multiple_reboot(number_of_reboots)

     finally:
         backup_xml.sync()
