diff --git a/libvirt/tests/cfg/passthrough/pci/libvirt_pci_passthrough_eeh.cfg b/libvirt/tests/cfg/passthrough/pci/libvirt_pci_passthrough_eeh.cfg
new file mode 100644
index 00000000000..9783ae069ca
--- /dev/null
+++ b/libvirt/tests/cfg/passthrough/pci/libvirt_pci_passthrough_eeh.cfg
@@ -0,0 +1,51 @@
+- libvirt_pci_passthrough_eeh:
+    virt_test_type = libvirt
+    provider = io-github-autotest-libvirt
+    type = libvirt_pci_passthrough_eeh
+    iteration_val = 2
+    no s390-virtio
+    variants:
+        - Normal_passthrough:
+            libvirt_pci_SRIOV = no
+            # Removing SRIOV for now, as we currently don't test it
+            # - SRIOV:
+            #     libvirt_pci_SRIOV = yes
+            #     vf_filter = "Virtual Function"
+            # Enter the number of Virtual Functions to be created
+            # for each Physical Function.
+            #     number_vfs = 4
+    variants:
+        - PCI_NIC:
+            libvirt_pci_device_type = "NIC"
+            # Please enter the PCI device label for a network
+            # device. We will attach this device to the guest;
+            # it then becomes unavailable on the host.
+            # E.g: 0000:05:00.0
+            libvirt_pci_net_dev_label = "ENTER.YOUR.PCI.LABEL"
+            libvirt_pci_net_dev_name = "ENTER.YOUR.DEVICE.NAME"
+            # Please enter the IP used by the device you are
+            # going to attach to the guest.
+            libvirt_pci_net_ip = "ENTER.YOUR.IP"
+            # Please enter an IP reachable from the net device.
+            # We ping it after attaching the PCI device to the
+            # guest to verify the device works well in the guest.
+            libvirt_pci_server_ip = "ENTER.YOUR.SERVER.IP"
+            # Enter the netmask in CIDR notation
+            libvirt_pci_net_mask = "ENTER.YOUR.NETMASK"
+            # Enter the timeout value in seconds
+            timeout = "ENTER.YOUR.TIMEOUT"
+            model = "ENTER.YOUR.DEVICE.MODEL"
+            index = "1"
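+            # For reference, a filled-in NIC variant might look like
+            # this (illustrative placeholder values, not defaults):
+            #     libvirt_pci_net_dev_label = "0001:01:00.0"
+            #     libvirt_pci_net_dev_name = "enP1p1s0f0"
+            #     libvirt_pci_net_ip = "192.168.100.10"
+            #     libvirt_pci_server_ip = "192.168.100.1"
+            #     libvirt_pci_net_mask = "24"
+            #     timeout = "600"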
+        - PCI_STORAGE:
+            libvirt_pci_device_type = "STORAGE"
+            # Please enter the PCI device label for a storage
+            # device. We will attach this device to the guest.
+            # E.g: pci_0000_0d_00_0
+            libvirt_pci_storage_dev_label = "ENTER.YOUR.PCI.LABEL"
+    variants:
+        - passthrough_eeh_guest:
+            eeh_guest = "yes"
+        - passthrough_suspend_host:
+            eeh_host = "yes"
diff --git a/libvirt/tests/src/multivm_stress/multivm_stress.py b/libvirt/tests/src/multivm_stress/multivm_stress.py
index cd74b0c122f..9f7107fa78e 100644
--- a/libvirt/tests/src/multivm_stress/multivm_stress.py
+++ b/libvirt/tests/src/multivm_stress/multivm_stress.py
@@ -1,8 +1,11 @@
 import logging as log
+import time
 
 from virttest import utils_stress
 from virttest import error_context
 from virttest import utils_test
+from virttest import virsh
+from virttest.libvirt_xml import vm_xml
 
 
 # Using as lower capital is not the best way to do, but this is just a
@@ -20,38 +23,210 @@ def run(test, params, env):
     guest_stress = params.get("guest_stress", "no") == "yes"
     host_stress = params.get("host_stress", "no") == "yes"
-    stress_events = params.get("stress_events", "reboot")
+    stress_events = params.get("stress_events", "")
+    stress_time = params.get("stress_time", "30")
+    debug_dir = params.get("debug_dir", "/home/")
+    dump_options = params.get("dump_options", "--memory-only --bypass-cache")
     vms = env.get_all_vms()
     vms_uptime_init = {}
+
     if "reboot" not in stress_events:
         for vm in vms:
             vms_uptime_init[vm.name] = vm.uptime()
-    stress_event = utils_stress.VMStressEvents(params, env)
+
+    if guest_stress:
+        # Set on_crash to "preserve" so a crashed guest is kept
+        # around for debugging instead of being destroyed.
+        for vm in vms:
+            logging.debug("Setting on_crash to preserve in %s" % vm.name)
+            vmxml = vm_xml.VMXML.new_from_inactive_dumpxml(vm.name)
+            if vm.is_alive():
+                vm.destroy(gracefully=False)
+            vmxml.on_crash = "preserve"
+            vmxml.sync()
+            vm.start()
+
     try:
         utils_test.load_stress("stress_in_vms", params=params, vms=vms)
     except Exception as err:
-        test.fail("Error running stress in vms: %s" % err)
+        test.fail("Error running stress in vms: %s" % str(err))
+
     if host_stress:
         if params.get("host_stress_args", ""):
             params["stress_args"] = params.get("host_stress_args")
         try:
             utils_test.load_stress("stress_on_host", params=params)
         except Exception as err:
-            test.fail("Error running stress in host: %s" % err)
-    try:
-        stress_event.run_threads()
-    finally:
-        stress_event.wait_for_threads()
-        if guest_stress:
-            utils_test.unload_stress("stress_in_vms", params=params, vms=vms)
-        if host_stress:
-            utils_test.unload_stress("stress_on_host", params=params)
-        if "reboot" not in stress_events:
-            fail = False
-            for vm in vms:
-                if vm.uptime() < vms_uptime_init[vm.name]:
-                    logging.error("Unexpected reboot of VM: %s between test", vm.name)
-                    fail = True
-            if fail:
-                test.fail("Unexpected VM reboot detected")
+            test.fail("Error running stress on host: %s" % str(err))
+
+    stress_timer = int(stress_time)
+    fail = False
+    found_traces = False
+    failed_vms = []
+    login_error_vms = []
+    unexpected_reboot_vms = []
+    error_message = ""
+
+    if guest_stress:
+        # Check for any call traces in the guest dmesg while stress
+        # is running
+        def check_call_traces(vm):
+            nonlocal stress_timer
+            found_trace = False
+            try:
+                retry_login = True
+                retry_times = 0
+                while retry_login:
+                    try:
+                        retry_login = False
+                        session = vm.wait_for_login(timeout=100)
+                        if vm in login_error_vms:
+                            login_error_vms.remove(vm)
+                    except Exception:
+                        # Account for the time spent on the failed login
+                        stress_timer -= 150
+                        if vm in login_error_vms:
+                            return False
+                        retry_login = True
+                        retry_times += 1
+                        if retry_times == 3:
+                            logging.debug("Error logging into %s" % vm.name)
+                            if vm not in login_error_vms:
+                                login_error_vms.append(vm)
+                            return False
+                        time.sleep(30)
+                        stress_timer -= 30
+
+                dmesg = session.cmd("dmesg")
+                dmesg_level = session.cmd("dmesg -l emerg,alert,crit")
+                if "Call Trace" in dmesg or len(dmesg_level) >= 1:
+                    logging.debug("Call trace found in %s" % vm.name)
+                    if vm not in failed_vms:
+                        failed_vms.append(vm)
+                    found_trace = True
+                session.close()
+
+            except Exception as err:
+                test.error("Error getting dmesg of %s due to %s"
+                           % (vm.name, str(err)))
+            return found_trace
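+
+        # Note on the failure criteria above (standard dmesg behaviour,
+        # not specific to this test): `dmesg -l emerg,alert,crit` prints
+        # only messages at the emergency/alert/critical log levels, so
+        # any output from it at all is treated as a guest failure, in
+        # addition to an explicit "Call Trace" marker anywhere in the
+        # full dmesg output.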
in %s" % vm.name) + if vm not in failed_vms: + failed_vms.append(vm) + found_trace = True + session.close() + + except Exception as err: + test.error("Error getting dmesg of %s due to %s" % (vm.name, str(err))) + return found_trace + + # run stress for stress_time seconds + logging.debug("Sleeping for %s seconds waiting for stress completion" % stress_time) + stress_time = int(stress_time) + + # check domstate of vms after stress_time + if stress_time < 600: + time.sleep(stress_time) for vm in vms: - if vm.uptime() < vms_uptime_init[vm.name]: - logging.error("Unexpected reboot of VM: %s between test", vm.name) + if vm.state() != "running": + logging.debug("%s state is %s" % (vm.name, vm.state())) + failed_vms.append(vm) fail = True - if fail: - test.fail("Unexpected VM reboot detected") + else: + found_traces = check_call_traces(vm) + if found_traces: + fail = True + time.sleep(2) + + # check domstate of vms for every 5 minutes during stress_time + else: + all_failed = False + number_of_checks = int(stress_time / 600) + delta_time = int(stress_time % 600) + for itr in range(number_of_checks): + if len(failed_vms) == len(vms) or len(login_error_vms) == len(vms): + all_failed = True + break + if stress_timer <= 0: + break + time.sleep(600) + for vm in vms: + if vm.state() != "running": + logging.debug("%s state is %s" % (vm.name, vm.state())) + if vm not in failed_vms: + failed_vms.append(vm) + fail = True + else: + found_traces = check_call_traces(vm) + if found_traces: + fail = True + time.sleep(3) + stress_timer -= 3 + + if delta_time > 0 and stress_timer > 0 and not all_failed: + time.sleep(delta_time) + for vm in vms: + if vm.state() != "running": + logging.debug("%s state is %s" % (vm.name, vm.state())) + if vm not in failed_vms: + failed_vms.append(vm) + fail = True + else: + found_traces = check_call_traces(vm) + if found_traces: + fail = True + time.sleep(3) + stress_timer -= 3 + + # virsh dump the failed vms into debug_dir + if fail: + for vm in failed_vms: + if vm.state() != "shut off": + logging.debug("Dumping %s to debug_dir %s" % (vm.name, debug_dir)) + virsh.dump(vm.name, debug_dir+vm.name+"-core", dump_options, ignore_status=False, debug=True) + logging.debug("Successfully dumped %s as %s-core" % (vm.name, vm.name)) + else: + logging.debug("Cannot dump %s as it is in shut off state" % vm.name) + failed_vms_string = ", ".join(vm.name for vm in failed_vms) + error_message = "Failure in " + failed_vms_string + " while running stress. " + + if login_error_vms: + login_error_vms_string = ", ".join(vm.name for vm in login_error_vms) + error_message += "Login error in " + login_error_vms_string + " while running stress. " + + if len(failed_vms) == len(vms) or len(login_error_vms) == len(vms): + error_message += "All vms in unstable state while running stress. Couldn't run STRESS EVENTS" + test.fail(error_message) + + # run STRESS EVENTS in the remaining stable guests + if len(failed_vms) < len(vms) and len(login_error_vms) < len(vms): + for vm in failed_vms: + if vm in vms: + vms.remove(vm) + for vm in login_error_vms: + if vm in vms: + vms.remove(vm) + + if len(vms) == 0: + error_message += "All vms in unstable state while running stress. 
+    # virsh dump the failed vms into debug_dir
+    if fail:
+        for vm in failed_vms:
+            if vm.state() != "shut off":
+                logging.debug("Dumping %s to debug_dir %s" % (vm.name, debug_dir))
+                virsh.dump(vm.name, debug_dir + vm.name + "-core", dump_options,
+                           ignore_status=False, debug=True)
+                logging.debug("Successfully dumped %s as %s-core" % (vm.name, vm.name))
+            else:
+                logging.debug("Cannot dump %s as it is in shut off state" % vm.name)
+        failed_vms_string = ", ".join(vm.name for vm in failed_vms)
+        error_message = "Failure in " + failed_vms_string + " while running stress. "
+
+    if login_error_vms:
+        login_error_vms_string = ", ".join(vm.name for vm in login_error_vms)
+        error_message += "Login error in " + login_error_vms_string + " while running stress. "
+
+    if len(failed_vms) == len(vms) or len(login_error_vms) == len(vms):
+        error_message += "All vms in unstable state while running stress. Couldn't run STRESS EVENTS"
+        test.fail(error_message)
+
+    # Run STRESS EVENTS in the remaining stable guests
+    if len(failed_vms) < len(vms) and len(login_error_vms) < len(vms):
+        for vm in failed_vms:
+            if vm in vms:
+                vms.remove(vm)
+        for vm in login_error_vms:
+            if vm in vms:
+                vms.remove(vm)
+
+        if len(vms) == 0:
+            error_message += "All vms in unstable state while running stress. Couldn't run STRESS EVENTS"
+            test.fail(error_message)
+
+        new_vms = ", ".join(vm.name for vm in vms)
+        try:
+            if stress_events != "":
+                logging.debug("Running stress_events in %s" % new_vms)
+                stress_event = utils_stress.VMStressEvents(params, env, vms)
+                stress_event.run_threads()
+                stress_event.wait_for_threads()
+
+            if guest_stress:
+                utils_test.unload_stress("stress_in_vms", params=params, vms=vms)
+
+            if host_stress:
+                utils_test.unload_stress("stress_on_host", params=params)
+
+            if "reboot" not in stress_events:
+                for vm in vms:
+                    if vm.uptime() < vms_uptime_init[vm.name]:
+                        logging.debug("Unexpected reboot of VM %s during the test", vm.name)
+                        unexpected_reboot_vms.append(vm)
+                unexpected_reboot_vms_string = ", ".join(vm.name for vm in unexpected_reboot_vms)
+                if unexpected_reboot_vms:
+                    error_message += "Unexpected reboot of guest(s) " + unexpected_reboot_vms_string + ". "
+
+        except Exception as err:
+            error_message += "Failure running STRESS EVENTS in " + new_vms + " due to " + str(err)
+
+    # Check the test status
+    if error_message:
+        test.fail(error_message)
\ No newline at end of file
diff --git a/libvirt/tests/src/passthrough/pci/libvirt_pci_passthrough_eeh.py b/libvirt/tests/src/passthrough/pci/libvirt_pci_passthrough_eeh.py
new file mode 100644
index 00000000000..82c2d862d82
--- /dev/null
+++ b/libvirt/tests/src/passthrough/pci/libvirt_pci_passthrough_eeh.py
@@ -0,0 +1,376 @@
+#!/usr/bin/env python
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# Author: Tasmiya.Nalatwad
+# EEH test on PCI passthrough devices
+
+import logging as log
+import platform
+import time
+
+import aexpect
+
+from avocado.utils import process
+
+from virttest import libvirt_version
+from virttest import utils_misc
+from virttest import utils_package
+from virttest import utils_test
+from virttest import virsh
+from virttest.libvirt_xml.vm_xml import VMXML
+from virttest.libvirt_xml.nodedev_xml import NodedevXML
+from virttest.libvirt_xml.devices.controller import Controller
+from virttest.test_setup import PciAssignable
+
+# Using as lower capital is not the best way to do, but this is just a
+# workaround to avoid changing the entire file.
+logging = log.getLogger('avocado.' + __name__)
+
+
+def run(test, params, env):
+    """
+    Test EEH functionality on a PCI passthrough device
+    of a libvirt guest.
+
+    a). NIC:
+        1. Get params.
+        2. Get the pci device functions.
+        3. Start the guest.
+        4. Prepare the device xml to be attached.
+        5. Hotplug the device.
+        6. Check whether the device was hotplugged or not.
+        7. Ping server_ip from the guest.
+        8. Get the location code of the pci device.
+        9. Trigger error injection on the pci device.
+        10. Check error injection recovery on the device.
+        11. Check device availability inside the guest.
+    b). STORAGE:
+        1. Get params.
+        2. Get the pci device functions.
+        3. Start the guest.
+        4. Prepare the device xml to be attached.
+        5. Hotplug the device.
+        6. Check whether the device was hotplugged or not.
+        7. Check the STORAGE device inside the guest.
+        8. Get the location code of the pci device.
+        9. Mount the device (/dev/nvme*) on /mnt.
+        10. Trigger EEH on the pci device.
+        11. Check EEH recovery on the device.
+        12. Check device availability inside the guest.
+    """
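+    # Background: EEH (Enhanced Error Handling) is the PCI error
+    # recovery mechanism of IBM Power systems. On a PCI error the slot
+    # is frozen and the kernel tries to reset and recover the device.
+    # Here errors are injected deliberately with errinjct (from
+    # powerpc-utils) and dmesg is watched for the freeze and recovery
+    # messages.
+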
+    # Get the params from params
+    vm_name = params.get("main_vm")
+    vm = env.get_vm(vm_name)
+    device_name = params.get("libvirt_pci_net_dev_name", "ENTER_YOUR.DEV.NAME")
+    device_type = params.get("libvirt_pci_device_type", "NIC")
+    pci_id = params.get("libvirt_pci_net_dev_label", "ENTER_YOUR.DEV.LABEL")
+    net_ip = params.get("libvirt_pci_net_ip", "ENTER_YOUR.IP")
+    server_ip = params.get("libvirt_pci_server_ip",
+                           "ENTER_YOUR.SERVER.IP")
+    netmask = params.get("libvirt_pci_net_mask", "ENTER_YOUR.MASK")
+    timeout = int(params.get("timeout", "ENTER_YOUR.TIMEOUT.VALUE"))
+
+    vmxml = VMXML.new_from_inactive_dumpxml(vm_name)
+    backup_xml = vmxml.copy()
+    devices = vmxml.get_devices()
+    pci_devs = []
+
+    cntlr_index = params.get("index", "1")
+    cntlr_model = params.get("model", "pci-root")
+    cntlr_type = "pci"
+
+    # Add a pci controller with the next free index
+    controllers = vmxml.get_controllers(cntlr_type, cntlr_model)
+    index_list = []
+    for controller in controllers:
+        index_value = controller.get("index")
+        if index_value is not None:
+            index_list.append(int(index_value))
+    if index_list:
+        next_index = max(index_list) + 1
+    else:
+        next_index = int(cntlr_index)
+
+    controller = Controller("controller")
+    controller.type = cntlr_type
+    controller.index = str(next_index)
+    controller.model = cntlr_model
+
+    devices.append(controller)
+    vmxml.set_devices(devices)
+    vmxml.sync()
+    if not vm.is_alive():
+        vm.start()
+    session = vm.wait_for_login()
+    if not utils_package.package_install(["ppc64-diag",
+                                          "librtas", "powerpc-utils"],
+                                         session, 360):
+        test.cancel('Failed to install dependencies')
+
+    output = session.cmd_output("ip link")
+    logging.debug("Checking for output - %s", output)
+    nic_list_before = str(output.splitlines())
+    logging.debug("nic_list before hotplug %s", nic_list_before)
+    obj = PciAssignable()
+    # Get the ids of all functions of the device
+    logging.debug("PCI_ID: %s", pci_id)
+    pci_ids = obj.get_same_group_devs(pci_id)
+    for val in pci_ids:
+        temp = val.replace(":", "_")
+        pci_devs.extend(["pci_" + temp])
+    pci_val = pci_devs[0].replace(".", "_")
+    if device_type == "NIC":
+        if not vm.is_alive():
+            vm.start()
+        session = vm.wait_for_login()
+        nic_list_before = vm.get_pci_devices()
+        pci_xml = NodedevXML.new_from_dumpxml(pci_val)
+        pci_address = pci_xml.cap.get_address_dict()
+        dev = VMXML.get_device_class('hostdev')()
+        dev.mode = 'subsystem'
+        dev.type = 'pci'
+        dev.managed = 'no'
+        dev.source = dev.new_source(**pci_address)
+    arch = platform.machine()
+    ioa_system_info = params.get("ioa_system_info", "ioa-bus-error")
+    func = params.get("function", "6")
+    max_freeze = params.get("max_freeze", "5")
+    eeh_guest = params.get("eeh_guest", "no")
+    eeh_host = params.get("eeh_host", "no")
+
+    def test_ping():
+        try:
+            output = session.cmd_output('lspci -nn | grep "%s"' % device_name)
+            nic_id = str(output).split(' ', 1)[0]
+            nic_name = str(utils_misc.get_interface_from_pci_id(nic_id,
+                                                                session))
+            session.cmd("ip addr flush dev %s" % nic_name)
+            session.cmd("ip addr add %s/%s dev %s"
+                        % (net_ip, netmask, nic_name))
+            session.cmd("ip link set %s up" % nic_name)
+            s_ping, o_ping = utils_test.ping(server_ip, count=5,
+                                             interface=net_ip, timeout=30,
+                                             session=session)
+            logging.info(s_ping)
+            logging.info(o_ping)
+            if s_ping:
+                test.fail("Ping test failed")
+        except aexpect.ShellError as detail:
+            test.error("Succeeded in setting the IP on the guest, but "
+                       "failed to bring up the interface.\n"
+                       "Detail: %s." % detail)
+
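+    # The two helpers below manage host driver binding: nodedev-detach
+    # unbinds the device from its host driver and binds it to vfio-pci
+    # so it can be assigned to the guest; nodedev-reattach reverses
+    # this. The "Kernel driver in use" line of lspci -k is checked to
+    # verify which driver currently owns the device.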
+    def detach_device(pci_devs, pci_ids):
+        # Detach the device from the host
+        for pci_value, pci_node in zip(pci_devs, pci_ids):
+            pci_value = pci_value.replace(".", "_")
+            cmd = "lspci -ks %s | grep 'Kernel driver in use' |\
+                   awk '{print $5}'" % pci_node
+            driver_name = process.run(cmd, shell=True).stdout_text.strip()
+            if driver_name == "vfio-pci":
+                logging.debug("Device already detached")
+            else:
+                if virsh.nodedev_detach(pci_value).exit_status:
+                    test.error("Hostdev node detach failed")
+                driver_name = process.run(cmd, shell=True).stdout_text.strip()
+                if driver_name != "vfio-pci":
+                    test.error("Driver bind failed after detach")
+
+    def reattach_device(pci_devs, pci_ids):
+        # Reattach the device to the host
+        for pci_value, pci_node in zip(pci_devs, pci_ids):
+            pci_value = pci_value.replace(".", "_")
+            cmd = "lspci -ks %s | grep 'Kernel driver in use' |\
+                   awk '{print $5}'" % pci_node
+            driver_name = process.run(cmd, shell=True).stdout_text.strip()
+            if driver_name != "vfio-pci":
+                logging.debug("Device already attached")
+            else:
+                if virsh.nodedev_reattach(pci_value).exit_status:
+                    test.fail("Hostdev node reattach failed")
+                driver_name = process.run(cmd, shell=True).stdout_text.strip()
+                if driver_name == "vfio-pci":
+                    test.error("Driver bind failed after reattach")
+
+    def check_attach_pci():
+        session = vm.wait_for_login()
+        output = session.cmd_output("ip link")
+        nic_list_after = str(output.splitlines())
+        logging.debug(nic_list_after)
+        return nic_list_after != nic_list_before
+
+    def device_hotplug():
+        if arch == "ppc64le":
+            if libvirt_version.version_compare(3, 10, 0):
+                detach_device(pci_devs, pci_ids)
+        else:
+            if not libvirt_version.version_compare(3, 10, 0):
+                detach_device(pci_devs, pci_ids)
+        # Attach the device in hotplug mode
+        result = virsh.attach_device(vm_name, dev.xml,
+                                     flagstr="--live", debug=True)
+        if result.exit_status:
+            test.error(result.stdout.strip())
+        else:
+            logging.debug(result.stdout.strip())
+        if not utils_misc.wait_for(check_attach_pci, timeout):
+            test.fail("Device did not appear in the guest within the "
+                      "timeout; the timeout value is not sufficient")
+
+    # Detach the hotplugged device
+    def device_hotunplug():
+        result = virsh.detach_device(vm_name, dev.xml,
+                                     flagstr="--live", debug=True)
+        if result.exit_status:
+            test.fail(result.stdout.strip())
+        else:
+            logging.debug(result.stdout.strip())
+        # FIXME: after detaching the device from the guest, it needs
+        # some time before any other operation can be performed on it
+        time.sleep(timeout)
+        if not libvirt_version.version_compare(3, 10, 0):
+            pci_devs.sort()
+            reattach_device(pci_devs, pci_ids)
+
+    def check_device():
+        # Get the result of "fdisk -l" in the guest and compare it
+        # with fdisk_list_before.
+        output = session.cmd_output("fdisk -l|grep \"Disk identifier:\"")
+        fdisk_list_after = output.splitlines()
+        if fdisk_list_after == fdisk_list_before:
+            test.fail("Didn't find the disk attached to the guest.")
+
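+    # dmesg is consumed incrementally below: each helper remembers the
+    # last line it has seen and only inspects messages logged after
+    # that point, so one EEH iteration does not match freeze/recovery
+    # messages left over from a previous iteration.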
+ """ + cmd = "dmesg" + res_status, res_output = session.cmd_status_output(cmd) + if res_status != 0: + test.fail("Failed to fetch dmesg logs, status: %s, output: %s" % \ + (res_status, res_output)) + logs = res_output.splitlines() + if last_dmesg_line is not None: + # Get logs after the last known line + try: + idx = logs.index(last_dmesg_line) + new_logs = logs[idx + 1:] + except ValueError: + new_logs = logs + else: + new_logs = logs + return new_logs, logs[-1] if logs else None + + def test_eeh_nic(): + cmd = "echo %s > /sys/kernel/debug/powerpc/eeh_max_freezes" % max_freeze + process.run(cmd, shell=True) + loc_code = str(utils_misc.get_location_code(pci_id)) + num_of_miss = 0 + last_dmesg_line = None # Initialize the last dmesg line pointer + pass_hit = 0 + for num_of_hit in range(int(max_freeze)): + if num_of_miss < 4: + # Inject EEH error using below command + eeh_cmd = "errinjct %s -f %s -p %s -m 0" % (ioa_system_info, func, loc_code) + if eeh_guest == "yes": + session.cmd(eeh_cmd) + if eeh_host == "yes": + process.run(eeh_cmd, shell=True) + is_hit, last_dmesg_line = check_eeh_hit(last_dmesg_line) + if not is_hit: + num_of_miss += 1 + if num_of_hit >= 1 and pass_hit != 0: + test.fail("Failed to inject EEH after %s sucessfull attempt for %s. \ + Please check dmesg logs" % (pass_hit, pci_ids)) + logging.debug("PCI Device %s EEH hit failed" % pci_ids) + continue + is_recovered, last_dmesg_line = check_eeh_pci_device_recovery(last_dmesg_line) + if not is_recovered: + test.fail("PCI device %s recovery failed after %s EEH" % (pci_ids, num_of_hit)) + else: + test.fail("Failed to Inject EEH for 5 times") + pass_hit += 1 + is_removed, last_dmesg_line = check_eeh_removed(last_dmesg_line) + if is_removed: + logging.debug("PCI Device %s removed successfully" % pci_ids) + else: + test.fail("PCI Device %s failed to permanetly disable after max hit" % pci_ids) + + def check_eeh_pci_device_recovery(last_dmesg_line): + """ + Check if the pci device is recovered successfully after injecting EEH + """ + tries = 60 + for _ in range(0, tries): + logs, last_dmesg_line = get_new_dmesg_logs(last_dmesg_line) + if any('permanent failure' in log for log in logs): + logging.debug("TEST WILL FAIL AS PERMANENT FAILURE IS SEEN") + elif any('EEH: Recovery successful.' 
+    def check_eeh_pci_device_recovery(last_dmesg_line):
+        """
+        Check if the pci device recovered successfully after injecting EEH
+        """
+        tries = 60
+        for _ in range(0, tries):
+            logs, last_dmesg_line = get_new_dmesg_logs(last_dmesg_line)
+            if any('permanent failure' in log for log in logs):
+                logging.debug("TEST WILL FAIL AS PERMANENT FAILURE IS SEEN")
+            elif any('EEH: Recovery successful.' in log for log in logs):
+                logging.debug("EEH recovery reported; waiting for pci "
+                              "device %s to come back", pci_ids)
+                break
+            time.sleep(5)
+        else:
+            logging.debug("EEH recovery failed for pci device %s" % pci_ids)
+        tries = 30
+        for _ in range(0, tries):
+            if sorted(vm.get_pci_devices()) != sorted(nic_list_before):
+                logging.debug("Adapter found after EEH was injected successfully")
+                return True, last_dmesg_line
+            time.sleep(1)
+        return False, last_dmesg_line
+
+    def check_eeh_hit(last_dmesg_line):
+        """
+        Check if EEH was successfully hit
+        """
+        tries = 30
+        for _ in range(0, tries):
+            logs, last_dmesg_line = get_new_dmesg_logs(last_dmesg_line)
+            if any('EEH: Frozen' in log for log in logs):
+                return True, last_dmesg_line
+            time.sleep(1)
+        return False, last_dmesg_line
+
+    def check_eeh_removed(last_dmesg_line):
+        """
+        Check if the PCI passthrough device was permanently disabled
+        """
+        tries = 30
+        for _ in range(0, tries):
+            cmd = "dmesg"
+            res_status, res_output = session.cmd_status_output(cmd)
+            if 'permanent failure' in res_output and res_status == 0:
+                time.sleep(10)
+                return True, last_dmesg_line
+            time.sleep(1)
+        return False, last_dmesg_line
+
+    try:
+        device_hotplug()
+        if device_type == "NIC":
+            test_ping()
+            test_eeh_nic()
+        if device_type == "STORAGE":
+            check_device()
+            test_eeh_storage()
+        device_hotunplug()
+
+    finally:
+        cmd = "dmesg"
+        res_output = session.cmd_output(cmd)
+        logging.debug("Complete dmesg logs:: %s", res_output)
+        backup_xml.sync()
+        if session:
+            session.close()
+        if arch == "ppc64le":
+            if libvirt_version.version_compare(3, 10, 0):
+                pci_devs.sort()
+                reattach_device(pci_devs, pci_ids)
+        else:
+            if not libvirt_version.version_compare(3, 10, 0):
+                pci_devs.sort()
+                reattach_device(pci_devs, pci_ids)