From 4c4c67c5b4d480d71fde302fbdf1798d49851828 Mon Sep 17 00:00:00 2001
From: Misbah Anjum N
Date: Fri, 14 Feb 2025 15:25:19 +0530
Subject: [PATCH 1/2] multivm_stress: Update script to test all edge cases

This patch captures multiple edge cases to test multivm scenarios.
The following updates are added:

- add stress_time parameter to run stress for n seconds before
  starting stress_events
- add debug_dir parameter to specify where the debug files are saved
- add dump_options parameter to specify the virsh dump type
- update guest on_crash value to "preserve" in case of crash
- add function check_call_traces to check for any call trace in dmesg
  during stress; check guest state and call traces every ten minutes
- if any VMs crashed, dump them to debug_dir for further analysis
- run stress_events in the remaining stable VMs if present, else skip
- check for error messages and fail the test if found

Signed-off-by: Misbah Anjum N
---
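Note: a minimal cfg sketch exercising the new parameters. The variant
name and values below are illustrative assumptions; only the parameter
keys (stress_time, debug_dir, dump_options, stress_events) come from
this patch:

    # hypothetical variant; values are examples only
    - multivm_stress.long_run:
        stress_time = 1800
        debug_dir = "/var/tmp/"
        dump_options = "--memory-only --bypass-cache"
        stress_events = "reboot"

With stress_time = 1800 the script monitors in 600-second slices:
number_of_checks = 1800 / 600 = 3 and delta_time = 1800 % 600 = 0, so
guest state and dmesg call traces are checked three times before
stress_events run on the VMs that stayed stable.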
 .../src/multivm_stress/multivm_stress.py      | 211 ++++++++++++++++--
 1 file changed, 193 insertions(+), 18 deletions(-)

diff --git a/libvirt/tests/src/multivm_stress/multivm_stress.py b/libvirt/tests/src/multivm_stress/multivm_stress.py
index cd74b0c122f..9f7107fa78e 100644
--- a/libvirt/tests/src/multivm_stress/multivm_stress.py
+++ b/libvirt/tests/src/multivm_stress/multivm_stress.py
@@ -1,8 +1,11 @@
 import logging as log
+import time
 
 from virttest import utils_stress
 from virttest import error_context
 from virttest import utils_test
+from virttest import virsh
+from virttest.libvirt_xml import vm_xml
 
 
 # Using as lower capital is not the best way to do, but this is just a
@@ -20,38 +23,210 @@ def run(test, params, env):
     guest_stress = params.get("guest_stress", "no") == "yes"
     host_stress = params.get("host_stress", "no") == "yes"
-    stress_events = params.get("stress_events", "reboot")
+    stress_events = params.get("stress_events", "")
+    stress_time = params.get("stress_time", "30")
+    debug_dir = params.get("debug_dir", "/home/")
+    dump_options = params.get("dump_options", "--memory-only --bypass-cache")
     vms = env.get_all_vms()
     vms_uptime_init = {}
+
     if "reboot" not in stress_events:
         for vm in vms:
             vms_uptime_init[vm.name] = vm.uptime()
-    stress_event = utils_stress.VMStressEvents(params, env)
+
+    if guest_stress:
+        # change the on_crash value to "preserve" so a crashed guest is kept
+        for vm in vms:
+            logging.debug("Setting on_crash to preserve in %s" % vm.name)
+            vmxml = vm_xml.VMXML.new_from_inactive_dumpxml(vm.name)
+            if vm.is_alive():
+                vm.destroy(gracefully=False)
+            vmxml.on_crash = "preserve"
+            vmxml.sync()
+            vm.start()
+
         try:
             utils_test.load_stress("stress_in_vms", params=params, vms=vms)
         except Exception as err:
-            test.fail("Error running stress in vms: %s" % err)
+            test.fail("Error running stress in vms: %s" % str(err))
+
     if host_stress:
         if params.get("host_stress_args", ""):
             params["stress_args"] = params.get("host_stress_args")
         try:
             utils_test.load_stress("stress_on_host", params=params)
         except Exception as err:
-            test.fail("Error running stress in host: %s" % err)
-    try:
-        stress_event.run_threads()
-    finally:
-        stress_event.wait_for_threads()
-        if guest_stress:
-            utils_test.unload_stress("stress_in_vms", params=params, vms=vms)
-        if host_stress:
-            utils_test.unload_stress("stress_on_host", params=params)
-    if "reboot" not in stress_events:
-        fail = False
-        for vm in vms:
-            if vm.uptime() < vms_uptime_init[vm.name]:
-                logging.error("Unexpected reboot of VM: %s between test", vm.name)
-                fail = True
-    if fail:
-        test.fail("Unexpected VM reboot detected")
+            test.fail("Error running stress in host: %s" % str(err))
+
+    stress_timer = int(stress_time)
+    fail = False
+    found_traces = False
+    failed_vms = []
+    login_error_vms = []
+    unexpected_reboot_vms = []
+    error_message = ""
+
+    if guest_stress:
+        # check for any call traces in guest dmesg while stress is running
+        def check_call_traces(vm):
+            nonlocal stress_timer
+            found_trace = False
+            try:
+                retry_login = True
+                retry_times = 0
+                while retry_login:
+                    try:
+                        retry_login = False
+                        session = vm.wait_for_login(timeout=100)
+                        if vm in login_error_vms:
+                            login_error_vms.remove(vm)
+                    except Exception:
+                        stress_timer -= 150
+                        if vm in login_error_vms:
+                            return False
+                        retry_login = True
+                        retry_times += 1
+                        if retry_times == 3:
+                            logging.debug("Error logging into %s" % vm.name)
+                            if vm not in login_error_vms:
+                                login_error_vms.append(vm)
+                            return False
+                        time.sleep(30)
+                        stress_timer -= 30
+
+                dmesg = session.cmd("dmesg")
+                dmesg_level = session.cmd("dmesg -l emerg,alert,crit")
+                if "Call Trace" in dmesg or len(dmesg_level) >= 1:
+                    logging.debug("Call trace found in %s" % vm.name)
+                    if vm not in failed_vms:
+                        failed_vms.append(vm)
+                    found_trace = True
+                session.close()
+            except Exception as err:
+                test.error("Error getting dmesg of %s due to %s"
+                           % (vm.name, str(err)))
+            return found_trace
+
+        # run stress for stress_time seconds
+        logging.debug("Sleeping for %s seconds waiting for stress completion"
+                      % stress_time)
+        stress_time = int(stress_time)
+
+        # check domstate of vms once after stress_time
+        if stress_time < 600:
+            time.sleep(stress_time)
+            for vm in vms:
+                if vm.state() != "running":
+                    logging.debug("%s state is %s" % (vm.name, vm.state()))
+                    failed_vms.append(vm)
+                    fail = True
+                else:
+                    found_traces = check_call_traces(vm)
+                    if found_traces:
+                        fail = True
+                time.sleep(2)
+
+        # check domstate of vms every ten minutes during stress_time
+        else:
+            all_failed = False
+            number_of_checks = int(stress_time / 600)
+            delta_time = int(stress_time % 600)
+            for itr in range(number_of_checks):
+                if len(failed_vms) == len(vms) or len(login_error_vms) == len(vms):
+                    all_failed = True
+                    break
+                if stress_timer <= 0:
+                    break
+                time.sleep(600)
+                for vm in vms:
+                    if vm.state() != "running":
+                        logging.debug("%s state is %s" % (vm.name, vm.state()))
+                        if vm not in failed_vms:
+                            failed_vms.append(vm)
+                        fail = True
+                    else:
+                        found_traces = check_call_traces(vm)
+                        if found_traces:
+                            fail = True
+                    time.sleep(3)
+                    stress_timer -= 3
+
+            if delta_time > 0 and stress_timer > 0 and not all_failed:
+                time.sleep(delta_time)
+                for vm in vms:
+                    if vm.state() != "running":
+                        logging.debug("%s state is %s" % (vm.name, vm.state()))
+                        if vm not in failed_vms:
+                            failed_vms.append(vm)
+                        fail = True
+                    else:
+                        found_traces = check_call_traces(vm)
+                        if found_traces:
+                            fail = True
+                    time.sleep(3)
+                    stress_timer -= 3
+
+        # virsh dump the failed vms into debug_dir
+        if fail:
+            for vm in failed_vms:
+                if vm.state() != "shut off":
+                    logging.debug("Dumping %s to debug_dir %s" % (vm.name, debug_dir))
+                    virsh.dump(vm.name, debug_dir + vm.name + "-core",
+                               dump_options, ignore_status=False, debug=True)
+                    logging.debug("Successfully dumped %s as %s-core"
+                                  % (vm.name, vm.name))
+                else:
+                    logging.debug("Cannot dump %s as it is in shut off state"
+                                  % vm.name)
+            failed_vms_string = ", ".join(vm.name for vm in failed_vms)
+            error_message = "Failure in " + failed_vms_string + " while running stress. "
+
+        if login_error_vms:
+            login_error_vms_string = ", ".join(vm.name for vm in login_error_vms)
+            error_message += "Login error in " + login_error_vms_string + " while running stress. "
+
+        if len(failed_vms) == len(vms) or len(login_error_vms) == len(vms):
+            error_message += "All vms in unstable state while running stress. Couldn't run STRESS EVENTS"
+            test.fail(error_message)
+
+    # run STRESS EVENTS in the remaining stable guests
+    if len(failed_vms) < len(vms) and len(login_error_vms) < len(vms):
+        for vm in failed_vms:
+            if vm in vms:
+                vms.remove(vm)
+        for vm in login_error_vms:
+            if vm in vms:
+                vms.remove(vm)
+
+        if len(vms) == 0:
+            error_message += "All vms in unstable state while running stress. Couldn't run STRESS EVENTS"
+            test.fail(error_message)
+
+        new_vms = ", ".join(vm.name for vm in vms)
+        try:
+            if stress_events != "":
+                logging.debug("Running stress_events in %s" % new_vms)
+                stress_event = utils_stress.VMStressEvents(params, env, vms)
+                stress_event.run_threads()
+                stress_event.wait_for_threads()
+
+            if guest_stress:
+                utils_test.unload_stress("stress_in_vms", params=params, vms=vms)
+
+            if host_stress:
+                utils_test.unload_stress("stress_on_host", params=params)
+
+            if "reboot" not in stress_events:
+                for vm in vms:
+                    if vm.uptime() < vms_uptime_init[vm.name]:
+                        logging.debug("Unexpected reboot of VM: %s during test",
                                      vm.name)
+                        unexpected_reboot_vms.append(vm)
+                unexpected_reboot_vms_string = ", ".join(vm.name for vm in unexpected_reboot_vms)
+                if unexpected_reboot_vms:
+                    error_message += "Unexpected reboot of guest(s) " + unexpected_reboot_vms_string + ". "
+        except Exception as err:
+            error_message += ("Failure running STRESS EVENTS in " + new_vms
+                              + " due to " + str(err))
+
+    # check the test status
+    if error_message:
+        test.fail(error_message)
\ No newline at end of file

From fef2dbd1e4e72b694804f8ec31275c262ac6720d Mon Sep 17 00:00:00 2001
From: Tasmiya Nalatwad
Date: Wed, 5 Mar 2025 18:07:19 +0530
Subject: [PATCH 2/2] Added EEH test cases for PCI PT NIC devices

Added 2 test cases:
1. Inject EEH from host console for PCI PT device
2. Inject EEH from guest console for PCI PT device

Signed-off-by: Tasmiya Nalatwad
---
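Note: a sketch of the injection flow this test drives; the location
code below is an illustrative assumption, while the command shape and
defaults (ioa-bus-error, function 6, max_freeze 5) come from
test_eeh_nic() in this patch:

    # allow max_freeze EEH freezes before the device is permanently failed
    echo 5 > /sys/kernel/debug/powerpc/eeh_max_freezes
    # inject an ioa-bus-error at the adapter's location code; run on the
    # host for eeh_host = "yes" or inside the guest for eeh_guest = "yes"
    errinjct ioa-bus-error -f 6 -p U78CB.001.ABC1234-P1-C9 -m 0

Each hit is expected to log "EEH: Frozen" in dmesg followed by
"EEH: Recovery successful."; once the freeze limit is exceeded the
kernel reports a permanent failure and the device stays disabled,
which is what check_eeh_removed() looks for.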
 .../pci/libvirt_pci_passthrough_eeh.cfg       |  51 +++
 .../pci/libvirt_pci_passthrough_eeh.py        | 376 ++++++++++++++++++
 2 files changed, 427 insertions(+)
 create mode 100644 libvirt/tests/cfg/passthrough/pci/libvirt_pci_passthrough_eeh.cfg
 create mode 100644 libvirt/tests/src/passthrough/pci/libvirt_pci_passthrough_eeh.py

diff --git a/libvirt/tests/cfg/passthrough/pci/libvirt_pci_passthrough_eeh.cfg b/libvirt/tests/cfg/passthrough/pci/libvirt_pci_passthrough_eeh.cfg
new file mode 100644
index 00000000000..9783ae069ca
--- /dev/null
+++ b/libvirt/tests/cfg/passthrough/pci/libvirt_pci_passthrough_eeh.cfg
@@ -0,0 +1,51 @@
+- libvirt_pci_passthrough_eeh:
+    virt_test_type = libvirt
+    provider = io-github-autotest-libvirt
+    type = libvirt_pci_passthrough_eeh
+    iteration_val = 2
+    no s390-virtio
+    variants:
+        - Normal_passthrough:
+            libvirt_pci_SRIOV = no
+        # Removing SRIOV for now, as currently we don't test this
+        # - SRIOV:
+        #     libvirt_pci_SRIOV = yes
+        #     vf_filter = "Virtual Function"
+        #     Enter the number of Virtual Functions to be created
+        #     for each Physical Function.
+        #     number_vfs = 4
+    variants:
+        - PCI_NIC:
+            libvirt_pci_device_type = "NIC"
+            # Please enter the PCI device label for
+            # a network device. We will attach this
+            # device to the guest. This network device
+            # will then be unavailable on the host.
+            # E.g: 0000:05:00.0
+            libvirt_pci_net_dev_label = "ENTER.YOUR.PCI.LABEL"
+            libvirt_pci_net_dev_name = "ENTER.YOUR.DEVICE.NAME"
+            # Please enter the IP that is used by the device
+            # you are going to attach to the guest.
+            libvirt_pci_net_ip = "ENTER.YOUR.IP"
+            # Please enter an available IP from the net device.
+            # We need to ping it after attaching the PCI device
+            # to the guest to verify the device works well in the guest.
+            libvirt_pci_server_ip = "ENTER.YOUR.SERVER.IP"
+            # Enter netmask in CIDR notation
+            libvirt_pci_net_mask = "ENTER.YOUR.NETMASK"
+            # Enter timeout value
+            timeout = "ENTER.YOUR.TIMEOUT"
+            model = "ENTER.YOUR.DEVICE.MODEL"
+            index = "1"
+        - PCI_STORAGE:
+            libvirt_pci_device_type = "STORAGE"
+            # Please enter the PCI device label for
+            # a storage device. We will attach this
+            # device to the guest.
+            # E.g: pci_0000_0d_00_0
+            libvirt_pci_storage_dev_label = "ENTER.YOUR.PCI.LABEL"
+    variants:
+        - passthrough_eeh_guest:
+            eeh_guest = "yes"
+        - passthrough_eeh_host:
+            eeh_host = "yes"

diff --git a/libvirt/tests/src/passthrough/pci/libvirt_pci_passthrough_eeh.py b/libvirt/tests/src/passthrough/pci/libvirt_pci_passthrough_eeh.py
new file mode 100644
index 00000000000..82c2d862d82
--- /dev/null
+++ b/libvirt/tests/src/passthrough/pci/libvirt_pci_passthrough_eeh.py
@@ -0,0 +1,376 @@
+#!/usr/bin/env python
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# Author: Tasmiya.Nalatwad
+# EEH Test on pci passthrough devices
+
+import logging as log
+import aexpect
+import platform
+import time
+
+from avocado.utils import process
+
+from virttest import virsh
+from virttest.libvirt_xml.vm_xml import VMXML
+from virttest.libvirt_xml.nodedev_xml import NodedevXML
+from virttest.test_setup import PciAssignable
+from virttest import utils_test, utils_misc
+from virttest.libvirt_xml.devices.controller import Controller
+from virttest import utils_package
+from virttest import libvirt_version
+
+# Using as lower capital is not the best way to do, but this is just a
+# workaround to avoid changing the entire file.
+logging = log.getLogger('avocado.' + __name__)
+
+
+def run(test, params, env):
+    """
+    Test EEH functionality on a PCI passthrough device
+    of a libvirt guest
+    a). NIC:
+        1. Get params.
+        2. Get the pci device function.
+        3. Start guest
+        4. prepare device xml to be attached
+        5. hotplug the device
+        6. check whether the device was hotplugged
+        7. Ping server_ip from the guest
+        8. Get location code for the pci device
+        9. Trigger error injection on the pci device
+        10. Check error injection recovery on the device
+        11. check device availability inside guest
+    b). STORAGE:
+        1. Get params.
+        2. Get the pci device function.
+        3. Start guest
+        4. prepare device xml to be attached
+        5. hotplug the device
+        6. check whether the device was hotplugged
+        7. check STORAGE device inside guest
+        8. Get location code for the pci device
+        9. mount file (/dev/nvme*) on /mnt
+        10. Trigger EEH on the pci device
+        11. Check EEH recovery on the device
+        12. check device availability inside guest
+    """
+    # get the params from params
+    vm_name = params.get("main_vm")
+    vm = env.get_vm(vm_name)
+    device_name = params.get("libvirt_pci_net_dev_name", "ENTER_YOUR.DEV.NAME")
+    device_type = params.get("libvirt_pci_device_type", "NIC")
+    pci_id = params.get("libvirt_pci_net_dev_label", "ENTER_YOUR.DEV.LABEL")
+    net_ip = params.get("libvirt_pci_net_ip", "ENTER_YOUR.IP")
+    server_ip = params.get("libvirt_pci_server_ip",
+                           "ENTER_YOUR.SERVER.IP")
+    netmask = params.get("libvirt_pci_net_mask", "ENTER_YOUR.MASK")
+    timeout = int(params.get("timeout", "ENTER_YOUR.TIMEOUT.VALUE"))
+
+    vmxml = VMXML.new_from_inactive_dumpxml(vm_name)
+    backup_xml = vmxml.copy()
+    devices = vmxml.get_devices()
+    pci_devs = []
+
+    cntlr_index = params.get("index", "1")
+    cntlr_model = params.get("model", "pci-root")
+    cntlr_type = "pci"
+
+    controllers = vmxml.get_controllers(cntlr_type, cntlr_model)
+    index_list = []
+    for controller in controllers:
+        index_value = controller.get("index")
+        if index_value is not None:
+            index_list.append(int(index_value))
+    if index_list:
+        next_index = max(index_list) + 1
+    else:
+        next_index = int(cntlr_index)
+
+    controller = Controller("controller")
+    controller.type = cntlr_type
+    controller.index = str(next_index)
+    controller.model = cntlr_model
+
+    devices.append(controller)
+    vmxml.set_devices(devices)
+    vmxml.sync()
+    if not vm.is_alive():
+        vm.start()
+    session = vm.wait_for_login()
+    if not utils_package.package_install(["ppc64-diag",
+                                          "librtas", "powerpc-utils"],
+                                         session, 360):
+        test.cancel('Failed to install dependencies')
+
+    output = session.cmd_output("ip link")
+    logging.debug("checking for output - %s", output)
+    nic_list_before = str(output.splitlines())
+    logging.debug("nic_list before hotplug %s", nic_list_before)
+    obj = PciAssignable()
+    # get all function ids
+    logging.debug("PCI_ID: %s", pci_id)
+    pci_ids = obj.get_same_group_devs(pci_id)
+    for val in pci_ids:
+        temp = val.replace(":", "_")
+        pci_devs.extend(["pci_" + temp])
+    pci_val = pci_devs[0].replace(".", "_")
+    if device_type == "NIC":
+        if not vm.is_alive():
+            vm.start()
+        session = vm.wait_for_login()
+        nic_list_before = vm.get_pci_devices()
+    pci_xml = NodedevXML.new_from_dumpxml(pci_val)
+    pci_address = pci_xml.cap.get_address_dict()
+    dev = VMXML.get_device_class('hostdev')()
+    dev.mode = 'subsystem'
+    dev.type = 'pci'
+    dev.managed = 'no'
+    dev.source = dev.new_source(**pci_address)
+    arch = platform.machine()
+    ioa_system_info = params.get("ioa_system_info", "ioa-bus-error")
+    func = params.get("function", "6")
+    max_freeze = params.get("max_freeze", "5")
+    eeh_guest = params.get("eeh_guest", "no")
+    eeh_host = params.get("eeh_host", "no")
+
+    def test_ping():
+        try:
+            output = session.cmd_output('lspci -nn | grep "%s"' % device_name)
+            nic_id = str(output).split(' ', 1)[0]
+            nic_name = str(utils_misc.get_interface_from_pci_id(nic_id,
+                                                                session))
+            session.cmd("ip addr flush dev %s" % nic_name)
+            session.cmd("ip addr add %s/%s dev %s"
+                        % (net_ip, netmask, nic_name))
+            session.cmd("ip link set %s up" % nic_name)
+            s_ping, o_ping = utils_test.ping(server_ip, count=5,
+                                             interface=net_ip, timeout=30,
+                                             session=session)
+            logging.info(s_ping)
+            logging.info(o_ping)
+            if s_ping:
+                test.fail("Ping test failed")
+        except aexpect.ShellError as detail:
+            test.error("Succeeded to set IP on guest, but failed "
+                       "to bring up the interface.\n"
+                       "Detail: %s." % detail)
+
+    def detach_device(pci_devs, pci_ids):
+        # detach the device from the host
+        for pci_value, pci_node in zip(pci_devs, pci_ids):
+            pci_value = pci_value.replace(".", "_")
+            cmd = "lspci -ks %s | grep 'Kernel driver in use' |\
+                   awk '{print $5}'" % pci_node
+            driver_name = process.run(cmd, shell=True).stdout_text.strip()
+            if driver_name == "vfio-pci":
+                logging.debug("device already detached")
+            else:
+                if virsh.nodedev_detach(pci_value).exit_status:
+                    test.error("Hostdev node detach failed")
+                driver_name = process.run(cmd, shell=True).stdout_text.strip()
+                if driver_name != "vfio-pci":
+                    test.error("driver bind failed after detach")
+
+    def reattach_device(pci_devs, pci_ids):
+        # reattach the device to the host
+        for pci_value, pci_node in zip(pci_devs, pci_ids):
+            pci_value = pci_value.replace(".", "_")
+            cmd = "lspci -ks %s | grep 'Kernel driver in use' |\
+                   awk '{print $5}'" % pci_node
+            driver_name = process.run(cmd, shell=True).stdout_text.strip()
+            if driver_name != "vfio-pci":
+                logging.debug("device already attached")
+            else:
+                if virsh.nodedev_reattach(pci_value).exit_status:
+                    test.fail("Hostdev node reattach failed")
+                driver_name = process.run(cmd, shell=True).stdout_text.strip()
+                if driver_name == "vfio-pci":
+                    test.error("driver bind failed after reattach")
+
+    def check_attach_pci():
+        session = vm.wait_for_login()
+        output = session.cmd_output("ip link")
+        nic_list_after = str(output.splitlines())
+        logging.debug(nic_list_after)
+        return nic_list_after != nic_list_before
+
+    def device_hotplug():
+        if arch == "ppc64le":
+            if libvirt_version.version_compare(3, 10, 0):
+                detach_device(pci_devs, pci_ids)
+        else:
+            if not libvirt_version.version_compare(3, 10, 0):
+                detach_device(pci_devs, pci_ids)
+        # attach the device in hotplug mode
+        result = virsh.attach_device(vm_name, dev.xml,
+                                     flagstr="--live", debug=True)
+        if result.exit_status:
+            test.error(result.stdout.strip())
+        else:
+            logging.debug(result.stdout.strip())
+        if not utils_misc.wait_for(check_attach_pci, timeout):
+            test.fail("Hotplugged device not visible in guest within %ss"
+                      % timeout)
+
+    # detach the hotplugged device
+    def device_hotunplug():
+        result = virsh.detach_device(vm_name, dev.xml,
+                                     flagstr="--live", debug=True)
+        if result.exit_status:
+            test.fail(result.stdout.strip())
+        else:
+            logging.debug(result.stdout.strip())
+        # Fix me
+        # the purpose of waiting here is that after detaching the device
+        # from the guest, it needs time before any other operation can be
+        # performed on the device
+        time.sleep(timeout)
+        if not libvirt_version.version_compare(3, 10, 0):
+            pci_devs.sort()
+            reattach_device(pci_devs, pci_ids)
+
+    def check_device():
+        # Get the result of "fdisk -l" in the guest, and
+        # compare the result with fdisk_list_before.
+        output = session.cmd_output("fdisk -l|grep \"Disk identifier:\"")
+        fdisk_list_after = output.splitlines()
+        if fdisk_list_after == fdisk_list_before:
+            test.fail("Didn't find the disk attached to guest.")
+
+    def get_new_dmesg_logs(last_dmesg_line):
+        """
+        Fetch new `dmesg` logs since the last pointer.
+        """
+        cmd = "dmesg"
+        res_status, res_output = session.cmd_status_output(cmd)
+        if res_status != 0:
+            test.fail("Failed to fetch dmesg logs, status: %s, output: %s"
+                      % (res_status, res_output))
+        logs = res_output.splitlines()
+        if last_dmesg_line is not None:
+            # Get logs after the last known line
+            try:
+                idx = logs.index(last_dmesg_line)
+                new_logs = logs[idx + 1:]
+            except ValueError:
+                new_logs = logs
+        else:
+            new_logs = logs
+        return new_logs, logs[-1] if logs else None
+
+    def test_eeh_nic():
+        cmd = "echo %s > /sys/kernel/debug/powerpc/eeh_max_freezes" % max_freeze
+        process.run(cmd, shell=True)
+        loc_code = str(utils_misc.get_location_code(pci_id))
+        num_of_miss = 0
+        last_dmesg_line = None  # Initialize the last dmesg line pointer
+        pass_hit = 0
+        for num_of_hit in range(int(max_freeze)):
+            if num_of_miss < 4:
+                # Inject an EEH error using the command below
+                eeh_cmd = "errinjct %s -f %s -p %s -m 0" % (ioa_system_info,
+                                                            func, loc_code)
+                if eeh_guest == "yes":
+                    session.cmd(eeh_cmd)
+                if eeh_host == "yes":
+                    process.run(eeh_cmd, shell=True)
+                is_hit, last_dmesg_line = check_eeh_hit(last_dmesg_line)
+                if not is_hit:
+                    num_of_miss += 1
+                    if num_of_hit >= 1 and pass_hit != 0:
+                        test.fail("Failed to inject EEH after %s successful "
+                                  "attempt(s) for %s. Please check dmesg logs"
+                                  % (pass_hit, pci_ids))
+                    logging.debug("PCI Device %s EEH hit failed" % pci_ids)
+                    continue
+                is_recovered, last_dmesg_line = check_eeh_pci_device_recovery(last_dmesg_line)
+                if not is_recovered:
+                    test.fail("PCI device %s recovery failed after EEH %s"
+                              % (pci_ids, num_of_hit))
+            else:
+                test.fail("Failed to inject EEH 5 times")
+            pass_hit += 1
+        is_removed, last_dmesg_line = check_eeh_removed(last_dmesg_line)
+        if is_removed:
+            logging.debug("PCI Device %s removed successfully" % pci_ids)
+        else:
+            test.fail("PCI Device %s failed to be permanently disabled "
+                      "after max hits" % pci_ids)
+
+    def check_eeh_pci_device_recovery(last_dmesg_line):
+        """
+        Check if the pci device recovered successfully after injecting EEH
+        """
+        tries = 60
+        for _ in range(0, tries):
+            logs, last_dmesg_line = get_new_dmesg_logs(last_dmesg_line)
+            if any('permanent failure' in log for log in logs):
+                logging.debug("TEST WILL FAIL AS PERMANENT FAILURE IS SEEN")
+            elif any('EEH: Recovery successful.' in log for log in logs):
+                logging.debug("EEH recovery reported for pci device %s", pci_ids)
+                break
+            time.sleep(5)
+        else:
+            logging.debug("EEH recovery failed for pci device %s" % pci_ids)
+        tries = 30
+        for _ in range(0, tries):
+            if sorted(vm.get_pci_devices()) != sorted(nic_list_before):
+                logging.debug("Adapter found after EEH injection succeeded")
+                return True, last_dmesg_line
+            time.sleep(1)
+        return False, last_dmesg_line
+
+    def check_eeh_hit(last_dmesg_line):
+        """
+        Function to check if EEH is successfully hit
+        """
+        tries = 30
+        for _ in range(0, tries):
+            logs, last_dmesg_line = get_new_dmesg_logs(last_dmesg_line)
+            if any('EEH: Frozen' in log for log in logs):
+                return True, last_dmesg_line
+            time.sleep(1)
+        return False, last_dmesg_line
+
+    def check_eeh_removed(last_dmesg_line):
+        """
+        Function to check if the PCI PT device was permanently disabled
+        """
+        tries = 30
+        for _ in range(0, tries):
+            cmd = "dmesg"
+            res_status, res_output = session.cmd_status_output(cmd)
+            if 'permanent failure' in res_output and res_status == 0:
+                time.sleep(10)
+                return True, last_dmesg_line
+            time.sleep(1)
+        return False, last_dmesg_line
+
+    try:
+        device_hotplug()
+        if device_type == "NIC":
+            test_ping()
+            test_eeh_nic()
+        if device_type == "STORAGE":
+            check_device()
+            test_eeh_storage()
+        device_hotunplug()
+
+    finally:
+        cmd = "dmesg"
+        res_output = session.cmd_output(cmd)
+        logging.debug("complete dmesg Logs:: %s", res_output)
+        backup_xml.sync()
+        if session:
+            session.close()
+        if arch == "ppc64le":
+            if libvirt_version.version_compare(3, 10, 0):
+                pci_devs.sort()
+                reattach_device(pci_devs, pci_ids)
+        else:
+            if not libvirt_version.version_compare(3, 10, 0):
+                pci_devs.sort()
+                reattach_device(pci_devs, pci_ids)