diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 1d7cfd310fc..652db6263cc 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -48,6 +48,9 @@ jobs: - name: Run unit tests run: poetry run python -m pytest --import-mode=append + - name: Run Linux analyzer unit tests + run: poetry --project . --directory "analyzer/linux" run python -m pytest -v + # see the mypy configuration in pyproject.toml - name: Run mypy run: poetry run mypy diff --git a/analyzer/linux/lib/api/process.py b/analyzer/linux/lib/api/process.py index 5a9fea3c3c7..6fc922bc504 100644 --- a/analyzer/linux/lib/api/process.py +++ b/analyzer/linux/lib/api/process.py @@ -30,7 +30,10 @@ def is_alive(self): return True def get_parent_pid(self): - return int(self.get_proc_status().get("PPid")) + try: + return int(self.get_proc_status().get("PPid")) + except (TypeError, ValueError): + return None def get_proc_status(self): try: diff --git a/analyzer/linux/lib/core/packages.py b/analyzer/linux/lib/core/packages.py index 33d7d0b2e26..695719491a4 100644 --- a/analyzer/linux/lib/core/packages.py +++ b/analyzer/linux/lib/core/packages.py @@ -56,22 +56,25 @@ def _found_target_class(module, name): def _guess_package_name(file_type, file_name): - if "Bourne-Again" in file_type or "bash" in file_type: - return "bash" - elif "Zip archive" in file_type: - return "zip" - elif "gzip compressed data" in file_type: - return "zip" - elif "PDF document" in file_type or file_name.endswith(".pdf"): - return "pdf" - elif "Composite Document File V2 Document" in file_type or file_name.endswith(".doc"): - return "doc" - elif "Microsoft Word" in file_type or file_name.endswith(".docx"): - return "doc" - elif "ELF" in file_type: - return "generic" - elif "Unicode text" in file_type or file_name.endswith(".js"): - return "js" + try: + if "Bourne-Again" in file_type or "bash" in file_type: + return "bash" + elif "Zip archive" in file_type: + return 
"zip" + elif "gzip compressed data" in file_type: + return "zip" + elif "PDF document" in file_type or file_name.endswith(".pdf"): + return "pdf" + elif "Composite Document File V2 Document" in file_type or file_name.endswith(".doc"): + return "doc" + elif "Microsoft Word" in file_type or file_name.endswith(".docx"): + return "doc" + elif "ELF" in file_type: + return "generic" + elif "Unicode text" in file_type or file_name.endswith(".js"): + return "js" + except (TypeError, AttributeError): + pass return None @@ -101,9 +104,16 @@ def __init__(self, target, **kwargs): self.timeout = kwargs.get("timeout") # Command-line arguments for the target. - _args = self.options.get("arguments", []) - if isinstance(_args, str): - self.args = _args.split() + def _args(): + args = self.options.get("arguments") + if isinstance(args, list): + return args + if isinstance(args, str): + return args.split() + return [] + + self.args = _args() + # Choose an analysis method (or fallback to apicalls) self.method = self.options.get("method", "apicalls") # Should our target be launched as root or not diff --git a/analyzer/linux/pytest.ini b/analyzer/linux/pytest.ini new file mode 100644 index 00000000000..a635c5c0318 --- /dev/null +++ b/analyzer/linux/pytest.ini @@ -0,0 +1,2 @@ +[pytest] +pythonpath = . 
diff --git a/analyzer/linux/tests/lib/api/test_process.py b/analyzer/linux/tests/lib/api/test_process.py new file mode 100644 index 00000000000..4b8d73fc56f --- /dev/null +++ b/analyzer/linux/tests/lib/api/test_process.py @@ -0,0 +1,97 @@ +from unittest.mock import Mock, patch, mock_open, PropertyMock +from lib.api.process import Process +import base64 + +import pytest + +proc_status = """Name: test-process +Umask: 0002 +State: R (running) +Tgid: 42 +Ngid: 0 +Pid: 42 +PPid: 24""" + +@pytest.fixture +def os_path_exists(monkeypatch): + monkeypatch.setattr("os.path.exists", Mock(return_value=True)) + yield + +@pytest.fixture +def os_path_not_exists(monkeypatch): + monkeypatch.setattr("os.path.exists", Mock(return_value=False)) + yield + +@pytest.fixture +def fake_proc_status_file(monkeypatch): + monkeypatch.setattr("builtins.open", mock_open(read_data=proc_status)) + yield + +ARGS = { + "pid": 42 +} + +def test_init(): + """Initialize Process instances using both args and kwargs""" + kw_args_instance = Process(**ARGS) + assert kw_args_instance.pid == ARGS["pid"] + args_instance = Process(*ARGS.values()) + assert args_instance.pid == ARGS["pid"] + +@pytest.mark.usefixtures("os_path_exists") +def test_proc_alive_states(): + for state in [ + "State: R (running)", + "State: S (sleeping)", + "State: D (waiting)", + "State: T (stopped)", + "State: t (trace stopped)", + "State: W (paging)", + "State: W (waking)", + "State: P (parked)", + ]: + state_file_content = proc_status.replace("State: R (running)", state) + with patch("builtins.open", mock_open(read_data=state_file_content)): + process = Process(**ARGS) + assert process.is_alive() + +@pytest.mark.usefixtures("os_path_exists") +def test_proc_dead_states(): + for state in [ + "State: Z (zombie)", + ]: + state_file_content = proc_status.replace("State: R (running)", state) + with patch("builtins.open", mock_open(read_data=state_file_content)): + process = Process(**ARGS) + alive = process.is_alive() + assert not alive + 
+@pytest.mark.usefixtures("os_path_not_exists") +def test_proc_file_not_exists(): + process = Process(**ARGS) + assert not process.is_alive() + +@pytest.mark.usefixtures("os_path_exists") +def test_proc_file_corrupt(): + corrupt_status = base64.b64encode(proc_status.encode("utf-8")).decode("utf-8") + with patch("builtins.open", mock_open(read_data=corrupt_status)): + process = Process(**ARGS) + assert not process.is_alive() + +@pytest.mark.usefixtures("os_path_exists", "fake_proc_status_file") +def test_get_ppid(): + process = Process(**ARGS) + assert 24 == process.get_parent_pid() + +@patch("builtins.open", side_effect=FileNotFoundError) +def test_get_ppid_file_not_exists(bopen): + process = Process(**ARGS) + assert process.get_parent_pid() is None + +@patch("subprocess.Popen") +def test_execute(popen): + process = Process(**ARGS) + type(popen.return_value).pid = PropertyMock(return_value=ARGS["pid"]) + assert process.execute(["echo", "this is a test message"]) + assert ARGS["pid"] == process.pid + assert popen.called diff --git a/analyzer/linux/tests/lib/core/test_packages.py b/analyzer/linux/tests/lib/core/test_packages.py new file mode 100644 index 00000000000..1e95b6f08b0 --- /dev/null +++ b/analyzer/linux/tests/lib/core/test_packages.py @@ -0,0 +1,19 @@ +from lib.core.packages import _guess_package_name +import pytest + +@pytest.mark.parametrize("file_type, file_name, expected_package_name", [ + ("", "", None), + ("Bourne-Again", None, "bash"), + ("Zip archive", None, "zip"), + ("gzip compressed data", None, "zip"), + ("PDF document", "test.pdf", "pdf"), + ("Composite Document File V2 Document", "test.docx", "doc"), + ("Microsoft Word", "test.docx", "doc"), + ("ELF", None, "generic"), + ("Unicode text", "malware.js", "js") +]) +def test__guess_package_name(file_type, file_name, expected_package_name): + assert _guess_package_name(file_type, "") == expected_package_name, f"Expected {expected_package_name} for {file_type}, {file_name}" + if file_name: + assert 
_guess_package_name("", file_name) == expected_package_name, f"Expected {expected_package_name} for {file_type}, {file_name}" + assert _guess_package_name(file_type, file_name) == expected_package_name, f"Expected {expected_package_name} for {file_type}, {file_name}" diff --git a/analyzer/linux/tests/lib/core/test_startup.py b/analyzer/linux/tests/lib/core/test_startup.py new file mode 100644 index 00000000000..7e7cadf4be1 --- /dev/null +++ b/analyzer/linux/tests/lib/core/test_startup.py @@ -0,0 +1,53 @@ +from unittest.mock import patch, Mock +import logging +from logging import StreamHandler +import pytest + +from lib.common.results import NetlogHandler +import lib.core.startup +import lib.core.config + +import lib + +@pytest.fixture +def patch_netloghandler(monkeypatch): + monkeypatch.setattr(NetlogHandler, "__init__", Mock(return_value=None)) + monkeypatch.setattr(NetlogHandler, "connect", Mock()) + yield + +@patch('os.makedirs') +@patch('os.path.exists') +def test_create_folders_path_not_exists(os_path_exists, os_mkdirs): + """Test initial folder creation with paths that do not exist""" + # Fake path not existing + os_path_exists.return_value = False + lib.core.startup.create_folders() + assert os_path_exists.called + # Ensure there is an attempt to create a folder + assert os_mkdirs.called + +@patch('os.makedirs') +@patch('os.path.exists') +def test_create_folders_path_exists(os_path_exists, os_mkdirs): + """Test initial folder creation with paths that already exist""" + # Fake path not existing + os_path_exists.return_value = True + lib.core.startup.create_folders() + assert os_path_exists.called + # Ensure there are no attempts to create a folder + assert not os_mkdirs.called + +@pytest.mark.usefixtures("patch_netloghandler") +@patch("logging.Logger.addHandler") +def test_init_logging(addhandler): + """Ensure init_logging adds the right log handlers""" + lib.core.startup.init_logging() + handlers = [] + # Get a list of all the types of handlers that are 
being added + for name, args, kwargs in addhandler.mock_calls: + handlers = [*handlers, *[type(arg) for arg in args]] + # Ensure there is a StreamHandler and a NetlogHandler + assert StreamHandler in handlers + assert NetlogHandler in handlers + # Ensure log level is set to DEBUG + assert lib.core.startup.log.level == logging.DEBUG diff --git a/analyzer/linux/tests/modules/packages/test_package.py b/analyzer/linux/tests/modules/packages/test_package.py new file mode 100644 index 00000000000..afff20fb737 --- /dev/null +++ b/analyzer/linux/tests/modules/packages/test_package.py @@ -0,0 +1,35 @@ +import sys +import unittest +import pytest + +import lib + +from lib.core.packages import Package + +@pytest.fixture +def patch_netlogfile(monkeypatch): + class MockNetlogFile: + def init(self, *args): + return + def close(self): + return + monkeypatch.setattr(lib.core.packages, "NetlogFile", MockNetlogFile) + monkeypatch.setattr(lib.core.packages, "append_buffer_to_host", lambda *args: None) + yield + +class TestPackage(unittest.TestCase): + + @pytest.mark.usefixtures("patch_netlogfile") + def test_package_init_args(self): + pkg = Package(sys.executable, options={}) + self.assertEqual(pkg.args, []) + + @pytest.mark.usefixtures("patch_netlogfile") + def test_package_init_args_list(self): + pkg = Package(sys.executable, options={"arguments": ["foo", "bar"]}) + self.assertEqual(pkg.args, ["foo", "bar"]) + + @pytest.mark.usefixtures("patch_netlogfile") + def test_package_init_args_str(self): + pkg = Package(sys.executable, options={"arguments": "foo bar"}) + self.assertEqual(pkg.args, ["foo", "bar"]) diff --git a/analyzer/linux/tests/test_analyzer.py b/analyzer/linux/tests/test_analyzer.py new file mode 100644 index 00000000000..47998705b40 --- /dev/null +++ b/analyzer/linux/tests/test_analyzer.py @@ -0,0 +1,34 @@ +import sys +import unittest + +import pytest + +if sys.platform == "linux": + import analyzer + from analyzer import PROCESS_LIST, SEEN_LIST + 
+@pytest.mark.skipif(sys.platform != "linux", reason="Requires Linux") +class TestAnalyzer(unittest.TestCase): + + def test_add_pids(self): + """Test add_pids with a variety of valid types""" + # Check that both sets are empty + self.assertEqual(PROCESS_LIST, set()) + self.assertEqual(SEEN_LIST, set()) + + pids = [123, 456, 789] + # Add a list of PIDs + analyzer.add_pids([str(pids[0]), pids[1]]) + # Add a set of PIDs + analyzer.add_pids(set([pids[0], pids[2]])) + # Add a tuple of PIDs + analyzer.add_pids((pids[1], pids[2])) + + self.assertEqual(PROCESS_LIST, set(pids)) + self.assertEqual(SEEN_LIST, set(pids)) + + + def test_add_pids_invalid_var(self): + """Test add_pids with an invalid type""" + with self.assertRaises(TypeError): + analyzer.add_pids(analyzer.add_pids) diff --git a/conf/default/cuckoo.conf.default b/conf/default/cuckoo.conf.default index a0da4fb527b..bf00fe0c2b7 100644 --- a/conf/default/cuckoo.conf.default +++ b/conf/default/cuckoo.conf.default @@ -204,6 +204,9 @@ critical = 60 # shutting down a vm. Default is 300 seconds. vm_state = 300 +# Will kill VM if: Task time > timeout + critical + stuck_seconds +stuck_seconds = 100 + [tmpfs] # only if you using volatility to speedup IO # mkdir -p /mnt/tmpfs diff --git a/docs/book/src/Issues/Debugging_VM_issues.rst b/docs/book/src/Issues/Debugging_VM_issues.rst new file mode 100644 index 00000000000..e439c9cb620 --- /dev/null +++ b/docs/book/src/Issues/Debugging_VM_issues.rst @@ -0,0 +1,142 @@ +=============================================== +Debugging Stuck Virtual Machines in Cuckoo/CAPE +=============================================== + +This guide outlines a systematic approach to diagnosing and resolving issues where the analysis process hangs (gets "stuck") indefinitely. + +Problem Triage +============== + +When a task appears stuck (e.g., Python process running, VM running, but no activity), determine which component is frozen: + +1. **Guest OS:** Kernel panic or BSOD inside the VM. +2. 
**QEMU Process:** Deadlocked hypervisor. +3. **Python Controller:** Waiting on a socket/pipe that will never return data. + +Phase 1: Immediate Diagnostics +============================== + +Check the CPU state of the processes to identify the bottleneck. + +**1. Find Process IDs (PIDs)** + +.. code-block:: bash + + ps aux | grep qemu + ps aux | grep python + +**2. Check CPU Usage** + +.. code-block:: bash + + top -p + +* **100% CPU:** The process is in a tight loop (livelock). +* **0% CPU (State S - Sleeping):** Waiting for network/socket data. (Most common for Python). +* **0% CPU (State D - Disk Sleep):** Waiting for Hardware I/O. + +Phase 2: Debugging the Python Controller +======================================== + +If Python is sleeping (0% CPU), it is likely blocked on a synchronous call waiting for the Guest. + +**Using py-spy (Recommended)** + +Dumps the Python stack trace without pausing execution. + +.. code-block:: bash + + pip install py-spy + sudo py-spy dump --pid + +**What to look for:** +* ``wait_for_completion`` (Looping/Waiting for Agent) +* ``select``, ``poll``, ``recv`` (Waiting for Network I/O) + +Phase 3: Debugging the QEMU Engine +================================== + +If Python is waiting on QEMU, inspect the VM state. + +**1. Check System Calls** + +See what the process is asking the Linux Kernel to do. + +.. code-block:: bash + + sudo strace -p + +* **futex:** Threading deadlock. +* **ppoll/select:** Normal idle state (waiting for guest interrupt). + +**2. QEMU Monitor (QMP)** + +Query the internal state of the hypervisor. + +.. code-block:: bash + + echo "info status" | socat - UNIX-CONNECT:/tmp/qemu-monitor.sock + +**3. Visual Inspection (Screenshot)** + +If using Libvirt, capture a screenshot of the Guest without stopping it to check for BSODs or Popups. + +.. 
code-block:: bash + + virsh list + virsh screenshot /tmp/debug_screenshot.ppm + +Phase 4: Root Cause Analysis +============================ + +Linking the Python trace to the VM state. + +Scenario A: "The Zombie Success" +-------------------------------- +* **Symptom:** Logs say "Analysis completed successfully", but Python process is still running hours later. +* **Trace:** Python is stuck in ``wait_for_completion`` loop (sleeping). +* **Cause:** The Agent inside the VM died or network was cut *after* sending success, but *before* the Python loop could confirm the shutdown. +* **Fix:** Python enters an infinite retry loop because the error handler just ``continues`` instead of aborting. + +Scenario B: "The Blind Wait" +---------------------------- +* **Symptom:** Python blocked on ``recv`` or ``read``. +* **Cause:** The Agent crashed without closing the TCP socket. Python waits for EOF that never comes. +* **Fix:** Enforce socket timeouts in code. + +Phase 5: Resolution & Code Fixes +================================ + +To prevent future hangs, apply these fixes to ``lib/cuckoo/core/guest.py``. + +**1. Fix the Infinite Loop** +Modify ``wait_for_completion`` to calculate a hard deadline based on configuration. + +.. code-block:: python + + # Calculate Hard Limit + effective_timeout = self.timeout if self.timeout else cfg.timeouts.default + hard_limit = effective_timeout + cfg.timeouts.critical + + while self.do_run: + # Check Hard Limit + if timeit.default_timer() - start > hard_limit: + log.error("Hard Timeout reached! Killing analysis.") + return + +**2. Fix Database Staleness** +Ensure the loop checks the *actual* database state, not cached RAM values (essential for tasks deleted by user). + +.. code-block:: python + + db.session.expire_all() + if db.guest_get_status(self.task_id) is None: + return # Task deleted + +**3. Emergency Cleanup** +If a process is already stuck, kill the zombie pair manually. + +.. 
code-block:: bash + + kill -9 + kill -9 diff --git a/docs/book/src/Issues/index.rst b/docs/book/src/Issues/index.rst new file mode 100644 index 00000000000..dbf9080f9cf --- /dev/null +++ b/docs/book/src/Issues/index.rst @@ -0,0 +1,10 @@ +.. Introduction chapter frontpage + +Issues +============ + +This chapter will have some examples how to debug some common issues. + +.. toctree:: + + Debugging_VM_issues diff --git a/docs/book/src/index.rst b/docs/book/src/index.rst index 41dd3fcc10d..60b6fd8b734 100644 --- a/docs/book/src/index.rst +++ b/docs/book/src/index.rst @@ -36,3 +36,4 @@ Contents integrations/index development/index finalremarks/index + Issues/index diff --git a/lib/cuckoo/core/guest.py b/lib/cuckoo/core/guest.py index effb77a1710..2a3de974af1 100644 --- a/lib/cuckoo/core/guest.py +++ b/lib/cuckoo/core/guest.py @@ -109,17 +109,25 @@ def get(self, method, *args, **kwargs): r = session.get(url, *args, **kwargs) except requests.ConnectionError as e: raise CuckooGuestError( - f"CAPE Agent failed without error status, please try " - f"upgrading to the latest version of agent.py (>= 0.10) and " - f"notify us if the issue persists. Error: {e}" + "CAPE Agent failed without error status, please try " + "upgrading to the latest version of agent.py (>= 0.10) and " + "notify us if the issue persists. 
Error: %s", str(e) ) do_raise and r.raise_for_status() return r def get_status_from_db(self) -> str: + # Force SQLAlchemy to dump its cache and look at the real DB with db.session.begin(): - return db.guest_get_status(self.task_id) + db.session.expire_all() + status = db.guest_get_status(self.task_id) + + # Handle the case where the task was already deleted by race condition + if status is None: + return "deleted" + + return status def set_status_in_db(self, status: str): with db.session.begin(): @@ -136,9 +144,9 @@ def post(self, method, *args, **kwargs): r = session.post(url, *args, **kwargs) except requests.ConnectionError as e: raise CuckooGuestError( - f"CAPE Agent failed without error status, please try " - f"upgrading to the latest version of agent.py (>= 0.10) and " - f"notify us if the issue persists. Error: {e}" + "CAPE Agent failed without error status, please try " + "upgrading to the latest version of agent.py (>= 0.10) and " + "notify us if the issue persists. Error: %s", str(e) ) r.raise_for_status() @@ -356,8 +364,35 @@ def start_analysis(self, options): def wait_for_completion(self): count = 0 start = timeit.default_timer() + current_status = "" + consecutive_failures = 0 + # --- REWORKED: TIMEOUT CALCULATION --- + # 1. Detect None/0: If self.timeout is missing/infinite, use system default. + # This prevents "infinite patience" if the task submission was malformed. + effective_timeout = self.timeout + if not effective_timeout: + effective_timeout = cfg.timeouts.default + + # 2. Add Critical Buffer: This is the "Grace Period" for shutdown/reporting. + # e.g., 200s (analysis) + 60s (critical) = 260s Hard Limit. + hard_limit = effective_timeout + cfg.timeouts.critical + + while self.do_run: + # FORCE REFRESH: Tell SQLAlchemy to expire the cache and fetch fresh data + # Note: Depending on your exact Cuckoo version, the session access might vary. 
+ # This is the generic fix: + try: + # Re-fetch the task status directly from DB logic + current_status = self.get_status_from_db() + except Exception: + # If the task was deleted, this query might fail. + # If it fails, we should ABORT. + log.info("Task #%s: Task deleted or DB error. Aborting.", self.task_id) + return + + if current_status != "running": + break - while self.do_run and self.get_status_from_db() == "running": time.sleep(1) if cfg.cuckoo.machinery_screenshots: @@ -369,24 +404,33 @@ def wait_for_completion(self): if count % 5 == 0: log.debug("Task #%s: Analysis is still running (id=%s, ip=%s)", self.task_id, self.vmid, self.ipaddr) - # If the analysis hits the critical timeout, just return straight - # away and try to recover the analysis results from the guest. - if timeit.default_timer() - start > self.timeout: - log.info("Task #%s: End of analysis reached! (id=%s, ip=%s)", self.task_id, self.vmid, self.ipaddr) + # --- REWORKED: HARD STOP ENFORCEMENT --- + # If we exceed the (Timeout + Critical) limit, we kill the loop. + # This handles the case where the Agent is dead (network timeout) + # and the 'except' block below keeps "continuing" forever. + if timeit.default_timer() - start > hard_limit: + log.error( + "Task #%s: Hard Timeout reached! (Running for %ds, Limit %ds). " + "Agent is likely unresponsive.", + self.task_id, + timeit.default_timer() - start, + hard_limit + ) + self.set_status_in_db("failed") return try: status = self.get("/status", timeout=5).json() except (CuckooGuestError, requests.exceptions.ReadTimeout) as e: - # this might fail due to timeouts or just temporary network - # issues thus we don't want to abort the analysis just yet and - # wait for things to recover - log.warning( - "Task #%s: Virtual Machine %s /status failed. This can indicate the guest losing network connectivity. 
Error: %s", - self.task_id, - self.vmid, - e, - ) + # Add a counter for consecutive failures + consecutive_failures += 1 # You need to init this to 0 outside loop + + log.warning("Task #%s: Agent unreachable (%s/10). Error: %s", self.task_id, consecutive_failures, e) + + # If we fail 10 times in a row (approx 10-15 seconds), give up. + if consecutive_failures > 10: + log.error("Task #%s: Agent is dead. Virtual Machine %s /status failed. This can indicate the guest losing network connectivity. Killing analysis.", self.task_id, self.vmid) + return # Or raise exception continue except Exception as e: log.exception("Task #%s: Virtual machine %s /status failed. %s", self.task_id, self.vmid, e) diff --git a/lib/cuckoo/core/scheduler.py b/lib/cuckoo/core/scheduler.py index f7251bc8c55..3afcd1d0310 100644 --- a/lib/cuckoo/core/scheduler.py +++ b/lib/cuckoo/core/scheduler.py @@ -11,7 +11,9 @@ import sys import threading import time +import traceback from collections import defaultdict +from datetime import datetime from typing import DefaultDict, List, Optional, Tuple from lib.cuckoo.common.cleaners_utils import free_space_monitor @@ -83,6 +85,41 @@ def analysis_finished(self, analysis_manager: AnalysisManager): def do_main_loop_work(self, error_queue: queue.Queue) -> SchedulerCycleDelay: """Return the number of seconds to sleep after returning.""" + # Monitor and kill stuck VMs + with self.analysis_threads_lock: + for analysis in self.analysis_threads: + if not analysis.machine or not analysis.task or not analysis.task.started_on: + continue + + stuck_seconds = self.cfg.timeouts.get("stuck_seconds", 100) + vm_state = self.cfg.timeouts.get("vm_state", 100) + timeout = analysis.task.timeout or self.cfg.timeouts.default + max_runtime = timeout + self.cfg.timeouts.critical + stuck_seconds + vm_state + duration = (datetime.now() - analysis.task.started_on).total_seconds() + if duration > max_runtime: + log.warning( + "Task #%s has been running for %s seconds, which is longer 
than the configured timeout + critical timeout + 100s. Killing VM.", + analysis.task.id, + duration, + ) + + # Log stack trace of the stuck thread + try: + frame = sys._current_frames().get(analysis.ident) + if frame: + stack_trace = "".join(traceback.format_stack(frame)) + log.warning("Stack trace of stuck thread (Task #%s):\n%s", analysis.task.id, stack_trace) + else: + log.warning("Could not retrieve stack trace for thread %s (Task #%s)", analysis.ident, analysis.task.id) + except Exception: + log.exception("Failed to log stack trace for stuck thread (Task #%s)", analysis.task.id) + + try: + if analysis.machinery_manager and analysis.machine: + analysis.machinery_manager.stop_machine(analysis.machine) + except Exception as e: + log.error("Failed to kill stuck VM for task #%s: %s", analysis.task.id, e) + if self.loop_state == LoopState.STOPPING: # This blocks the main loop until the analyses are finished. self.wait_for_running_analyses_to_finish()