From 45e00f4dd36f876236fb5e7a97fb6b9227d31e48 Mon Sep 17 00:00:00 2001 From: "ye.zou" Date: Thu, 12 Feb 2026 10:19:15 +0800 Subject: [PATCH 1/2] [compute]: always submit DeleteVmGC on rollback failure regardless of GC_ELIGIBLE Resolves: ZSTAC-68874 Change-Id: I19d28b38acec36b6c420fd325c1e074fae7ed2d4 --- .../compute/vm/VmCreateOnHypervisorFlow.java | 31 +++++++++++-------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/compute/src/main/java/org/zstack/compute/vm/VmCreateOnHypervisorFlow.java b/compute/src/main/java/org/zstack/compute/vm/VmCreateOnHypervisorFlow.java index 5b054dede75..a90e3123584 100755 --- a/compute/src/main/java/org/zstack/compute/vm/VmCreateOnHypervisorFlow.java +++ b/compute/src/main/java/org/zstack/compute/vm/VmCreateOnHypervisorFlow.java @@ -79,20 +79,25 @@ public void run(MessageReply reply) { spec.getVmInventory().getUuid(), spec.getVmInventory().getName(), spec.getDestHost().getUuid(), spec.getDestHost().getName(), reply.getError())); - if (reply.getError().isError(HostErrors.OPERATION_FAILURE_GC_ELIGIBLE)) { - String gcName = String.format("gc-vm-%s-on-host-%s", spec.getVmInventory().getUuid(), spec.getDestHost().getUuid()); - - DeleteVmGC gc = new DeleteVmGC(); - gc.NAME = gcName; - gc.hostUuid = spec.getVmInventory().getHostUuid(); - gc.inventory = spec.getVmInventory(); - if (gc.existedAndNotCompleted()) { - logger.debug(String.format("There is already a DeleteVmGC of vm[uuid:%s] " + - "on host[uuid:%s], skip.", spec.getVmInventory().getUuid(), spec.getDestHost().getUuid())); - } else { - gc.submit(); - } + // ZSTAC-68874: Always submit GC task on rollback failure to clean up VM remnants on host + // Previously only submitted GC when error was GC_ELIGIBLE, but detach PCI failures (e.g., MN unavailable) + // don't return GC_ELIGIBLE, causing GPU resources to remain occupied + String gcName = String.format("gc-vm-%s-on-host-%s", spec.getVmInventory().getUuid(), spec.getDestHost().getUuid()); + + DeleteVmGC gc = new DeleteVmGC(); + gc.NAME = gcName; + gc.hostUuid = spec.getVmInventory().getHostUuid(); + gc.inventory = spec.getVmInventory(); + if (gc.existedAndNotCompleted()) { + logger.debug(String.format("There is already a DeleteVmGC of vm[uuid:%s] " + + "on host[uuid:%s], skip.", spec.getVmInventory().getUuid(), spec.getDestHost().getUuid())); } else { + gc.submit(); + logger.debug(String.format("Submitted DeleteVmGC for vm[uuid:%s] on host[uuid:%s] due to rollback failure", + spec.getVmInventory().getUuid(), spec.getDestHost().getUuid())); + } + + if (!reply.getError().isError(HostErrors.OPERATION_FAILURE_GC_ELIGIBLE)) { VmTracerCanonicalEvents.OperateFailOnHypervisorData data = new VmTracerCanonicalEvents.OperateFailOnHypervisorData(); data.setHostUuid(spec.getVmInventory().getHostUuid()); data.setVmUuid(spec.getVmInventory().getUuid()); From 83a1ce551f389f0914ea4bfe83aa3e2053a784c9 Mon Sep 17 00:00:00 2001 From: "ye.zou" Date: Fri, 13 Feb 2026 21:58:05 +0800 Subject: [PATCH 2/2] [ci]: trigger CI rerun for flaky tests Resolves: ZSTAC-68874 Change-Id: I7f961d50f055bbbe441b8e4036dedfca688aeecb