From 1d9a508e7cd5549b89e0455e3d8b61b09ad76fa4 Mon Sep 17 00:00:00 2001
From: Animesh Jain
Date: Thu, 19 Feb 2026 14:56:52 -0800
Subject: [PATCH] [gguf] Convert to plain tensor earlier in dequantize_gguf_tensor

Once dequantize_gguf_tensor fetches the quant_type attribute from the
GGUFParameter tensor subclass, there is no further need to run the
actual dequantize operations on the tensor subclass; we can convert to
a plain tensor right away.

This not only speeds up PyTorch eager execution, but also reduces
torch.compile trace time from 36 seconds to 10 seconds, because there
is a lot less code to trace.
---
 src/diffusers/quantizers/gguf/utils.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/diffusers/quantizers/gguf/utils.py b/src/diffusers/quantizers/gguf/utils.py
index adb429688723..e0ad0e1cce42 100644
--- a/src/diffusers/quantizers/gguf/utils.py
+++ b/src/diffusers/quantizers/gguf/utils.py
@@ -516,6 +516,9 @@ def dequantize_gguf_tensor(tensor):
 
     block_size, type_size = GGML_QUANT_SIZES[quant_type]
 
+    # Convert to a plain tensor to avoid unnecessary __torch_function__ overhead.
+    tensor = tensor.as_tensor()
+
     tensor = tensor.view(torch.uint8)
     shape = _quant_shape_from_byte_shape(tensor.shape, type_size, block_size)
 
@@ -525,7 +528,7 @@
     dequant = dequant_fn(blocks, block_size, type_size)
     dequant = dequant.reshape(shape)
 
-    return dequant.as_tensor()
+    return dequant
 
 
 class GGUFParameter(torch.nn.Parameter):
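
Why this helps: every torch op on a tensor subclass is routed through the
subclass's __torch_function__ handler in Python before the underlying kernel
runs, so unwrapping to a plain tensor once up front removes that per-op cost
from the whole dequantize path. Below is a minimal, self-contained sketch of
the effect; WrappedTensor and many_small_ops are illustrative stand-ins, not
the diffusers GGUFParameter code.

    import time

    import torch


    class WrappedTensor(torch.Tensor):
        # Toy stand-in for a GGUF-style tensor subclass: every torch op on it
        # takes a Python-level detour through __torch_function__.
        @classmethod
        def __torch_function__(cls, func, types, args=(), kwargs=None):
            kwargs = kwargs or {}
            return super().__torch_function__(func, types, args, kwargs)

        def as_tensor(self):
            # Unwrap to a plain torch.Tensor sharing the same storage.
            return self.as_subclass(torch.Tensor)


    def many_small_ops(t):
        # Stand-in for the per-block view/reshape work done during dequantization.
        for _ in range(10_000):
            t = t.view(-1).view(64, 64)
        return t


    wrapped = torch.randn(64, 64).as_subclass(WrappedTensor)

    t0 = time.perf_counter()
    many_small_ops(wrapped)  # dispatches through __torch_function__ on every op
    t1 = time.perf_counter()
    many_small_ops(wrapped.as_tensor())  # unwrapped once up front: no Python dispatch
    t2 = time.perf_counter()

    print(f"subclass: {t1 - t0:.3f}s  plain: {t2 - t1:.3f}s")

The same reasoning explains the torch.compile win: once the subclass is
unwrapped at the top of the function, the tracer no longer has to step through
the __torch_function__ machinery for each of the ops that follow.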