4 changes: 2 additions & 2 deletions fms_mo/custom_ext_kernels/utils.py
@@ -517,8 +517,8 @@ def exllama_ops_load_and_reg(qcfg=None, run_unit_test=False):
         need_registration = False
     else:
         need_registration = (
-            available_packages["exllama_kernels"]
-            and available_packages["exllamav2_kernels"]
+            available_packages["gptqmodel_exllama_kernels"]
+            and available_packages["gptqmodel_exllamav2_kernels"]
         )

     if not need_registration:
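For context, the guard above relies on `available_packages` flagging the renamed `gptqmodel_*` kernel packages. A minimal sketch of how such an availability map could be built (an assumption for illustration, not the actual `fms_mo.utils.import_utils` code):

```python
# Hedged sketch: approximate how an available_packages map might be populated.
# find_spec() returns None when a top-level package cannot be imported.
from importlib.util import find_spec

_OPTIONAL_PACKAGES = (
    "gptqmodel",
    "gptqmodel_exllama_kernels",
    "gptqmodel_exllamav2_kernels",
)

available_packages = {name: find_spec(name) is not None for name in _OPTIONAL_PACKAGES}

# Mirrors the registration guard in the hunk above.
need_registration = (
    available_packages["gptqmodel_exllama_kernels"]
    and available_packages["gptqmodel_exllamav2_kernels"]
)
```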
22 changes: 15 additions & 7 deletions fms_mo/modules/linear.py
@@ -1583,7 +1583,13 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
         return x.to(in_dtype)


-try:
+gptq_available = (
+    available_packages["gptqmodel"]
+    and available_packages["gptqmodel_exllama_kernels"]
+    and available_packages["gptqmodel_exllamav2_kernels"]
+)
+
+if gptq_available:
     # Third Party
     from gptqmodel.nn_modules.qlinear.exllama import (
         ExllamaQuantLinear as QLinearExllamaV1,
@@ -1882,12 +1888,6 @@ def forward(self, x, force_cuda=False):
             x.add_(self.bias)
             return x

-except ModuleNotFoundError:
-    logger.warning(
-        "GPTQModel is not properly installed. "
-        "QLinearExv1WI4AF16 and QLinearExv2WI4AF16 wrappers will not be available."
-    )
-

 class LinearFuncFPxFwdBwd(torch.autograd.Function):
     """Linear function using FP24 accumulation, experimental only.
@@ -2355,6 +2355,14 @@ def extra_repr(self) -> str:
 if available_packages["mx"]:
     QLinear_modules += (QLinearMX,)

+if gptq_available:
+    QLinear_modules += (
+        QLinearExllamaV1,
+        QLinearExllamaV2,
+        QLinearExv1WI4AF16,
+        QLinearExv2WI4AF16,
+    )
+

 def isinstance_qlinear(module):
     """
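Since the Exllama wrappers are now appended to `QLinear_modules` only when `gptq_available` is true, `isinstance_qlinear` keeps working regardless of which optional backends are installed. A self-contained sketch of the pattern (the stand-in tuple and toy model below are illustrative, not the repo's actual definitions):

```python
from torch import nn

# Stand-in for the tuple assembled in linear.py; in fms_mo it is extended
# with QLinearExllamaV1/V2 etc. only when the gptqmodel packages import.
QLinear_modules = (nn.Linear,)


def isinstance_qlinear(module):
    """Return True if `module` is one of the registered quantized-linear classes."""
    return isinstance(module, QLinear_modules)


model = nn.Sequential(nn.Linear(16, 16), nn.ReLU(), nn.Linear(16, 4))
num_matches = sum(isinstance_qlinear(m) for m in model.modules())
print(f"{num_matches} matching linear layers")  # -> 2
```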
77 changes: 39 additions & 38 deletions fms_mo/utils/custom_gptq_models.py
@@ -14,41 +14,42 @@

"""Allow users to add new GPTQ classes for their custom models easily."""

# Third Party
from gptqmodel.models.base import BaseGPTQModel


class GraniteGPTQForCausalLM(BaseGPTQModel):
"""Enable Granite for GPTQ."""

layer_type = "GraniteDecoderLayer"
layers_node = "model.layers"
base_modules = ["model.embed_tokens", "model.norm"]
layer_modules = [
["self_attn.k_proj", "self_attn.v_proj", "self_attn.q_proj"],
["self_attn.o_proj"],
["mlp.up_proj", "mlp.gate_proj"],
["mlp.down_proj"],
]


class GraniteMoeGPTQForCausalLM(BaseGPTQModel):
"""Enable Granite MOE for GPTQ."""

layer_type = "GraniteMoeDecoderLayer"
layers_node = "model.layers"
base_modules = ["model.embed_tokens", "model.norm"]
layer_modules = [
["self_attn.k_proj", "self_attn.v_proj", "self_attn.q_proj"],
["self_attn.o_proj"],
["block_sparse_moe.input_linear", "block_sparse_moe.output_linear"],
]


# NOTE: Keys in this table are huggingface config."model_type" (see the corresponding field in
# config.json). Make sure you cover the ones in the model family you want to use, as they may
# not be under the same model_type. See Granite as an example.
custom_gptq_classes = {
# "granite": GraniteGPTQForCausalLM,
"granitemoe": GraniteMoeGPTQForCausalLM,
}
# Local
from fms_mo.utils.import_utils import available_packages

if available_packages["gptqmodel"]:
# Third Party
from gptqmodel.models.base import BaseGPTQModel

class GraniteGPTQForCausalLM(BaseGPTQModel):
"""Enable Granite for GPTQ."""

layer_type = "GraniteDecoderLayer"
layers_node = "model.layers"
base_modules = ["model.embed_tokens", "model.norm"]
layer_modules = [
["self_attn.k_proj", "self_attn.v_proj", "self_attn.q_proj"],
["self_attn.o_proj"],
["mlp.up_proj", "mlp.gate_proj"],
["mlp.down_proj"],
]

class GraniteMoeGPTQForCausalLM(BaseGPTQModel):
"""Enable Granite MOE for GPTQ."""

layer_type = "GraniteMoeDecoderLayer"
layers_node = "model.layers"
base_modules = ["model.embed_tokens", "model.norm"]
layer_modules = [
["self_attn.k_proj", "self_attn.v_proj", "self_attn.q_proj"],
["self_attn.o_proj"],
["block_sparse_moe.input_linear", "block_sparse_moe.output_linear"],
]

# NOTE: Keys in this table are huggingface config."model_type" (see the corresponding field in
# config.json). Make sure you cover the ones in the model family you want to use,
# as they may not be under the same model_type. See Granite as an example.
custom_gptq_classes = {
# "granite": GraniteGPTQForCausalLM,
"granitemoe": GraniteMoeGPTQForCausalLM,
}
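As the NOTE above states, `custom_gptq_classes` is keyed by the HuggingFace `config.model_type`. A hedged sketch of how a caller might use the table (the `resolve_gptq_class` helper is hypothetical, not part of fms_mo):

```python
from transformers import AutoConfig

# custom_gptq_classes is only defined when gptqmodel is installed (see the
# guard above), so this import assumes that environment.
from fms_mo.utils.custom_gptq_models import custom_gptq_classes


def resolve_gptq_class(model_name_or_path: str):
    """Return the custom GPTQ wrapper for this checkpoint, or None to fall
    back to gptqmodel's built-in model map."""
    config = AutoConfig.from_pretrained(model_name_or_path)
    return custom_gptq_classes.get(config.model_type)  # e.g. "granitemoe"
```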