diff --git a/fms_mo/custom_ext_kernels/utils.py b/fms_mo/custom_ext_kernels/utils.py
index 0d5fffb0..5ab78f8b 100644
--- a/fms_mo/custom_ext_kernels/utils.py
+++ b/fms_mo/custom_ext_kernels/utils.py
@@ -517,8 +517,8 @@ def exllama_ops_load_and_reg(qcfg=None, run_unit_test=False):
         need_registration = False
     else:
         need_registration = (
-            available_packages["exllama_kernels"]
-            and available_packages["exllamav2_kernels"]
+            available_packages["gptqmodel_exllama_kernels"]
+            and available_packages["gptqmodel_exllamav2_kernels"]
         )
 
     if not need_registration:
diff --git a/fms_mo/modules/linear.py b/fms_mo/modules/linear.py
index 345187b6..ddb1d14d 100644
--- a/fms_mo/modules/linear.py
+++ b/fms_mo/modules/linear.py
@@ -1583,7 +1583,13 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
         return x.to(in_dtype)
 
 
-try:
+gptq_available = (
+    available_packages["gptqmodel"]
+    and available_packages["gptqmodel_exllama_kernels"]
+    and available_packages["gptqmodel_exllamav2_kernels"]
+)
+
+if gptq_available:
     # Third Party
     from gptqmodel.nn_modules.qlinear.exllama import (
         ExllamaQuantLinear as QLinearExllamaV1,
@@ -1882,12 +1888,6 @@ def forward(self, x, force_cuda=False):
                 x.add_(self.bias)
         return x
 
-except ModuleNotFoundError:
-    logger.warning(
-        "GPTQModel is not properly installed. "
-        "QLinearExv1WI4AF16 and QLinearExv2WI4AF16 wrappers will not be available."
-    )
-
 
 class LinearFuncFPxFwdBwd(torch.autograd.Function):
     """Linear function using FP24 accumulation, experimental only.
@@ -2355,6 +2355,14 @@ def extra_repr(self) -> str:
 if available_packages["mx"]:
     QLinear_modules += (QLinearMX,)
 
+if gptq_available:
+    QLinear_modules += (
+        QLinearExllamaV1,
+        QLinearExllamaV2,
+        QLinearExv1WI4AF16,
+        QLinearExv2WI4AF16,
+    )
+
 
 def isinstance_qlinear(module):
     """
diff --git a/fms_mo/utils/custom_gptq_models.py b/fms_mo/utils/custom_gptq_models.py
index f85ddaa9..f1da4c3e 100644
--- a/fms_mo/utils/custom_gptq_models.py
+++ b/fms_mo/utils/custom_gptq_models.py
@@ -14,41 +14,42 @@
 
 """Allow users to add new GPTQ classes for their custom models easily."""
 
-# Third Party
-from gptqmodel.models.base import BaseGPTQModel
-
-
-class GraniteGPTQForCausalLM(BaseGPTQModel):
-    """Enable Granite for GPTQ."""
-
-    layer_type = "GraniteDecoderLayer"
-    layers_node = "model.layers"
-    base_modules = ["model.embed_tokens", "model.norm"]
-    layer_modules = [
-        ["self_attn.k_proj", "self_attn.v_proj", "self_attn.q_proj"],
-        ["self_attn.o_proj"],
-        ["mlp.up_proj", "mlp.gate_proj"],
-        ["mlp.down_proj"],
-    ]
-
-
-class GraniteMoeGPTQForCausalLM(BaseGPTQModel):
-    """Enable Granite MOE for GPTQ."""
-
-    layer_type = "GraniteMoeDecoderLayer"
-    layers_node = "model.layers"
-    base_modules = ["model.embed_tokens", "model.norm"]
-    layer_modules = [
-        ["self_attn.k_proj", "self_attn.v_proj", "self_attn.q_proj"],
-        ["self_attn.o_proj"],
-        ["block_sparse_moe.input_linear", "block_sparse_moe.output_linear"],
-    ]
-
-
-# NOTE: Keys in this table are huggingface config."model_type" (see the corresponding field in
-# config.json). Make sure you cover the ones in the model family you want to use, as they may
-# not be under the same model_type. See Granite as an example.
-custom_gptq_classes = {
-    # "granite": GraniteGPTQForCausalLM,
-    "granitemoe": GraniteMoeGPTQForCausalLM,
-}
+# Local
+from fms_mo.utils.import_utils import available_packages
+
+if available_packages["gptqmodel"]:
+    # Third Party
+    from gptqmodel.models.base import BaseGPTQModel
+
+    class GraniteGPTQForCausalLM(BaseGPTQModel):
+        """Enable Granite for GPTQ."""
+
+        layer_type = "GraniteDecoderLayer"
+        layers_node = "model.layers"
+        base_modules = ["model.embed_tokens", "model.norm"]
+        layer_modules = [
+            ["self_attn.k_proj", "self_attn.v_proj", "self_attn.q_proj"],
+            ["self_attn.o_proj"],
+            ["mlp.up_proj", "mlp.gate_proj"],
+            ["mlp.down_proj"],
+        ]
+
+    class GraniteMoeGPTQForCausalLM(BaseGPTQModel):
+        """Enable Granite MOE for GPTQ."""
+
+        layer_type = "GraniteMoeDecoderLayer"
+        layers_node = "model.layers"
+        base_modules = ["model.embed_tokens", "model.norm"]
+        layer_modules = [
+            ["self_attn.k_proj", "self_attn.v_proj", "self_attn.q_proj"],
+            ["self_attn.o_proj"],
+            ["block_sparse_moe.input_linear", "block_sparse_moe.output_linear"],
+        ]
+
+    # NOTE: Keys in this table are huggingface config."model_type" (see the corresponding field in
+    # config.json). Make sure you cover the ones in the model family you want to use,
+    # as they may not be under the same model_type. See Granite as an example.
+    custom_gptq_classes = {
+        # "granite": GraniteGPTQForCausalLM,
+        "granitemoe": GraniteMoeGPTQForCausalLM,
+    }