From cba3127c707e7432913b619297047f1e8c02e423 Mon Sep 17 00:00:00 2001 From: Abhishek Date: Tue, 7 Jan 2025 16:23:07 -0500 Subject: [PATCH 01/13] test adddition of granite moe Signed-off-by: Abhishek --- .../gptqmodel/models/__init__.py | 1 + .../gptqmodel/models/_const.py | 1 + .../gptqmodel/models/auto.py | 2 + .../gptqmodel/models/granitemoe.py | 49 +++++++++++++++++++ 4 files changed, 53 insertions(+) create mode 100644 plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/granitemoe.py diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/__init__.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/__init__.py index 72383c1d..6c0e3c65 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/__init__.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/__init__.py @@ -22,6 +22,7 @@ from .gpt_bigcode import GPTBigCodeGPTQ from .gpt_neox import GPTNeoXGPTQ from .granite import GraniteGPTQ +from .granitemoe import GraniteMoeGPTQ from .llama import LlamaGPTQ from .mistral import MistralGPTQ from .mixtral import MixtralGPTQ diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/_const.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/_const.py index 23c4baa3..14e10054 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/_const.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/_const.py @@ -28,6 +28,7 @@ "granite", "gemma", "dbrx_converted", + "granitemoe" ] EXLLAMA_DEFAULT_MAX_INPUT_LENGTH = 2048 diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/auto.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/auto.py index 23a61a87..0b1be8fc 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/auto.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/auto.py @@ -29,6 +29,7 @@ from .gpt_bigcode import GPTBigCodeGPTQ from .gpt_neox import GPTNeoXGPTQ from .granite import GraniteGPTQ +from .granitemoe import GraniteMoeGPTQ from .llama import LlamaGPTQ from .mistral import MistralGPTQ from .mixtral import MixtralGPTQ @@ -43,6 +44,7 @@ "granite": GraniteGPTQ, "dbrx": DbrxGPTQ, "dbrx_converted": DbrxConvertedGPTQ, + "granitemoe": GraniteMoeGPTQ } at_least_one_cuda_v6 = any( diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/granitemoe.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/granitemoe.py new file mode 100644 index 00000000..fe101295 --- /dev/null +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/granitemoe.py @@ -0,0 +1,49 @@ +############################################################################### +# Adapted from https://github.com/ModelCloud/GPTQModel +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+############################################################################### +# Local +from .base import BaseGPTQModel + + +class GraniteMoeGPTQ(BaseGPTQModel): + base_modules = ["model.embed_tokens", "model.norm"] + + layers_node = "model.layers" + layer_type = "GraniteMoeDecoderLayer" + layer_modules = [ + ["self_attn.k_proj", "self_attn.v_proj", "self_attn.q_proj"], + ["self_attn.o_proj"], + [ + "block_sparse_moe.input_linear.experts.0", + "block_sparse_moe.input_linear.experts.1", + "block_sparse_moe.input_linear.experts.2", + "block_sparse_moe.input_linear.experts.3", + "block_sparse_moe.input_linear.experts.4", + "block_sparse_moe.input_linear.experts.5", + "block_sparse_moe.input_linear.experts.6", + "block_sparse_moe.input_linear.experts.7", + ], + [ + "block_sparse_moe.output_linear.experts.0", + "block_sparse_moe.output_linear.experts.1", + "block_sparse_moe.output_linear.experts.2", + "block_sparse_moe.output_linear.experts.3", + "block_sparse_moe.output_linear.experts.4", + "block_sparse_moe.output_linear.experts.5", + "block_sparse_moe.output_linear.experts.6", + "block_sparse_moe.output_linear.experts.7", + ], + ["block_sparse_moe.router.layer"], + ] From a103f5d55190b16da3d74a40a2a886a7c8f79212 Mon Sep 17 00:00:00 2001 From: Abhishek Date: Tue, 7 Jan 2025 16:25:20 -0500 Subject: [PATCH 02/13] test adddition of granite moe Signed-off-by: Abhishek --- .../src/fms_acceleration_peft/gptqmodel/models/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/base.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/base.py index 85c17ee8..b3921ff2 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/base.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/base.py @@ -558,7 +558,7 @@ def save_quantized( self.quantize_config.meta_set_versionable( key=META_FIELD_QUANTIZER, value=META_QUANTIZER_GPTQMODEL, - version=__version__, + version="1.0.0", ) # The config, quantize_config and model may be edited in place in save_quantized. 
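The patches that follow revolve around one structural detail of GraniteMoE: unlike Mixtral-style models, its experts are not exposed as separate nn.Linear modules. GraniteMoeParallelExperts holds a single 3D parameter of shape [num_experts, out_features, in_features], while GPTQ operates on 2D weight matrices. Below is a minimal sketch of the flatten/restore round trip this implies; the sizes are illustrative toy values and the snippet is not part of the patch series.

    # Illustrative only: flatten a 3D expert weight into the 2D layout GPTQ expects,
    # then restore the original [num_experts, out_features, in_features] layout.
    import torch

    num_experts, out_features, in_features = 8, 16, 32  # assumed toy sizes
    w3d = torch.randn(num_experts, out_features, in_features)

    # rows = num_experts * out_features, cols = in_features
    w2d = w3d.reshape(num_experts * out_features, in_features)

    # ... per-column quantization would operate on w2d here ...

    restored = w2d.reshape(num_experts, out_features, in_features)
    assert torch.equal(restored, w3d)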
From 3fe665ec61fb2696edf07bfa0c236b6d759b6e3e Mon Sep 17 00:00:00 2001 From: Abhishek Date: Tue, 7 Jan 2025 18:57:26 -0500 Subject: [PATCH 03/13] test adddition of granite moe Signed-off-by: Abhishek --- .../gptqmodel/models/granitemoe.py | 18 ++---------------- 1 file changed, 2 insertions(+), 16 deletions(-) diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/granitemoe.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/granitemoe.py index fe101295..6423163e 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/granitemoe.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/granitemoe.py @@ -26,24 +26,10 @@ class GraniteMoeGPTQ(BaseGPTQModel): ["self_attn.k_proj", "self_attn.v_proj", "self_attn.q_proj"], ["self_attn.o_proj"], [ - "block_sparse_moe.input_linear.experts.0", - "block_sparse_moe.input_linear.experts.1", - "block_sparse_moe.input_linear.experts.2", - "block_sparse_moe.input_linear.experts.3", - "block_sparse_moe.input_linear.experts.4", - "block_sparse_moe.input_linear.experts.5", - "block_sparse_moe.input_linear.experts.6", - "block_sparse_moe.input_linear.experts.7", + "block_sparse_moe.input_linear", ], [ - "block_sparse_moe.output_linear.experts.0", - "block_sparse_moe.output_linear.experts.1", - "block_sparse_moe.output_linear.experts.2", - "block_sparse_moe.output_linear.experts.3", - "block_sparse_moe.output_linear.experts.4", - "block_sparse_moe.output_linear.experts.5", - "block_sparse_moe.output_linear.experts.6", - "block_sparse_moe.output_linear.experts.7", + "block_sparse_moe.output_linear", ], ["block_sparse_moe.router.layer"], ] From 775e9fb5e2b19c227c1df903f3632cb9e8d07c4d Mon Sep 17 00:00:00 2001 From: Abhishek Date: Tue, 7 Jan 2025 20:15:27 -0500 Subject: [PATCH 04/13] test adddition of granite moe Signed-off-by: Abhishek --- .../fms_acceleration_peft/gptqmodel/models/granitemoe.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/granitemoe.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/granitemoe.py index 6423163e..a15318c3 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/granitemoe.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/granitemoe.py @@ -25,11 +25,6 @@ class GraniteMoeGPTQ(BaseGPTQModel): layer_modules = [ ["self_attn.k_proj", "self_attn.v_proj", "self_attn.q_proj"], ["self_attn.o_proj"], - [ - "block_sparse_moe.input_linear", - ], - [ - "block_sparse_moe.output_linear", - ], - ["block_sparse_moe.router.layer"], + ["block_sparse_moe.input_linear", "block_sparse_moe.output_linear", "block_sparse_moe.router.layer"], + ["input_layernorm", "post_attention_layernorm"] ] From 56dfbefccaff023ca9626002a8d61482a437e2f6 Mon Sep 17 00:00:00 2001 From: Abhishek Date: Tue, 7 Jan 2025 20:55:21 -0500 Subject: [PATCH 05/13] test adddition of granite moe Signed-off-by: Abhishek --- .../src/fms_acceleration_peft/gptqmodel/models/granitemoe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/granitemoe.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/granitemoe.py index a15318c3..6552376a 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/granitemoe.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/granitemoe.py @@ -25,6 
+25,6 @@ class GraniteMoeGPTQ(BaseGPTQModel): layer_modules = [ ["self_attn.k_proj", "self_attn.v_proj", "self_attn.q_proj"], ["self_attn.o_proj"], - ["block_sparse_moe.input_linear", "block_sparse_moe.output_linear", "block_sparse_moe.router.layer"], + ["block_sparse_moe.input_linear", "block_sparse_moe.output_linear"], ["input_layernorm", "post_attention_layernorm"] ] From c23816b2446aeb83c98ffa6e1cc65fb95b2ff2c6 Mon Sep 17 00:00:00 2001 From: Abhishek Date: Thu, 23 Jan 2025 23:23:41 -0500 Subject: [PATCH 06/13] Merge and test Signed-off-by: Abhishek --- .../framework_plugin_scattermoe.py | 35 ++++++++++--------- plugins/accelerated-peft/pyproject.toml | 2 +- .../framework_plugin_autogptq.py | 2 +- .../framework_plugin_bnb.py | 2 +- .../gptqmodel/quantization/gptq.py | 22 ++++++++++++ .../gptqmodel/utils/model.py | 6 ++++ .../tests/test_peft_plugins.py | 8 ++--- .../pyproject.toml | 2 +- .../framework_plugin_multipack.py | 2 +- .../framework_plugin_padding_free.py | 2 +- plugins/framework/README.md | 2 +- plugins/framework/pyproject.toml | 2 +- .../src/fms_acceleration/framework.py | 8 ++--- .../src/fms_acceleration/framework_plugin.py | 2 +- .../src/fms_acceleration/utils/test_utils.py | 10 +++--- plugins/framework/tests/test_framework.py | 34 +++++++++--------- plugins/fused-ops-and-kernels/pyproject.toml | 2 +- .../framework_plugin_fast_kernels.py | 2 +- .../tests/test_foak_plugins.py | 2 +- 19 files changed, 88 insertions(+), 59 deletions(-) diff --git a/plugins/accelerated-moe/src/fms_acceleration_moe/framework_plugin_scattermoe.py b/plugins/accelerated-moe/src/fms_acceleration_moe/framework_plugin_scattermoe.py index 148a5488..528693ea 100644 --- a/plugins/accelerated-moe/src/fms_acceleration_moe/framework_plugin_scattermoe.py +++ b/plugins/accelerated-moe/src/fms_acceleration_moe/framework_plugin_scattermoe.py @@ -13,11 +13,12 @@ # limitations under the License. # Standard -from typing import Dict +from typing import Dict, Tuple # Third Party from fms_acceleration import AccelerationPlugin -from transformers import AutoModelForCausalLM +from peft import LoraConfig +from transformers import TrainingArguments import torch # Local @@ -52,21 +53,27 @@ def __init__(self, configurations: Dict[str, Dict]): ) @property - def requires_custom_loading(self): + def requires_augmentation(self): return True - def model_loader(self, model_name: str, **kwargs): - - # load the model - model = AutoModelForCausalLM.from_pretrained(model_name, **kwargs) - + def augmentation( + self, + model, + train_args: TrainingArguments, + modifiable_args: Tuple[LoraConfig], + ): rank, world_size = 0, 1 if torch.distributed.is_initialized(): world_size = torch.distributed.get_world_size() rank = torch.distributed.get_rank() - # shard the MOE, and store the component names, eventually needed - # to configure the FSDP + if not hasattr(model.config, "name_or_path") or not model.config.name_or_path: + raise ValueError( + "The model configuration is missing the 'name_or_path' attribute." + ) + + model_name = model.config.name_or_path + self._moe_component_module_names = prepare_scattermoe( model, checkpoint_name_or_path=model_name, @@ -75,13 +82,7 @@ def model_loader(self, model_name: str, **kwargs): ep_degree=self._ep_degree, mixed_precision=False, # Currently this is hardcoded to OFF ) - - # NOTE: there is currently no good way to get the mixed precision - # flag from train_args. It will be better to handle this if - # when we move the sharding to augmentation. 
- # https://github.com/foundation-model-stack/fms-acceleration/issues/103 - - return model + return model, modifiable_args def get_callbacks_and_ready_for_train( self, model: torch.nn.Module = None, accelerator=None diff --git a/plugins/accelerated-peft/pyproject.toml b/plugins/accelerated-peft/pyproject.toml index e6e545c7..a7dc4464 100644 --- a/plugins/accelerated-peft/pyproject.toml +++ b/plugins/accelerated-peft/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "fms-acceleration-peft" -version = '0.3.5.dev' +version = '0.4.0.dev' description = "FMS Acceleration for PeFT" authors = [ {name = "Fabian Lim", email = "flim@sg.ibm.com"}, diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/framework_plugin_autogptq.py b/plugins/accelerated-peft/src/fms_acceleration_peft/framework_plugin_autogptq.py index 41ea2d6f..99568a94 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/framework_plugin_autogptq.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/framework_plugin_autogptq.py @@ -241,7 +241,7 @@ def requires_custom_loading(self): return True @property - def requires_agumentation(self): + def requires_augmentation(self): return True def augmentation( diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/framework_plugin_bnb.py b/plugins/accelerated-peft/src/fms_acceleration_peft/framework_plugin_bnb.py index b7202add..1900002d 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/framework_plugin_bnb.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/framework_plugin_bnb.py @@ -185,7 +185,7 @@ def requires_custom_loading(self): return True @property - def requires_agumentation(self): + def requires_augmentation(self): # will skip the augmentation if _no_peft_model == True return not self._no_peft_model diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/quantization/gptq.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/quantization/gptq.py index 470ed3fb..70e1d505 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/quantization/gptq.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/quantization/gptq.py @@ -30,6 +30,19 @@ def __init__(self, layer): W = W.flatten(1) if isinstance(self.layer, transformers.pytorch_utils.Conv1D): W = W.t() + + # Suppose your layer weight is [num_experts, out_features, in_features]. + original_shape = layer.weight.shape # e.g. 
(num_experts, out_features, in_features) + + if len(original_shape) == 3: + # Flatten to 2D so GPTQ can treat it as rows × cols + # rows = num_experts * out_features, cols = in_features + W = W.reshape(original_shape[0] * original_shape[1], original_shape[2]) + self._is_3d = True + self._original_shape = original_shape + else: + self._is_3d = False # 2D + self.rows = W.shape[0] self.columns = W.shape[1] self.H = torch.zeros((self.columns, self.columns), device=self.dev) @@ -196,6 +209,15 @@ def fasterquant( self.layer.weight.data = Q.reshape(self.layer.weight.shape).type_as( self.layer.weight.data ) + + # Q is 2D after the Cholesky-based quantization step + if self._is_3d: + # Reshape Q back to [num_experts, out_features, in_features] + Q = Q.reshape(self._original_shape) + + # Now assign it back to the parameter + self.layer.weight.data = Q.type_as(self.layer.weight.data) + if os.environ.get("DEBUG"): logger.debug(torch.sum((self.layer(self.inp1) - self.out1) ** 2)) diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/model.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/model.py index d51e0e60..d5bcc15c 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/model.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/model.py @@ -106,6 +106,12 @@ def find_layers(module, layers=None, name=""): for layer in layers: if isinstance(module, layer): return {name: module} + + # ADD FOR module GraniteMoeParallelExperts: https://github.com/huggingface/transformers/blob/b5aaf875090388e2bbdbf2d8641ed7967365f435/src/transformers/models/granitemoe/modeling_granitemoe.py#L258C7-L258C32 + if hasattr(module, "weight") and isinstance(module.weight, torch.nn.Parameter): + if module.weight.ndim == 3: + return {name: module} + res = {} for name1, child in module.named_children(): res.update( diff --git a/plugins/accelerated-peft/tests/test_peft_plugins.py b/plugins/accelerated-peft/tests/test_peft_plugins.py index 38534d5d..d36b3ce8 100644 --- a/plugins/accelerated-peft/tests/test_peft_plugins.py +++ b/plugins/accelerated-peft/tests/test_peft_plugins.py @@ -54,7 +54,7 @@ def test_configure_gptq_plugin(): # check flags and callbacks assert framework.requires_custom_loading - assert framework.requires_agumentation + assert framework.requires_augmentation assert len(framework.get_callbacks_and_ready_for_train()) == 0 # attempt to activate plugin with configuration pointing to wrong path @@ -171,7 +171,7 @@ def test_configure_bnb_plugin(): # check flags and callbacks assert framework.requires_custom_loading - assert framework.requires_agumentation + assert framework.requires_augmentation assert len(framework.get_callbacks_and_ready_for_train()) == 0 # test valid combinatinos @@ -187,7 +187,7 @@ def test_configure_bnb_plugin(): ): # check flags and callbacks assert framework.requires_custom_loading - assert framework.requires_agumentation + assert framework.requires_augmentation assert len(framework.get_callbacks_and_ready_for_train()) == 0 # test no_peft_model is true skips plugin.augmentation @@ -202,7 +202,7 @@ def test_configure_bnb_plugin(): require_packages_check=False, ): # check flags and callbacks - assert (not correct_value) == framework.requires_agumentation + assert (not correct_value) == framework.requires_augmentation # attempt to activate plugin with configuration pointing to wrong path # - raise with message that no plugins can be configured diff --git 
a/plugins/attention-and-distributed-packing/pyproject.toml b/plugins/attention-and-distributed-packing/pyproject.toml index fdbb3ac1..e755ac56 100644 --- a/plugins/attention-and-distributed-packing/pyproject.toml +++ b/plugins/attention-and-distributed-packing/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "fms-acceleration-aadp" -version = '0.1.1.dev' +version = '0.2.0.dev' description = "FMS Acceleration Plugin for Attention and Distributed Packing Optimizations" authors = [ {name = "Fabian Lim", email = "flim@sg.ibm.com"}, diff --git a/plugins/attention-and-distributed-packing/src/fms_acceleration_aadp/framework_plugin_multipack.py b/plugins/attention-and-distributed-packing/src/fms_acceleration_aadp/framework_plugin_multipack.py index aa9134a6..391743c6 100644 --- a/plugins/attention-and-distributed-packing/src/fms_acceleration_aadp/framework_plugin_multipack.py +++ b/plugins/attention-and-distributed-packing/src/fms_acceleration_aadp/framework_plugin_multipack.py @@ -61,7 +61,7 @@ def __init__( assert self._pad_token_id is not None, "need to get pad token id" @property - def requires_agumentation(self): + def requires_augmentation(self): return True def augmentation( diff --git a/plugins/attention-and-distributed-packing/src/fms_acceleration_aadp/framework_plugin_padding_free.py b/plugins/attention-and-distributed-packing/src/fms_acceleration_aadp/framework_plugin_padding_free.py index 0e4e5ef9..596b5600 100644 --- a/plugins/attention-and-distributed-packing/src/fms_acceleration_aadp/framework_plugin_padding_free.py +++ b/plugins/attention-and-distributed-packing/src/fms_acceleration_aadp/framework_plugin_padding_free.py @@ -41,7 +41,7 @@ def __init__(self, configurations: Dict[str, Dict]): ) @property - def requires_agumentation(self): + def requires_augmentation(self): return True def augmentation( diff --git a/plugins/framework/README.md b/plugins/framework/README.md index 5b3cbfd7..4895f322 100644 --- a/plugins/framework/README.md +++ b/plugins/framework/README.md @@ -45,7 +45,7 @@ model, (peft_config,) = framework.augmentation( ) ``` -We also provide `framework.requires_agumentation` to check if augumentation is required by the plugins. +We also provide `framework.requires_augmentation` to check if augumentation is required by the plugins. Finally pass the model to train: diff --git a/plugins/framework/pyproject.toml b/plugins/framework/pyproject.toml index c57e5f01..bb481c50 100644 --- a/plugins/framework/pyproject.toml +++ b/plugins/framework/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "fms-acceleration" -version = '0.5.0.dev' +version = '0.6.0.dev' description = "FMS Acceleration Plugin Framework" authors = [ {name = "Fabian Lim", email = "flim@sg.ibm.com"}, diff --git a/plugins/framework/src/fms_acceleration/framework.py b/plugins/framework/src/fms_acceleration/framework.py index 3a393815..75b436c9 100644 --- a/plugins/framework/src/fms_acceleration/framework.py +++ b/plugins/framework/src/fms_acceleration/framework.py @@ -199,10 +199,10 @@ def augmentation( x in model_archs for x in plugin.restricted_model_archs ): raise ValueError( - f"Model architectures in '{model_archs}' are supported for '{plugin_name}'." + f"Model architectures in '{model_archs}' are not supported for '{plugin_name}'." 
) - if plugin.requires_agumentation: + if plugin.requires_augmentation: model, modifiable_args = plugin.augmentation( model, train_args, modifiable_args=modifiable_args ) @@ -214,8 +214,8 @@ def requires_custom_loading(self): return len(self.plugins_require_custom_loading) > 0 @property - def requires_agumentation(self): - return any(x.requires_agumentation for _, x in self.active_plugins) + def requires_augmentation(self): + return any(x.requires_augmentation for _, x in self.active_plugins) def get_callbacks_and_ready_for_train( self, model: torch.nn.Module = None, accelerator: Accelerator = None diff --git a/plugins/framework/src/fms_acceleration/framework_plugin.py b/plugins/framework/src/fms_acceleration/framework_plugin.py index 28fecebf..94ea4ffa 100644 --- a/plugins/framework/src/fms_acceleration/framework_plugin.py +++ b/plugins/framework/src/fms_acceleration/framework_plugin.py @@ -171,7 +171,7 @@ def requires_custom_loading(self): return False @property - def requires_agumentation(self): + def requires_augmentation(self): return False def model_loader(self, model_name: str, **kwargs): diff --git a/plugins/framework/src/fms_acceleration/utils/test_utils.py b/plugins/framework/src/fms_acceleration/utils/test_utils.py index b1f731d1..6a3bc123 100644 --- a/plugins/framework/src/fms_acceleration/utils/test_utils.py +++ b/plugins/framework/src/fms_acceleration/utils/test_utils.py @@ -159,8 +159,8 @@ def create_plugin_cls( restricted_models: Set = None, require_pkgs: Set = None, requires_custom_loading: bool = False, - requires_agumentation: bool = False, - agumentation: Callable = None, + requires_augmentation: bool = False, + augmentation: Callable = None, model_loader: Callable = None, ): "helper function to create plugin class" @@ -174,11 +174,11 @@ def create_plugin_cls( "restricted_model_archs": restricted_models, "require_packages": require_pkgs, "requires_custom_loading": requires_custom_loading, - "requires_agumentation": requires_agumentation, + "requires_augmentation": requires_augmentation, } - if agumentation is not None: - attributes["augmentation"] = agumentation + if augmentation is not None: + attributes["augmentation"] = augmentation if model_loader is not None: attributes["model_loader"] = model_loader diff --git a/plugins/framework/tests/test_framework.py b/plugins/framework/tests/test_framework.py index 4fd43eb2..b3f4eb9e 100644 --- a/plugins/framework/tests/test_framework.py +++ b/plugins/framework/tests/test_framework.py @@ -68,7 +68,7 @@ def test_model_with_no_config_raises(): # create model and (incomplete) plugin with requires_augmentation = True model_no_config = torch.nn.Module() # empty model - incomplete_plugin = create_plugin_cls(requires_agumentation=True) + incomplete_plugin = create_plugin_cls(requires_augmentation=True) # register and activate 1 incomplete plugin, and: # 1. test correct plugin registration and activation. 
@@ -104,13 +104,13 @@ def test_single_plugin(): empty_plugin = create_plugin_cls() incomplete_plugin = create_plugin_cls( restricted_models={"CausalLM"}, - requires_agumentation=True, + requires_augmentation=True, ) plugin = create_plugin_cls( restricted_models={"CausalLM"}, - requires_agumentation=True, + requires_augmentation=True, requires_custom_loading=True, - agumentation=dummy_augmentation, + augmentation=dummy_augmentation, model_loader=dummy_custom_loader, ) train_args = None # dummy for now @@ -175,32 +175,32 @@ def test_two_plugins(): model = create_noop_model_with_archs(archs=["CausalLM"]) incomp_plugin1 = create_plugin_cls( - restricted_models={"CausalLM"}, requires_agumentation=True + restricted_models={"CausalLM"}, requires_augmentation=True ) - incomp_plugin2 = create_plugin_cls(requires_agumentation=True) + incomp_plugin2 = create_plugin_cls(requires_augmentation=True) incomp_plugin3 = create_plugin_cls( - class_name="PluginNoop2", requires_agumentation=True + class_name="PluginNoop2", requires_augmentation=True ) plugin1 = create_plugin_cls( restricted_models={"CausalLM"}, - requires_agumentation=True, + requires_augmentation=True, requires_custom_loading=True, - agumentation=dummy_augmentation, + augmentation=dummy_augmentation, model_loader=dummy_custom_loader, ) plugin2 = create_plugin_cls( class_name="PluginNoop2", restricted_models={"CausalLM"}, - requires_agumentation=True, + requires_augmentation=True, requires_custom_loading=True, - agumentation=dummy_augmentation, + augmentation=dummy_augmentation, model_loader=dummy_custom_loader, ) plugin3_no_loader = create_plugin_cls( class_name="PluginNoop2", restricted_models={"CausalLM"}, - requires_agumentation=True, - agumentation=dummy_augmentation, + requires_augmentation=True, + augmentation=dummy_augmentation, ) train_args = None # dummy for now @@ -299,8 +299,8 @@ def _hook( for class_name in ["PluginDEF", "PluginABC"]: plugin = create_plugin_cls( class_name=class_name, - requires_agumentation=True, - agumentation=hook_builder(act_order=plugin_activation_order), + requires_augmentation=True, + augmentation=hook_builder(act_order=plugin_activation_order), ) plugins_to_be_installed.append((class_name, plugin)) @@ -319,8 +319,8 @@ def test_plugin_registration_combination_logic(): plugin = create_plugin_cls( restricted_models={"CausalLM"}, - requires_agumentation=True, - agumentation=dummy_augmentation, + requires_augmentation=True, + augmentation=dummy_augmentation, ) configuration_contents = {"existing1": {"key1": 1}, "existing2": {"key1": 1}} diff --git a/plugins/fused-ops-and-kernels/pyproject.toml b/plugins/fused-ops-and-kernels/pyproject.toml index 5a003712..5bb23e18 100644 --- a/plugins/fused-ops-and-kernels/pyproject.toml +++ b/plugins/fused-ops-and-kernels/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "fms-acceleration-foak" -version = '0.4.0.dev' +version = '0.5.0.dev' description = "FMS Acceleration using Fused Operations and Kernels" authors = [ {name = "Fabian Lim", email = "flim@sg.ibm.com"}, diff --git a/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/framework_plugin_fast_kernels.py b/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/framework_plugin_fast_kernels.py index df21fd5c..0d7ce802 100644 --- a/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/framework_plugin_fast_kernels.py +++ b/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/framework_plugin_fast_kernels.py @@ -128,7 +128,7 @@ def __init__(self, configurations: Dict[str, Dict]): 
) @property - def requires_agumentation(self): + def requires_augmentation(self): return True def augmentation( diff --git a/plugins/fused-ops-and-kernels/tests/test_foak_plugins.py b/plugins/fused-ops-and-kernels/tests/test_foak_plugins.py index 11e91ff6..9d1c0c97 100644 --- a/plugins/fused-ops-and-kernels/tests/test_foak_plugins.py +++ b/plugins/fused-ops-and-kernels/tests/test_foak_plugins.py @@ -47,7 +47,7 @@ def test_configure_gptq_foak_plugin(): # check flags and callbacks assert framework.requires_custom_loading is False - assert framework.requires_agumentation + assert framework.requires_augmentation assert len(framework.get_callbacks_and_ready_for_train()) == 0 # attempt to activate plugin with configuration pointing to wrong path From cff9a59c74f4b4a6873f8e77d29d14688f8351ae Mon Sep 17 00:00:00 2001 From: Abhishek Date: Mon, 27 Jan 2025 22:29:29 -0500 Subject: [PATCH 07/13] Changes for __init__ and find_layers Signed-off-by: Abhishek --- .../gptqmodel/quantization/gptq.py | 436 ++++++++++-------- .../gptqmodel/utils/model.py | 17 +- 2 files changed, 263 insertions(+), 190 deletions(-) diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/quantization/gptq.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/quantization/gptq.py index 70e1d505..0ac9e203 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/quantization/gptq.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/quantization/gptq.py @@ -11,6 +11,7 @@ import torch import torch.nn as nn import transformers +import transformers.models.granitemoe.modeling_granitemoe as MOE # Local from .quantizer import Quantizer @@ -25,59 +26,112 @@ class GPTQ: def __init__(self, layer): self.layer = layer self.dev = self.layer.weight.device - W = layer.weight.data.clone() - if isinstance(self.layer, nn.Conv2d): - W = W.flatten(1) - if isinstance(self.layer, transformers.pytorch_utils.Conv1D): - W = W.t() - - # Suppose your layer weight is [num_experts, out_features, in_features]. - original_shape = layer.weight.shape # e.g. 
(num_experts, out_features, in_features) - - if len(original_shape) == 3: - # Flatten to 2D so GPTQ can treat it as rows × cols - # rows = num_experts * out_features, cols = in_features - W = W.reshape(original_shape[0] * original_shape[1], original_shape[2]) - self._is_3d = True - self._original_shape = original_shape - else: - self._is_3d = False # 2D - - self.rows = W.shape[0] - self.columns = W.shape[1] - self.H = torch.zeros((self.columns, self.columns), device=self.dev) - self.nsamples = 0 self.quantizer = Quantizer() + # print("GPTQ INIT: layer, type(layer)", layer, type(layer)) + # print("GPTQ INIT: layer.weight.data", layer.weight.data) + + + if isinstance(layer, MOE.GraniteMoeParallelExperts): + self.is_moe = True + self.num_experts = layer.num_experts + self.out_features = layer.output_size + self.in_features = layer.input_size + # print("GPTQ INIT: self.num_experts, self.out_features, self.in_features", self.num_experts, self.out_features, self.in_features) + + # Separate W for each expert + self.W_list, self.H_list, self.nsamples_list = [], [], [] + for i in range(self.num_experts): + # Each expert slice is of shape [out_features, in_features] + self.W_list.append(layer.weight.data[i].clone()) + + # For each expert param, we have a Hessian and sample count + self.H_list.append(torch.zeros((self.in_features, self.in_features), device=self.dev)) + self.nsamples_list.append(0) + + # print("GPTQ INIT: self.W_list, len(self.W_list), type(self.W_list[0]), self.W_list[0].size(), self.W_list[0].dim()", self.W_list, len(self.W_list), type(self.W_list[0]), self.W_list[0].size(), self.W_list[0].dim()) + # print("GPTQ INIT: self.H_list, len(self.H_list), type(self.H_list[0]), self.H_list[0].size(), self.H_list[0].dim()", self.H_list, len(self.H_list), type(self.H_list[0]), self.H_list[0].size(), self.H_list[0].dim()) + + else: + # For 2D layer (linear, conv, etc.), we have a single Hessian and sample count + self.is_moe = False + W = layer.weight.data.clone() + if isinstance(layer, nn.Conv2d): + W = W.flatten(1) + if isinstance(layer, transformers.pytorch_utils.Conv1D): + W = W.t() + # print("GPTQ INIT: W, type(W), W.size(), W.dim()", W, type(W), W.size(), W.dim()) + + self.rows = W.shape[0] + self.columns = W.shape[1] + self.H = torch.zeros((self.columns, self.columns), device=self.dev) + self.nsamples = 0 + # print("GPTQ INIT: self.H, type(self.H), self.H.size(), self.H.dim()", self.H, type(self.H), self.H.size(), self.H.dim()) + # print("GPTQ INIT: self.rows, self.columns", self.rows, self.columns) + def add_batch(self, inp, out): if os.environ.get("DEBUG"): self.inp1 = inp self.out1 = out - if len(inp.shape) == 2: - inp = inp.unsqueeze(0) - tmp = inp.shape[0] - if isinstance(self.layer, nn.Linear) or isinstance( - self.layer, transformers.Conv1D - ): - if len(inp.shape) == 3: - inp = inp.reshape((-1, inp.shape[-1])) - inp = inp.t() - if isinstance(self.layer, nn.Conv2d): - unfold = nn.Unfold( - self.layer.kernel_size, - dilation=self.layer.dilation, - padding=self.layer.padding, - stride=self.layer.stride, - ) - inp = unfold(inp) - inp = inp.permute([1, 0, 2]) - inp = inp.flatten(1) - self.H *= self.nsamples / (self.nsamples + tmp) - self.nsamples += tmp - # inp = inp.float() - inp = math.sqrt(2 / self.nsamples) * inp.float() - # self.H += 2 / self.nsamples * inp.matmul(inp.t()) - self.H += inp.matmul(inp.t()) + # Update entire H_list and nsamples_list + if self.is_moe: + # print("INSIDE ADD_BATCH FOR MOE: inp, type(inp), inp.shape", inp, type(inp), inp.shape) + for expert_idx in 
range(self.num_experts): + H = self.H_list[expert_idx] + nsamples = self.nsamples_list[expert_idx] + + # if len(inp.shape) == 2: + # inp = inp.unsqueeze(0) + # tmp = inp.shape[0] + # print("INSIDE ADD_BATCH FOR MOE 2: inp, inp.shape, tmp", inp, inp.shape, tmp) + + # Below is doing reverse of above + # if len(inp.shape) == 3: + # inp = inp.reshape((-1, inp.shape[-1])) + # print("INSIDE ADD_BATCH FOR MOE 3: inp, inp.shape, tmp", inp, inp.shape, tmp) + + tmp = 1 + # len(inp.shape) == 2 in this case + mod_inp = inp.t() + # print("INSIDE ADD_BATCH FOR MOE 4: inp, inp.shape, tmp", inp, inp.shape, tmp) + + H *= nsamples / (nsamples + tmp) + nsamples += tmp + mod_inp = math.sqrt(2 / nsamples) * mod_inp.float() + H += mod_inp.matmul(mod_inp.t()) + + self.H_list[expert_idx] = H + self.nsamples_list[expert_idx] = nsamples + else: + # print("INSIDE ADD_BATCH FOR 2D") + # print("INSIDE ADD_BATCH 1: inp", inp, type(inp), inp.shape) + if len(inp.shape) == 2: + inp = inp.unsqueeze(0) + tmp = inp.shape[0] + if isinstance(self.layer, nn.Linear) or isinstance( + self.layer, transformers.Conv1D + ): + if len(inp.shape) == 3: + inp = inp.reshape((-1, inp.shape[-1])) + inp = inp.t() + if isinstance(self.layer, nn.Conv2d): + unfold = nn.Unfold( + self.layer.kernel_size, + dilation=self.layer.dilation, + padding=self.layer.padding, + stride=self.layer.stride, + ) + inp = unfold(inp) + inp = inp.permute([1, 0, 2]) + inp = inp.flatten(1) + # print("INSIDE ADD_BATCH 2: BEFORE tmp, self.H, self.nsamples", tmp, self.H, self.nsamples) + self.H *= self.nsamples / (self.nsamples + tmp) + self.nsamples += tmp + # print("INSIDE ADD_BATCH 3: AFTER tmp, self.H, self.nsamples", tmp, self.H, self.nsamples) + # inp = inp.float() + inp = math.sqrt(2 / self.nsamples) * inp.float() + # self.H += 2 / self.nsamples * inp.matmul(inp.t()) + self.H += inp.matmul(inp.t()) def fasterquant( self, @@ -87,146 +141,162 @@ def fasterquant( actorder=False, static_groups=False, ): - W = self.layer.weight.data.clone() - if isinstance(self.layer, nn.Conv2d): - W = W.flatten(1) - if isinstance(self.layer, transformers.Conv1D): - W = W.t() - W = W.float() - - tick = time.time() - - if not self.quantizer.ready(): - self.quantizer.find_params(W, weight=True) - - H = self.H - del self.H - dead = torch.diag(H) == 0 - H[dead, dead] = 1 - W[:, dead] = 0 - - g_idx = [] - scale = [] - zero = [] - now_idx = 1 - - if static_groups: - # Standard - import copy - - groups = [] - for i in range(0, self.columns, group_size): - quantizer = copy.deepcopy(self.quantizer) - quantizer.find_params(W[:, i : (i + group_size)], weight=True) - scale.append(quantizer.scale) - zero.append(quantizer.zero) - groups.append(quantizer) - - if actorder: - perm = torch.argsort(torch.diag(H), descending=True) - W = W[:, perm] - H = H[perm][:, perm] - invperm = torch.argsort(perm) - - Losses = torch.zeros_like(W) - Q = torch.zeros_like(W) - - damp = percdamp * torch.mean(torch.diag(H)) - diag = torch.arange(self.columns, device=self.dev) - H[diag, diag] += damp - H = torch.linalg.cholesky(H) - H = torch.cholesky_inverse(H) - H = torch.linalg.cholesky(H, upper=True) - Hinv = H - - for i1 in range(0, self.columns, blocksize): - i2 = min(i1 + blocksize, self.columns) - count = i2 - i1 - - W1 = W[:, i1:i2].clone() - Q1 = torch.zeros_like(W1) - Err1 = torch.zeros_like(W1) - Losses1 = torch.zeros_like(W1) - Hinv1 = Hinv[i1:i2, i1:i2] - - for i in range(count): - w = W1[:, i] - d = Hinv1[i, i] - - if group_size != -1: - if not static_groups: - if (i1 + i) % group_size == 0: - 
self.quantizer.find_params( - W[:, (i1 + i) : (i1 + i + group_size)], weight=True - ) - - if ((i1 + i) // group_size) - now_idx == -1: - scale.append(self.quantizer.scale) - zero.append(self.quantizer.zero) - now_idx += 1 - else: - idx = i1 + i - if actorder: - idx = perm[idx] - self.quantizer = groups[idx // group_size] - - q = self.quantizer.quantize(w.unsqueeze(1)).flatten() - Q1[:, i] = q - Losses1[:, i] = (w - q) ** 2 / d**2 - - err1 = (w - q) / d - W1[:, i:] -= err1.unsqueeze(1).matmul(Hinv1[i, i:].unsqueeze(0)) - Err1[:, i] = err1 - - Q[:, i1:i2] = Q1 - Losses[:, i1:i2] = Losses1 / 2 - - W[:, i2:] -= Err1.matmul(Hinv[i1:i2, i2:]) + if self.is_moe: + # For MoE model + # Loop over each expert param and quantize it separately + t_start = time.time() + scale_list = [] + zero_list = [] + gidx_list = [] + loss_list = [] + + for i in range(self.num_experts): + W = self.W_list[i] # shape [out_features, in_features] + H = self.H_list[i] + nsamples = self.nsamples_list[i] + + ##### TODO: QUANTIZATION FOR MOE LAYER ##### + + duration = time.time() - t_start + final_scale = torch.cat(scale_list, dim=1) if scale_list else torch.tensor([], device=self.dev) + final_zero = torch.cat(zero_list, dim=1) if zero_list else torch.tensor([], device=self.dev) + final_gidx = torch.cat(gidx_list, dim=0) if gidx_list else torch.tensor([], device=self.dev) + avg_loss = sum(loss_list) / (len(loss_list) + 1e-9) + + return final_scale, final_zero, final_gidx, duration, avg_loss + else: + W = self.layer.weight.data.clone() + if isinstance(self.layer, nn.Conv2d): + W = W.flatten(1) + if isinstance(self.layer, transformers.Conv1D): + W = W.t() + W = W.float() + + tick = time.time() + + if not self.quantizer.ready(): + self.quantizer.find_params(W, weight=True) + + H = self.H + del self.H + dead = torch.diag(H) == 0 + H[dead, dead] = 1 + W[:, dead] = 0 + + g_idx = [] + scale = [] + zero = [] + now_idx = 1 + + if static_groups: + # Standard + import copy + + groups = [] + for i in range(0, self.columns, group_size): + quantizer = copy.deepcopy(self.quantizer) + quantizer.find_params(W[:, i : (i + group_size)], weight=True) + scale.append(quantizer.scale) + zero.append(quantizer.zero) + groups.append(quantizer) + + if actorder: + perm = torch.argsort(torch.diag(H), descending=True) + W = W[:, perm] + H = H[perm][:, perm] + invperm = torch.argsort(perm) + + Losses = torch.zeros_like(W) + Q = torch.zeros_like(W) + + damp = percdamp * torch.mean(torch.diag(H)) + diag = torch.arange(self.columns, device=self.dev) + H[diag, diag] += damp + H = torch.linalg.cholesky(H) + H = torch.cholesky_inverse(H) + H = torch.linalg.cholesky(H, upper=True) + Hinv = H + + for i1 in range(0, self.columns, blocksize): + i2 = min(i1 + blocksize, self.columns) + count = i2 - i1 + + W1 = W[:, i1:i2].clone() + Q1 = torch.zeros_like(W1) + Err1 = torch.zeros_like(W1) + Losses1 = torch.zeros_like(W1) + Hinv1 = Hinv[i1:i2, i1:i2] + + for i in range(count): + w = W1[:, i] + d = Hinv1[i, i] + + if group_size != -1: + if not static_groups: + if (i1 + i) % group_size == 0: + self.quantizer.find_params( + W[:, (i1 + i) : (i1 + i + group_size)], weight=True + ) + + if ((i1 + i) // group_size) - now_idx == -1: + scale.append(self.quantizer.scale) + zero.append(self.quantizer.zero) + now_idx += 1 + else: + idx = i1 + i + if actorder: + idx = perm[idx] + self.quantizer = groups[idx // group_size] + + q = self.quantizer.quantize(w.unsqueeze(1)).flatten() + Q1[:, i] = q + Losses1[:, i] = (w - q) ** 2 / d**2 + + err1 = (w - q) / d + W1[:, i:] -= 
err1.unsqueeze(1).matmul(Hinv1[i, i:].unsqueeze(0)) + Err1[:, i] = err1 + + Q[:, i1:i2] = Q1 + Losses[:, i1:i2] = Losses1 / 2 + + W[:, i2:] -= Err1.matmul(Hinv[i1:i2, i2:]) + + if os.environ.get("DEBUG"): + self.layer.weight.data[:, :i2] = Q[:, :i2] + self.layer.weight.data[:, i2:] = W[:, i2:] + logger.debug(torch.sum((self.layer(self.inp1) - self.out1) ** 2)) + logger.debug(torch.sum(Losses)) + + torch.cuda.synchronize() + + duration = time.time() - tick + avg_loss = torch.sum(Losses).item() / self.nsamples + + group_size = group_size if group_size != -1 else self.columns + if static_groups and actorder: + g_idx = [perm[i] // group_size for i in range(self.columns)] + else: + g_idx = [i // group_size for i in range(self.columns)] + g_idx = torch.tensor(g_idx, dtype=torch.int32, device=Q.device) + if actorder: + Q = Q[:, invperm] + g_idx = g_idx[invperm] + + if isinstance(self.layer, transformers.Conv1D): + Q = Q.t() + self.layer.weight.data = Q.reshape(self.layer.weight.shape).type_as( + self.layer.weight.data + ) if os.environ.get("DEBUG"): - self.layer.weight.data[:, :i2] = Q[:, :i2] - self.layer.weight.data[:, i2:] = W[:, i2:] logger.debug(torch.sum((self.layer(self.inp1) - self.out1) ** 2)) - logger.debug(torch.sum(Losses)) - - torch.cuda.synchronize() - - duration = time.time() - tick - avg_loss = torch.sum(Losses).item() / self.nsamples - group_size = group_size if group_size != -1 else self.columns - if static_groups and actorder: - g_idx = [perm[i] // group_size for i in range(self.columns)] - else: - g_idx = [i // group_size for i in range(self.columns)] - g_idx = torch.tensor(g_idx, dtype=torch.int32, device=Q.device) - if actorder: - Q = Q[:, invperm] - g_idx = g_idx[invperm] - - if isinstance(self.layer, transformers.Conv1D): - Q = Q.t() - self.layer.weight.data = Q.reshape(self.layer.weight.shape).type_as( - self.layer.weight.data - ) - - # Q is 2D after the Cholesky-based quantization step - if self._is_3d: - # Reshape Q back to [num_experts, out_features, in_features] - Q = Q.reshape(self._original_shape) - - # Now assign it back to the parameter - self.layer.weight.data = Q.type_as(self.layer.weight.data) - - if os.environ.get("DEBUG"): - logger.debug(torch.sum((self.layer(self.inp1) - self.out1) ** 2)) - - if scale == []: - scale.append(self.quantizer.scale) - zero.append(self.quantizer.zero) - scale = torch.cat(scale, dim=1) - zero = torch.cat(zero, dim=1) - return scale, zero, g_idx, duration, avg_loss + if scale == []: + scale.append(self.quantizer.scale) + zero.append(self.quantizer.zero) + scale = torch.cat(scale, dim=1) + zero = torch.cat(zero, dim=1) + return scale, zero, g_idx, duration, avg_loss def free(self): if os.environ.get("DEBUG"): diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/model.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/model.py index d5bcc15c..39715177 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/model.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/model.py @@ -31,6 +31,7 @@ import torch import torch.nn as nn import transformers +import transformers.models.granitemoe.modeling_granitemoe as MOE # Local from ..models._const import ( @@ -101,17 +102,19 @@ def nested_move_to(v, device): def find_layers(module, layers=None, name=""): + # print("1- INSIDE find_layers module", module) if not layers: - layers = [transformers.pytorch_utils.Conv1D, nn.Conv2d, nn.Linear] + # Can add MOE.GraniteMoeRMSNorm here if want to include Linear 
Norm layer ["input_layernorm", "post_attention_layernorm"] + # MOE.GraniteMoeParallelExperts is torch.nn.Module for layer ["block_sparse_moe.input_linear", "block_sparse_moe.output_linear"] + layers = [transformers.pytorch_utils.Conv1D, nn.Conv2d, nn.Linear, MOE.GraniteMoeParallelExperts] + + # print("2- LAYERS, type(module), name", layers, type(module), name) + # if hasattr(module, "weight"): + # print("3- module.weight, type(module.weight), module.weight.shape, module.weight.ndim", module.weight, type(module.weight), module.weight.shape, module.weight.ndim) for layer in layers: if isinstance(module, layer): return {name: module} - - # ADD FOR module GraniteMoeParallelExperts: https://github.com/huggingface/transformers/blob/b5aaf875090388e2bbdbf2d8641ed7967365f435/src/transformers/models/granitemoe/modeling_granitemoe.py#L258C7-L258C32 - if hasattr(module, "weight") and isinstance(module.weight, torch.nn.Parameter): - if module.weight.ndim == 3: - return {name: module} - + res = {} for name1, child in module.named_children(): res.update( From 3cd53eb7bd1e646f06b924e7b7b2a74b2d8242d4 Mon Sep 17 00:00:00 2001 From: Abhishek Date: Mon, 27 Jan 2025 22:33:41 -0500 Subject: [PATCH 08/13] Changes for __init__ and find_layers Signed-off-by: Abhishek --- .../gptqmodel/quantization/gptq.py | 107 +++++++++--------- 1 file changed, 54 insertions(+), 53 deletions(-) diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/quantization/gptq.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/quantization/gptq.py index 0ac9e203..63b47b5c 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/quantization/gptq.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/quantization/gptq.py @@ -74,35 +74,7 @@ def add_batch(self, inp, out): self.inp1 = inp self.out1 = out # Update entire H_list and nsamples_list - if self.is_moe: - # print("INSIDE ADD_BATCH FOR MOE: inp, type(inp), inp.shape", inp, type(inp), inp.shape) - for expert_idx in range(self.num_experts): - H = self.H_list[expert_idx] - nsamples = self.nsamples_list[expert_idx] - - # if len(inp.shape) == 2: - # inp = inp.unsqueeze(0) - # tmp = inp.shape[0] - # print("INSIDE ADD_BATCH FOR MOE 2: inp, inp.shape, tmp", inp, inp.shape, tmp) - - # Below is doing reverse of above - # if len(inp.shape) == 3: - # inp = inp.reshape((-1, inp.shape[-1])) - # print("INSIDE ADD_BATCH FOR MOE 3: inp, inp.shape, tmp", inp, inp.shape, tmp) - - tmp = 1 - # len(inp.shape) == 2 in this case - mod_inp = inp.t() - # print("INSIDE ADD_BATCH FOR MOE 4: inp, inp.shape, tmp", inp, inp.shape, tmp) - - H *= nsamples / (nsamples + tmp) - nsamples += tmp - mod_inp = math.sqrt(2 / nsamples) * mod_inp.float() - H += mod_inp.matmul(mod_inp.t()) - - self.H_list[expert_idx] = H - self.nsamples_list[expert_idx] = nsamples - else: + if not self.is_moe: # print("INSIDE ADD_BATCH FOR 2D") # print("INSIDE ADD_BATCH 1: inp", inp, type(inp), inp.shape) if len(inp.shape) == 2: @@ -132,6 +104,35 @@ def add_batch(self, inp, out): inp = math.sqrt(2 / self.nsamples) * inp.float() # self.H += 2 / self.nsamples * inp.matmul(inp.t()) self.H += inp.matmul(inp.t()) + else: + # print("INSIDE ADD_BATCH FOR MOE: inp, type(inp), inp.shape", inp, type(inp), inp.shape) + for expert_idx in range(self.num_experts): + H = self.H_list[expert_idx] + nsamples = self.nsamples_list[expert_idx] + + # if len(inp.shape) == 2: + # inp = inp.unsqueeze(0) + # tmp = inp.shape[0] + # print("INSIDE ADD_BATCH FOR MOE 2: inp, inp.shape, tmp", inp, inp.shape, 
tmp) + + # Below is doing reverse of above + # if len(inp.shape) == 3: + # inp = inp.reshape((-1, inp.shape[-1])) + # print("INSIDE ADD_BATCH FOR MOE 3: inp, inp.shape, tmp", inp, inp.shape, tmp) + + tmp = 1 + # len(inp.shape) == 2 in this case + mod_inp = inp.t() + # print("INSIDE ADD_BATCH FOR MOE 4: inp, inp.shape, tmp", inp, inp.shape, tmp) + + H *= nsamples / (nsamples + tmp) + nsamples += tmp + mod_inp = math.sqrt(2 / nsamples) * mod_inp.float() + H += mod_inp.matmul(mod_inp.t()) + + self.H_list[expert_idx] = H + self.nsamples_list[expert_idx] = nsamples + def fasterquant( self, @@ -141,30 +142,7 @@ def fasterquant( actorder=False, static_groups=False, ): - if self.is_moe: - # For MoE model - # Loop over each expert param and quantize it separately - t_start = time.time() - scale_list = [] - zero_list = [] - gidx_list = [] - loss_list = [] - - for i in range(self.num_experts): - W = self.W_list[i] # shape [out_features, in_features] - H = self.H_list[i] - nsamples = self.nsamples_list[i] - - ##### TODO: QUANTIZATION FOR MOE LAYER ##### - - duration = time.time() - t_start - final_scale = torch.cat(scale_list, dim=1) if scale_list else torch.tensor([], device=self.dev) - final_zero = torch.cat(zero_list, dim=1) if zero_list else torch.tensor([], device=self.dev) - final_gidx = torch.cat(gidx_list, dim=0) if gidx_list else torch.tensor([], device=self.dev) - avg_loss = sum(loss_list) / (len(loss_list) + 1e-9) - - return final_scale, final_zero, final_gidx, duration, avg_loss - else: + if not self.is_moe: W = self.layer.weight.data.clone() if isinstance(self.layer, nn.Conv2d): W = W.flatten(1) @@ -297,6 +275,29 @@ def fasterquant( scale = torch.cat(scale, dim=1) zero = torch.cat(zero, dim=1) return scale, zero, g_idx, duration, avg_loss + else: + # For MoE model + # Loop over each expert param and quantize it separately + t_start = time.time() + scale_list = [] + zero_list = [] + gidx_list = [] + loss_list = [] + + for i in range(self.num_experts): + W = self.W_list[i] # shape [out_features, in_features] + H = self.H_list[i] + nsamples = self.nsamples_list[i] + + ##### TODO: QUANTIZATION FOR MOE LAYER ##### + + duration = time.time() - t_start + final_scale = torch.cat(scale_list, dim=1) if scale_list else torch.tensor([], device=self.dev) + final_zero = torch.cat(zero_list, dim=1) if zero_list else torch.tensor([], device=self.dev) + final_gidx = torch.cat(gidx_list, dim=0) if gidx_list else torch.tensor([], device=self.dev) + avg_loss = sum(loss_list) / (len(loss_list) + 1e-9) + + return final_scale, final_zero, final_gidx, duration, avg_loss def free(self): if os.environ.get("DEBUG"): From 12b206a185953185f61c34ef9c5659f85e82b467 Mon Sep 17 00:00:00 2001 From: Abhishek Date: Thu, 30 Jan 2025 00:27:34 -0500 Subject: [PATCH 09/13] Incremental change of ModuleList Signed-off-by: Abhishek --- .../gptqmodel/models/granitemoe.py | 5 +- .../gptqmodel/quantization/gptq.py | 415 +++++++----------- .../gptqmodel/utils/model.py | 47 +- 3 files changed, 206 insertions(+), 261 deletions(-) diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/granitemoe.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/granitemoe.py index 6552376a..22c21773 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/granitemoe.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/granitemoe.py @@ -19,12 +19,13 @@ class GraniteMoeGPTQ(BaseGPTQModel): base_modules = ["model.embed_tokens", "model.norm"] + 
convert3dToModuleList = ["block_sparse_moe.input_linear", "block_sparse_moe.output_linear"] layers_node = "model.layers" layer_type = "GraniteMoeDecoderLayer" layer_modules = [ ["self_attn.k_proj", "self_attn.v_proj", "self_attn.q_proj"], ["self_attn.o_proj"], - ["block_sparse_moe.input_linear", "block_sparse_moe.output_linear"], - ["input_layernorm", "post_attention_layernorm"] + [f"block_sparse_moe.input_linear.{i}" for i in range(40)], + [f"block_sparse_moe.output_linear.{i}" for i in range(40)], ] diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/quantization/gptq.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/quantization/gptq.py index 63b47b5c..c96a6d79 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/quantization/gptq.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/quantization/gptq.py @@ -11,7 +11,6 @@ import torch import torch.nn as nn import transformers -import transformers.models.granitemoe.modeling_granitemoe as MOE # Local from .quantizer import Quantizer @@ -26,113 +25,46 @@ class GPTQ: def __init__(self, layer): self.layer = layer self.dev = self.layer.weight.device + W = layer.weight.data.clone() + if isinstance(self.layer, nn.Conv2d): + W = W.flatten(1) + if isinstance(self.layer, transformers.pytorch_utils.Conv1D): + W = W.t() + self.rows = W.shape[0] + self.columns = W.shape[1] + self.H = torch.zeros((self.columns, self.columns), device=self.dev) + self.nsamples = 0 self.quantizer = Quantizer() - # print("GPTQ INIT: layer, type(layer)", layer, type(layer)) - # print("GPTQ INIT: layer.weight.data", layer.weight.data) - - - if isinstance(layer, MOE.GraniteMoeParallelExperts): - self.is_moe = True - self.num_experts = layer.num_experts - self.out_features = layer.output_size - self.in_features = layer.input_size - # print("GPTQ INIT: self.num_experts, self.out_features, self.in_features", self.num_experts, self.out_features, self.in_features) - - # Separate W for each expert - self.W_list, self.H_list, self.nsamples_list = [], [], [] - for i in range(self.num_experts): - # Each expert slice is of shape [out_features, in_features] - self.W_list.append(layer.weight.data[i].clone()) - - # For each expert param, we have a Hessian and sample count - self.H_list.append(torch.zeros((self.in_features, self.in_features), device=self.dev)) - self.nsamples_list.append(0) - - # print("GPTQ INIT: self.W_list, len(self.W_list), type(self.W_list[0]), self.W_list[0].size(), self.W_list[0].dim()", self.W_list, len(self.W_list), type(self.W_list[0]), self.W_list[0].size(), self.W_list[0].dim()) - # print("GPTQ INIT: self.H_list, len(self.H_list), type(self.H_list[0]), self.H_list[0].size(), self.H_list[0].dim()", self.H_list, len(self.H_list), type(self.H_list[0]), self.H_list[0].size(), self.H_list[0].dim()) - - else: - # For 2D layer (linear, conv, etc.), we have a single Hessian and sample count - self.is_moe = False - W = layer.weight.data.clone() - if isinstance(layer, nn.Conv2d): - W = W.flatten(1) - if isinstance(layer, transformers.pytorch_utils.Conv1D): - W = W.t() - # print("GPTQ INIT: W, type(W), W.size(), W.dim()", W, type(W), W.size(), W.dim()) - - self.rows = W.shape[0] - self.columns = W.shape[1] - self.H = torch.zeros((self.columns, self.columns), device=self.dev) - self.nsamples = 0 - # print("GPTQ INIT: self.H, type(self.H), self.H.size(), self.H.dim()", self.H, type(self.H), self.H.size(), self.H.dim()) - # print("GPTQ INIT: self.rows, self.columns", self.rows, self.columns) - def 
add_batch(self, inp, out): if os.environ.get("DEBUG"): self.inp1 = inp self.out1 = out - # Update entire H_list and nsamples_list - if not self.is_moe: - # print("INSIDE ADD_BATCH FOR 2D") - # print("INSIDE ADD_BATCH 1: inp", inp, type(inp), inp.shape) - if len(inp.shape) == 2: - inp = inp.unsqueeze(0) - tmp = inp.shape[0] - if isinstance(self.layer, nn.Linear) or isinstance( - self.layer, transformers.Conv1D - ): - if len(inp.shape) == 3: - inp = inp.reshape((-1, inp.shape[-1])) - inp = inp.t() - if isinstance(self.layer, nn.Conv2d): - unfold = nn.Unfold( - self.layer.kernel_size, - dilation=self.layer.dilation, - padding=self.layer.padding, - stride=self.layer.stride, - ) - inp = unfold(inp) - inp = inp.permute([1, 0, 2]) - inp = inp.flatten(1) - # print("INSIDE ADD_BATCH 2: BEFORE tmp, self.H, self.nsamples", tmp, self.H, self.nsamples) - self.H *= self.nsamples / (self.nsamples + tmp) - self.nsamples += tmp - # print("INSIDE ADD_BATCH 3: AFTER tmp, self.H, self.nsamples", tmp, self.H, self.nsamples) - # inp = inp.float() - inp = math.sqrt(2 / self.nsamples) * inp.float() - # self.H += 2 / self.nsamples * inp.matmul(inp.t()) - self.H += inp.matmul(inp.t()) - else: - # print("INSIDE ADD_BATCH FOR MOE: inp, type(inp), inp.shape", inp, type(inp), inp.shape) - for expert_idx in range(self.num_experts): - H = self.H_list[expert_idx] - nsamples = self.nsamples_list[expert_idx] - - # if len(inp.shape) == 2: - # inp = inp.unsqueeze(0) - # tmp = inp.shape[0] - # print("INSIDE ADD_BATCH FOR MOE 2: inp, inp.shape, tmp", inp, inp.shape, tmp) - - # Below is doing reverse of above - # if len(inp.shape) == 3: - # inp = inp.reshape((-1, inp.shape[-1])) - # print("INSIDE ADD_BATCH FOR MOE 3: inp, inp.shape, tmp", inp, inp.shape, tmp) - - tmp = 1 - # len(inp.shape) == 2 in this case - mod_inp = inp.t() - # print("INSIDE ADD_BATCH FOR MOE 4: inp, inp.shape, tmp", inp, inp.shape, tmp) - - H *= nsamples / (nsamples + tmp) - nsamples += tmp - mod_inp = math.sqrt(2 / nsamples) * mod_inp.float() - H += mod_inp.matmul(mod_inp.t()) - - self.H_list[expert_idx] = H - self.nsamples_list[expert_idx] = nsamples - + if len(inp.shape) == 2: + inp = inp.unsqueeze(0) + tmp = inp.shape[0] + if isinstance(self.layer, nn.Linear) or isinstance( + self.layer, transformers.Conv1D + ): + if len(inp.shape) == 3: + inp = inp.reshape((-1, inp.shape[-1])) + inp = inp.t() + if isinstance(self.layer, nn.Conv2d): + unfold = nn.Unfold( + self.layer.kernel_size, + dilation=self.layer.dilation, + padding=self.layer.padding, + stride=self.layer.stride, + ) + inp = unfold(inp) + inp = inp.permute([1, 0, 2]) + inp = inp.flatten(1) + self.H *= self.nsamples / (self.nsamples + tmp) + self.nsamples += tmp + # inp = inp.float() + inp = math.sqrt(2 / self.nsamples) * inp.float() + # self.H += 2 / self.nsamples * inp.matmul(inp.t()) + self.H += inp.matmul(inp.t()) def fasterquant( self, @@ -142,162 +74,137 @@ def fasterquant( actorder=False, static_groups=False, ): - if not self.is_moe: - W = self.layer.weight.data.clone() - if isinstance(self.layer, nn.Conv2d): - W = W.flatten(1) - if isinstance(self.layer, transformers.Conv1D): - W = W.t() - W = W.float() - - tick = time.time() - - if not self.quantizer.ready(): - self.quantizer.find_params(W, weight=True) - - H = self.H - del self.H - dead = torch.diag(H) == 0 - H[dead, dead] = 1 - W[:, dead] = 0 - - g_idx = [] - scale = [] - zero = [] - now_idx = 1 - - if static_groups: - # Standard - import copy - - groups = [] - for i in range(0, self.columns, group_size): - quantizer = 
copy.deepcopy(self.quantizer) - quantizer.find_params(W[:, i : (i + group_size)], weight=True) - scale.append(quantizer.scale) - zero.append(quantizer.zero) - groups.append(quantizer) - - if actorder: - perm = torch.argsort(torch.diag(H), descending=True) - W = W[:, perm] - H = H[perm][:, perm] - invperm = torch.argsort(perm) - - Losses = torch.zeros_like(W) - Q = torch.zeros_like(W) - - damp = percdamp * torch.mean(torch.diag(H)) - diag = torch.arange(self.columns, device=self.dev) - H[diag, diag] += damp - H = torch.linalg.cholesky(H) - H = torch.cholesky_inverse(H) - H = torch.linalg.cholesky(H, upper=True) - Hinv = H - - for i1 in range(0, self.columns, blocksize): - i2 = min(i1 + blocksize, self.columns) - count = i2 - i1 - - W1 = W[:, i1:i2].clone() - Q1 = torch.zeros_like(W1) - Err1 = torch.zeros_like(W1) - Losses1 = torch.zeros_like(W1) - Hinv1 = Hinv[i1:i2, i1:i2] - - for i in range(count): - w = W1[:, i] - d = Hinv1[i, i] - - if group_size != -1: - if not static_groups: - if (i1 + i) % group_size == 0: - self.quantizer.find_params( - W[:, (i1 + i) : (i1 + i + group_size)], weight=True - ) - - if ((i1 + i) // group_size) - now_idx == -1: - scale.append(self.quantizer.scale) - zero.append(self.quantizer.zero) - now_idx += 1 - else: - idx = i1 + i - if actorder: - idx = perm[idx] - self.quantizer = groups[idx // group_size] - - q = self.quantizer.quantize(w.unsqueeze(1)).flatten() - Q1[:, i] = q - Losses1[:, i] = (w - q) ** 2 / d**2 - - err1 = (w - q) / d - W1[:, i:] -= err1.unsqueeze(1).matmul(Hinv1[i, i:].unsqueeze(0)) - Err1[:, i] = err1 - - Q[:, i1:i2] = Q1 - Losses[:, i1:i2] = Losses1 / 2 - - W[:, i2:] -= Err1.matmul(Hinv[i1:i2, i2:]) - - if os.environ.get("DEBUG"): - self.layer.weight.data[:, :i2] = Q[:, :i2] - self.layer.weight.data[:, i2:] = W[:, i2:] - logger.debug(torch.sum((self.layer(self.inp1) - self.out1) ** 2)) - logger.debug(torch.sum(Losses)) - - torch.cuda.synchronize() - - duration = time.time() - tick - avg_loss = torch.sum(Losses).item() / self.nsamples - - group_size = group_size if group_size != -1 else self.columns - if static_groups and actorder: - g_idx = [perm[i] // group_size for i in range(self.columns)] - else: - g_idx = [i // group_size for i in range(self.columns)] - g_idx = torch.tensor(g_idx, dtype=torch.int32, device=Q.device) - if actorder: - Q = Q[:, invperm] - g_idx = g_idx[invperm] - - if isinstance(self.layer, transformers.Conv1D): - Q = Q.t() - self.layer.weight.data = Q.reshape(self.layer.weight.shape).type_as( - self.layer.weight.data - ) + W = self.layer.weight.data.clone() + if isinstance(self.layer, nn.Conv2d): + W = W.flatten(1) + if isinstance(self.layer, transformers.Conv1D): + W = W.t() + W = W.float() + + tick = time.time() + + if not self.quantizer.ready(): + self.quantizer.find_params(W, weight=True) + + H = self.H + del self.H + dead = torch.diag(H) == 0 + H[dead, dead] = 1 + W[:, dead] = 0 + + g_idx = [] + scale = [] + zero = [] + now_idx = 1 + + if static_groups: + # Standard + import copy + + groups = [] + for i in range(0, self.columns, group_size): + quantizer = copy.deepcopy(self.quantizer) + quantizer.find_params(W[:, i : (i + group_size)], weight=True) + scale.append(quantizer.scale) + zero.append(quantizer.zero) + groups.append(quantizer) + + if actorder: + perm = torch.argsort(torch.diag(H), descending=True) + W = W[:, perm] + H = H[perm][:, perm] + invperm = torch.argsort(perm) + + Losses = torch.zeros_like(W) + Q = torch.zeros_like(W) + + damp = percdamp * torch.mean(torch.diag(H)) + diag = torch.arange(self.columns, 
device=self.dev) + H[diag, diag] += damp + H = torch.linalg.cholesky(H) + H = torch.cholesky_inverse(H) + H = torch.linalg.cholesky(H, upper=True) + Hinv = H + + for i1 in range(0, self.columns, blocksize): + i2 = min(i1 + blocksize, self.columns) + count = i2 - i1 + + W1 = W[:, i1:i2].clone() + Q1 = torch.zeros_like(W1) + Err1 = torch.zeros_like(W1) + Losses1 = torch.zeros_like(W1) + Hinv1 = Hinv[i1:i2, i1:i2] + + for i in range(count): + w = W1[:, i] + d = Hinv1[i, i] + + if group_size != -1: + if not static_groups: + if (i1 + i) % group_size == 0: + self.quantizer.find_params( + W[:, (i1 + i) : (i1 + i + group_size)], weight=True + ) + + if ((i1 + i) // group_size) - now_idx == -1: + scale.append(self.quantizer.scale) + zero.append(self.quantizer.zero) + now_idx += 1 + else: + idx = i1 + i + if actorder: + idx = perm[idx] + self.quantizer = groups[idx // group_size] + + q = self.quantizer.quantize(w.unsqueeze(1)).flatten() + Q1[:, i] = q + Losses1[:, i] = (w - q) ** 2 / d**2 + + err1 = (w - q) / d + W1[:, i:] -= err1.unsqueeze(1).matmul(Hinv1[i, i:].unsqueeze(0)) + Err1[:, i] = err1 + + Q[:, i1:i2] = Q1 + Losses[:, i1:i2] = Losses1 / 2 + + W[:, i2:] -= Err1.matmul(Hinv[i1:i2, i2:]) if os.environ.get("DEBUG"): + self.layer.weight.data[:, :i2] = Q[:, :i2] + self.layer.weight.data[:, i2:] = W[:, i2:] logger.debug(torch.sum((self.layer(self.inp1) - self.out1) ** 2)) + logger.debug(torch.sum(Losses)) + + torch.cuda.synchronize() + + duration = time.time() - tick + avg_loss = torch.sum(Losses).item() / self.nsamples - if scale == []: - scale.append(self.quantizer.scale) - zero.append(self.quantizer.zero) - scale = torch.cat(scale, dim=1) - zero = torch.cat(zero, dim=1) - return scale, zero, g_idx, duration, avg_loss + group_size = group_size if group_size != -1 else self.columns + if static_groups and actorder: + g_idx = [perm[i] // group_size for i in range(self.columns)] else: - # For MoE model - # Loop over each expert param and quantize it separately - t_start = time.time() - scale_list = [] - zero_list = [] - gidx_list = [] - loss_list = [] - - for i in range(self.num_experts): - W = self.W_list[i] # shape [out_features, in_features] - H = self.H_list[i] - nsamples = self.nsamples_list[i] - - ##### TODO: QUANTIZATION FOR MOE LAYER ##### - - duration = time.time() - t_start - final_scale = torch.cat(scale_list, dim=1) if scale_list else torch.tensor([], device=self.dev) - final_zero = torch.cat(zero_list, dim=1) if zero_list else torch.tensor([], device=self.dev) - final_gidx = torch.cat(gidx_list, dim=0) if gidx_list else torch.tensor([], device=self.dev) - avg_loss = sum(loss_list) / (len(loss_list) + 1e-9) - - return final_scale, final_zero, final_gidx, duration, avg_loss + g_idx = [i // group_size for i in range(self.columns)] + g_idx = torch.tensor(g_idx, dtype=torch.int32, device=Q.device) + if actorder: + Q = Q[:, invperm] + g_idx = g_idx[invperm] + + if isinstance(self.layer, transformers.Conv1D): + Q = Q.t() + self.layer.weight.data = Q.reshape(self.layer.weight.shape).type_as( + self.layer.weight.data + ) + if os.environ.get("DEBUG"): + logger.debug(torch.sum((self.layer(self.inp1) - self.out1) ** 2)) + + if scale == []: + scale.append(self.quantizer.scale) + zero.append(self.quantizer.zero) + scale = torch.cat(scale, dim=1) + zero = torch.cat(zero, dim=1) + return scale, zero, g_idx, duration, avg_loss def free(self): if os.environ.get("DEBUG"): @@ -309,4 +216,4 @@ def free(self): torch.cuda.empty_cache() -__all__ = ["GPTQ"] +__all__ = ["GPTQ"] \ No newline at end of file diff 
--git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/model.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/model.py index 39715177..d7c67bab 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/model.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/model.py @@ -30,6 +30,7 @@ import threadpoolctl as tctl import torch import torch.nn as nn +import torch.nn.functional as F import transformers import transformers.models.granitemoe.modeling_granitemoe as MOE @@ -53,6 +54,24 @@ logger.addHandler(handler) logger.setLevel(logging.INFO) +class ThreeDTensorModuleList(nn.ModuleList): + def forward(self, inputs: torch.Tensor) -> torch.Tensor: + # Shape of input: (num_selected_experts * batch_size (expert_size), input_features_size) + expert_size = len(self) + input_list = inputs.split(expert_size, dim=0) + output_list = [] + + # Iterate over the number of selected experts and apply each expert to the corresponding input + for i in range(len(self)): + # Shape of input_list[i]: (batch_size, input_features_size); Shape of self[i]: (output_features_size, input_features_size) + # Shape of output: (batch_size, output_features_size); + expert_output = F.linear(input_list[i], self[i]) + output_list.append(expert_output) + + # Concatenate the outputs along the first dimension + results = torch.cat(output_list, dim=0) # Shape: (num_selected_experts * batch_size, output_features_size) + return results + def recurse_getattr(obj, attr: str): """ @@ -101,22 +120,40 @@ def nested_move_to(v, device): return v +def check3DTensor(module, name, convert3dToModuleList=["block_sparse_moe.input_linear", "block_sparse_moe.output_linear"]): + if convert3dToModuleList and name in convert3dToModuleList: + # print("INSIDE check3DTensor module, name, convert3dToModuleList", module, name, convert3dToModuleList) + num_experts = module.num_experts + input_size = module.input_size + output_size = module.output_size + module = ThreeDTensorModuleList([ + nn.Linear(input_size, output_size, bias=False) for _ in range(num_experts) + ]) + + return module + + def find_layers(module, layers=None, name=""): # print("1- INSIDE find_layers module", module) + module = check3DTensor(module, name) + # print("2- AFTER check3DTensor module", module) if not layers: - # Can add MOE.GraniteMoeRMSNorm here if want to include Linear Norm layer ["input_layernorm", "post_attention_layernorm"] - # MOE.GraniteMoeParallelExperts is torch.nn.Module for layer ["block_sparse_moe.input_linear", "block_sparse_moe.output_linear"] - layers = [transformers.pytorch_utils.Conv1D, nn.Conv2d, nn.Linear, MOE.GraniteMoeParallelExperts] + layers = [transformers.pytorch_utils.Conv1D, nn.Conv2d, nn.Linear] - # print("2- LAYERS, type(module), name", layers, type(module), name) + # print("2- INFO: type(module), name", type(module), name) # if hasattr(module, "weight"): - # print("3- module.weight, type(module.weight), module.weight.shape, module.weight.ndim", module.weight, type(module.weight), module.weight.shape, module.weight.ndim) + # print("3- type(module.weight), module.weight.shape, module.weight.ndim", type(module.weight), module.weight.shape, module.weight.ndim) for layer in layers: if isinstance(module, layer): return {name: module} res = {} + # if isinstance(module, MOE.GraniteMoeParallelExperts): + # print("Print GraniteMoeParallelExperts Layer children") + # for name1, child in module.named_children(): + # print("4- name1, child", name1, child) for name1, child in 
module.named_children(): + # print("PROCESS- name, name1, child", name, name1, child) res.update( find_layers( child, layers=layers, name=name + "." + name1 if name != "" else name1 From b0f92720b0ffcebdfb5f7282050d5fa00685bc45 Mon Sep 17 00:00:00 2001 From: Abhishek Date: Thu, 30 Jan 2025 00:31:26 -0500 Subject: [PATCH 10/13] Incremental change of ModuleList Signed-off-by: Abhishek --- .../src/fms_acceleration_peft/gptqmodel/quantization/gptq.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/quantization/gptq.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/quantization/gptq.py index c96a6d79..470ed3fb 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/quantization/gptq.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/quantization/gptq.py @@ -216,4 +216,4 @@ def free(self): torch.cuda.empty_cache() -__all__ = ["GPTQ"] \ No newline at end of file +__all__ = ["GPTQ"] From 58d27227d67d8d99483703ed1153223e6c354371 Mon Sep 17 00:00:00 2001 From: Abhishek Date: Thu, 30 Jan 2025 20:36:03 -0500 Subject: [PATCH 11/13] Suggested changes Signed-off-by: Abhishek --- .../gptqmodel/models/_const.py | 2 +- .../gptqmodel/models/auto.py | 2 +- .../gptqmodel/models/base.py | 64 +++++++++++++++++ .../gptqmodel/models/granitemoe.py | 5 +- .../gptqmodel/utils/model.py | 69 ++++++------------- 5 files changed, 91 insertions(+), 51 deletions(-) diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/_const.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/_const.py index 14e10054..087dd034 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/_const.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/_const.py @@ -28,7 +28,7 @@ "granite", "gemma", "dbrx_converted", - "granitemoe" + "granitemoe", ] EXLLAMA_DEFAULT_MAX_INPUT_LENGTH = 2048 diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/auto.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/auto.py index 0b1be8fc..d0caf2a4 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/auto.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/auto.py @@ -44,7 +44,7 @@ "granite": GraniteGPTQ, "dbrx": DbrxGPTQ, "dbrx_converted": DbrxConvertedGPTQ, - "granitemoe": GraniteMoeGPTQ + "granitemoe": GraniteMoeGPTQ, } at_least_one_cuda_v6 = any( diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/base.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/base.py index b3921ff2..1dc07d86 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/base.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/base.py @@ -41,6 +41,7 @@ import accelerate import torch import torch.nn as nn +import torch.nn.functional as F import transformers # Local @@ -61,6 +62,7 @@ convert_gptq_v1_to_v2_format, convert_gptq_v2_to_v1_format, find_layers, + get_all_modules_by_name_suffix, get_checkpoints, get_device, get_module_by_name_prefix, @@ -91,6 +93,9 @@ class BaseGPTQModel(nn.Module): # does not include the node which holds all the repeating layers base_modules: List[str] = None + # 3D Module to be converted to ModuleList + convert_3d_modulelist: List[str] = None + # name of lm_head lm_head: str = "lm_head" @@ -223,6 +228,22 @@ def quantize( if len(calibration_dataset) 
== 0: raise ValueError("Calibration dataset must not be empty.") + ##### SWAP 3D MODULES TO MODULELIST ##### + if self.convert_3d_modulelist: + for name in self.convert_3d_modulelist: + matches = get_all_modules_by_name_suffix(self.model, name) + for parent, module, full_name in matches: + + # Modify the matched module + if parent is not None: + new_module = self.swap_3d_tensors(module) + + # Replace the old module with the new one + # Derive the child attribute name from the tail of full_name + child_name = full_name.split(".")[-1] + + setattr(parent, child_name, new_module) + min_calibration_dataset_size = 256 min_calibration_dataset_input_ids_avg_length = 256 @@ -1211,5 +1232,48 @@ def __getattr__(self, item): except Exception: return getattr(self.model, item) + def swap_3d_tensors(self, module: nn.Module) -> nn.ModuleList: + """Swap 3D Parameters to ModuleList of 3D Parameters.""" + + num_experts = module.num_experts + input_size = module.input_size + output_size = module.output_size + module = MoE3DModuleList( + [nn.Linear(input_size, output_size, bias=False) for _ in range(num_experts)] + ) + return module + + +class MoE3DModuleList(nn.ModuleList): + def forward(self, inputs: torch.Tensor, expert_size: int) -> torch.Tensor: + """ + Forward pass of the MoE3DModuleList module. + Args: + inputs (Tensor): + Input tensor. + expert_size: + Expert size information. + Returns: + Tensor: Output tensor. + """ + input_list = inputs.split(expert_size, dim=0) + output_list = [] + + # Iterate over the number of selected experts and apply each expert to the corresponding input + for i in range(len(expert_size)): + # Extract weight and bias from the Linear module + weight = self[i].weight.to(device=inputs.device, dtype=inputs.dtype) + bias = ( + self[i].bias.to(device=inputs.device, dtype=inputs.dtype) + if self[i].bias is not None + else None + ) + expert_output = F.linear(input_list[i], weight, bias) + output_list.append(expert_output) + + # Concatenate the outputs along the first dimension + results = torch.cat(output_list, dim=0) + return results + __all__ = ["BaseGPTQModel"] diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/granitemoe.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/granitemoe.py index 22c21773..1f360d27 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/granitemoe.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/granitemoe.py @@ -19,7 +19,10 @@ class GraniteMoeGPTQ(BaseGPTQModel): base_modules = ["model.embed_tokens", "model.norm"] - convert3dToModuleList = ["block_sparse_moe.input_linear", "block_sparse_moe.output_linear"] + convert_3d_modulelist = [ + "block_sparse_moe.input_linear", + "block_sparse_moe.output_linear", + ] layers_node = "model.layers" layer_type = "GraniteMoeDecoderLayer" diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/model.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/model.py index d7c67bab..53f01843 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/model.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/model.py @@ -15,7 +15,7 @@ ############################################################################### # Standard from logging import getLogger -from typing import List, Optional, Union +from typing import List, Optional, Tuple, Union import functools import hashlib import json @@ -30,9 +30,7 @@ import threadpoolctl as tctl 
import torch import torch.nn as nn -import torch.nn.functional as F import transformers -import transformers.models.granitemoe.modeling_granitemoe as MOE # Local from ..models._const import ( @@ -54,24 +52,6 @@ logger.addHandler(handler) logger.setLevel(logging.INFO) -class ThreeDTensorModuleList(nn.ModuleList): - def forward(self, inputs: torch.Tensor) -> torch.Tensor: - # Shape of input: (num_selected_experts * batch_size (expert_size), input_features_size) - expert_size = len(self) - input_list = inputs.split(expert_size, dim=0) - output_list = [] - - # Iterate over the number of selected experts and apply each expert to the corresponding input - for i in range(len(self)): - # Shape of input_list[i]: (batch_size, input_features_size); Shape of self[i]: (output_features_size, input_features_size) - # Shape of output: (batch_size, output_features_size); - expert_output = F.linear(input_list[i], self[i]) - output_list.append(expert_output) - - # Concatenate the outputs along the first dimension - results = torch.cat(output_list, dim=0) # Shape: (num_selected_experts * batch_size, output_features_size) - return results - def recurse_getattr(obj, attr: str): """ @@ -120,40 +100,14 @@ def nested_move_to(v, device): return v -def check3DTensor(module, name, convert3dToModuleList=["block_sparse_moe.input_linear", "block_sparse_moe.output_linear"]): - if convert3dToModuleList and name in convert3dToModuleList: - # print("INSIDE check3DTensor module, name, convert3dToModuleList", module, name, convert3dToModuleList) - num_experts = module.num_experts - input_size = module.input_size - output_size = module.output_size - module = ThreeDTensorModuleList([ - nn.Linear(input_size, output_size, bias=False) for _ in range(num_experts) - ]) - - return module - - def find_layers(module, layers=None, name=""): - # print("1- INSIDE find_layers module", module) - module = check3DTensor(module, name) - # print("2- AFTER check3DTensor module", module) if not layers: - layers = [transformers.pytorch_utils.Conv1D, nn.Conv2d, nn.Linear] - - # print("2- INFO: type(module), name", type(module), name) - # if hasattr(module, "weight"): - # print("3- type(module.weight), module.weight.shape, module.weight.ndim", type(module.weight), module.weight.shape, module.weight.ndim) + layers = [transformers.pytorch_utils.Conv1D, nn.Conv2d, nn.Linear] for layer in layers: if isinstance(module, layer): return {name: module} - res = {} - # if isinstance(module, MOE.GraniteMoeParallelExperts): - # print("Print GraniteMoeParallelExperts Layer children") - # for name1, child in module.named_children(): - # print("4- name1, child", name1, child) for name1, child in module.named_children(): - # print("PROCESS- name, name1, child", name, name1, child) res.update( find_layers( child, layers=layers, name=name + "." 
+ name1 if name != "" else name1 @@ -174,6 +128,25 @@ def get_module_by_name_suffix(model, module_name: str): return module +def get_all_modules_by_name_suffix( + model: nn.Module, target_suffix: str +) -> List[Tuple[Optional[nn.Module], nn.Module, str]]: + """Find all modules in the model whose names end with the given suffix, along with their parent modules.""" + name_to_module = dict(model.named_modules()) + results = [] + for full_name, mod in name_to_module.items(): + if full_name.endswith(target_suffix): + split_name = full_name.split(".") + if len(split_name) > 1: + parent_name = ".".join(split_name[:-1]) + else: + parent_name = "" + + parent_module = name_to_module.get(parent_name, None) + results.append((parent_module, mod, full_name)) + return results + + def make_quant( module, names, From 74d18b7fd824eef312a81a771223851c6639c702 Mon Sep 17 00:00:00 2001 From: Abhishek Date: Tue, 18 Feb 2025 19:23:43 -0500 Subject: [PATCH 12/13] Test Fabian's code Signed-off-by: Abhishek --- .../gptqmodel/models/base.py | 79 ++++--------------- .../gptqmodel/models/granitemoe.py | 26 +++++- .../gptqmodel/utils/model.py | 51 +++++++----- 3 files changed, 69 insertions(+), 87 deletions(-) diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/base.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/base.py index 29a74b47..1863c4d0 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/base.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/base.py @@ -15,7 +15,8 @@ ############################################################################### # Standard from os.path import isfile, join -from typing import Dict, List, Optional, Union +from types import MethodType +from typing import Callable, Dict, List, Optional, Tuple, Union import copy import json import logging @@ -44,7 +45,6 @@ import accelerate import torch import torch.nn as nn -import torch.nn.functional as F import transformers # Local @@ -65,7 +65,6 @@ convert_gptq_v1_to_v2_format, convert_gptq_v2_to_v1_format, find_layers, - get_all_modules_by_name_suffix, get_checkpoints, get_device, get_module_by_name_prefix, @@ -79,6 +78,7 @@ simple_dispatch_model, verify_model_hash, verify_sharded_model_hashes, + replace_3d_parameters_with_module_list, ) from ._const import CPU, CUDA_0, SUPPORTED_MODELS @@ -96,8 +96,11 @@ class BaseGPTQModel(nn.Module): # does not include the node which holds all the repeating layers base_modules: List[str] = None - # 3D Module to be converted to ModuleList - convert_3d_modulelist: List[str] = None + # If 3D Parameters to be converted + convert3dparameters: bool = False + + # User provided forward pass to replace the existing forward pass + update_forwards: List[Tuple[str, Callable]] = None # name of lm_head lm_head: str = "lm_head" @@ -133,6 +136,13 @@ def __init__( super().__init__() self.model = model + if self.convert3dparameters: + model = replace_3d_parameters_with_module_list(model) + for mod in model.modules(): + forward = self.update_forwards.get(mod.__class__.__name__) + if forward is not None: + mod.forward = MethodType(forward, mod) + self.model_type = self.model.config.model_type self._quantized = quantized self.quantize_config = quantize_config @@ -231,22 +241,6 @@ def quantize( if len(calibration_dataset) == 0: raise ValueError("Calibration dataset must not be empty.") - ##### SWAP 3D MODULES TO MODULELIST ##### - if self.convert_3d_modulelist: - for name in self.convert_3d_modulelist: - matches = 
get_all_modules_by_name_suffix(self.model, name) - for parent, module, full_name in matches: - - # Modify the matched module - if parent is not None: - new_module = self.swap_3d_tensors(module) - - # Replace the old module with the new one - # Derive the child attribute name from the tail of full_name - child_name = full_name.split(".")[-1] - - setattr(parent, child_name, new_module) - min_calibration_dataset_size = 256 min_calibration_dataset_input_ids_avg_length = 256 @@ -1335,48 +1329,5 @@ def __getattr__(self, item): except Exception: return getattr(self.model, item) - def swap_3d_tensors(self, module: nn.Module) -> nn.ModuleList: - """Swap 3D Parameters to ModuleList of 3D Parameters.""" - - num_experts = module.num_experts - input_size = module.input_size - output_size = module.output_size - module = MoE3DModuleList( - [nn.Linear(input_size, output_size, bias=False) for _ in range(num_experts)] - ) - return module - - -class MoE3DModuleList(nn.ModuleList): - def forward(self, inputs: torch.Tensor, expert_size: int) -> torch.Tensor: - """ - Forward pass of the MoE3DModuleList module. - Args: - inputs (Tensor): - Input tensor. - expert_size: - Expert size information. - Returns: - Tensor: Output tensor. - """ - input_list = inputs.split(expert_size, dim=0) - output_list = [] - - # Iterate over the number of selected experts and apply each expert to the corresponding input - for i in range(len(expert_size)): - # Extract weight and bias from the Linear module - weight = self[i].weight.to(device=inputs.device, dtype=inputs.dtype) - bias = ( - self[i].bias.to(device=inputs.device, dtype=inputs.dtype) - if self[i].bias is not None - else None - ) - expert_output = F.linear(input_list[i], weight, bias) - output_list.append(expert_output) - - # Concatenate the outputs along the first dimension - results = torch.cat(output_list, dim=0) - return results - __all__ = ["BaseGPTQModel"] diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/granitemoe.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/granitemoe.py index 1f360d27..451a1a8e 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/granitemoe.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/granitemoe.py @@ -15,13 +15,33 @@ ############################################################################### # Local from .base import BaseGPTQModel +import torch + +def new_forward(self, inputs, expert_size): + """ + Forward pass of the GraniteMoeParallelExperts module. + Args: + inputs (Tensor): + Input tensor. + expert_size: + Expert size information. + Returns: + Tensor: Output tensor. 
+ """ + input_list = inputs.split(expert_size, dim=0) + output_list = [] + for i in range(self.num_experts): + # the key is we need to use call the module + output_list.append(self.weight[i](input_list[i])) + results = torch.cat(output_list, dim=0) + return results class GraniteMoeGPTQ(BaseGPTQModel): base_modules = ["model.embed_tokens", "model.norm"] - convert_3d_modulelist = [ - "block_sparse_moe.input_linear", - "block_sparse_moe.output_linear", + convert3dparameters = True + update_forwards = [ + ("GraniteMoeParallelExperts", new_forward) ] layers_node = "model.layers" diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/model.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/model.py index 53f01843..2c75eef5 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/model.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/model.py @@ -15,7 +15,7 @@ ############################################################################### # Standard from logging import getLogger -from typing import List, Optional, Tuple, Union +from typing import List, Optional, Union import functools import hashlib import json @@ -128,25 +128,6 @@ def get_module_by_name_suffix(model, module_name: str): return module -def get_all_modules_by_name_suffix( - model: nn.Module, target_suffix: str -) -> List[Tuple[Optional[nn.Module], nn.Module, str]]: - """Find all modules in the model whose names end with the given suffix, along with their parent modules.""" - name_to_module = dict(model.named_modules()) - results = [] - for full_name, mod in name_to_module.items(): - if full_name.endswith(target_suffix): - split_name = full_name.split(".") - if len(split_name) > 1: - parent_name = ".".join(split_name[:-1]) - else: - parent_name = "" - - parent_module = name_to_module.get(parent_name, None) - results.append((parent_module, mod, full_name)) - return results - - def make_quant( module, names, @@ -734,3 +715,33 @@ def get_moe_layer_modules(layer_modules: List, num_experts: int) -> List: new_inside_layer_modules[-1].append(n) return new_inside_layer_modules + + +def replace_3d_parameters_with_module_list( + model: torch.nn.Module, +): + + for name, module in model.named_modules(): + for param_name, param in module.named_parameters(recurse=False): + if len(param.shape) == 3: + device = param.device + dtype = param.dtype + num, in_features, out_features = param.shape + + module_list = [] + for i in range(num): + linear = torch.nn.Linear( + in_features=in_features, + out_features=out_features, + device=device, + dtype=dtype, + bias=None, # FIXME: how to support bias? 
+ ) + linear.weight.data = param.data[i] + module_list.append(linear) + + module_list = torch.nn.ModuleList(module_list) + + # replace + delattr(module, param_name) + setattr(module, param_name, module_list) \ No newline at end of file From f98e85dadfc4c6d6f61c4f18aedc9ea1c165ac5d Mon Sep 17 00:00:00 2001 From: Abhishek Date: Tue, 18 Feb 2025 20:57:40 -0500 Subject: [PATCH 13/13] Test Fabian's code Signed-off-by: Abhishek --- .../fms_acceleration_peft/gptqmodel/models/base.py | 6 +++--- .../gptqmodel/models/granitemoe.py | 13 +++++++------ .../fms_acceleration_peft/gptqmodel/utils/model.py | 4 ++-- 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/base.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/base.py index 1863c4d0..3ee3f4fb 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/base.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/base.py @@ -75,10 +75,10 @@ move_to, nested_move_to, pack_model, + replace_3d_parameters_with_module_list, simple_dispatch_model, verify_model_hash, verify_sharded_model_hashes, - replace_3d_parameters_with_module_list, ) from ._const import CPU, CUDA_0, SUPPORTED_MODELS @@ -99,7 +99,7 @@ class BaseGPTQModel(nn.Module): # If 3D Parameters to be converted convert3dparameters: bool = False - # User provided forward pass to replace the existing forward pass + # User provided forward pass to replace the existing forward pass update_forwards: List[Tuple[str, Callable]] = None # name of lm_head @@ -137,7 +137,7 @@ def __init__( self.model = model if self.convert3dparameters: - model = replace_3d_parameters_with_module_list(model) + replace_3d_parameters_with_module_list(model) for mod in model.modules(): forward = self.update_forwards.get(mod.__class__.__name__) if forward is not None: diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/granitemoe.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/granitemoe.py index 451a1a8e..87eb3103 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/granitemoe.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/granitemoe.py @@ -13,9 +13,12 @@ # See the License for the specific language governing permissions and # limitations under the License. 
############################################################################### +# Third Party +import torch + # Local from .base import BaseGPTQModel -import torch + def new_forward(self, inputs, expert_size): """ @@ -40,15 +43,13 @@ def new_forward(self, inputs, expert_size): class GraniteMoeGPTQ(BaseGPTQModel): base_modules = ["model.embed_tokens", "model.norm"] convert3dparameters = True - update_forwards = [ - ("GraniteMoeParallelExperts", new_forward) - ] + update_forwards = {"GraniteMoeParallelExperts": new_forward} layers_node = "model.layers" layer_type = "GraniteMoeDecoderLayer" layer_modules = [ ["self_attn.k_proj", "self_attn.v_proj", "self_attn.q_proj"], ["self_attn.o_proj"], - [f"block_sparse_moe.input_linear.{i}" for i in range(40)], - [f"block_sparse_moe.output_linear.{i}" for i in range(40)], + [f"block_sparse_moe.input_linear.weight.{i}" for i in range(40)], + [f"block_sparse_moe.output_linear.weight.{i}" for i in range(40)], ] diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/model.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/model.py index 2c75eef5..e6b0c289 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/model.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/model.py @@ -735,7 +735,7 @@ def replace_3d_parameters_with_module_list( out_features=out_features, device=device, dtype=dtype, - bias=None, # FIXME: how to support bias? + bias=None, # FIXME: how to support bias? ) linear.weight.data = param.data[i] module_list.append(linear) @@ -744,4 +744,4 @@ def replace_3d_parameters_with_module_list( # replace delattr(module, param_name) - setattr(module, param_name, module_list) \ No newline at end of file + setattr(module, param_name, module_list)
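
For reference, the dense (non-MoE) add_batch path restored in gptq.py keeps H as a
running estimate of (2 / nsamples) * sum_i x_i x_i^T: it first rescales the old
accumulator by nsamples / (nsamples + tmp), then adds the sqrt(2 / nsamples)-scaled
inputs. The snippet below is a standalone numeric check of that identity; the sizes
and batch split are assumed for illustration and are not taken from the patches.

import torch

torch.manual_seed(0)
cols, total = 8, 32
X = torch.randn(total, cols)          # calibration rows for a Linear with `cols` inputs

H = torch.zeros(cols, cols)
nsamples = 0
for batch in X.split(4, dim=0):       # stream batches the way add_batch receives them
    inp = batch.t()                   # (cols, batch_size), matching the Linear branch
    tmp = batch.shape[0]
    H *= nsamples / (nsamples + tmp)  # downweight the previous accumulator
    nsamples += tmp
    inp = (2 / nsamples) ** 0.5 * inp.float()
    H += inp.matmul(inp.t())          # add the rescaled outer product

# The streamed accumulator matches the closed form over the whole calibration set.
assert torch.allclose(H, 2 / total * X.t().matmul(X), atol=1e-5)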
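
For reference, a minimal standalone sketch of the approach the final two patches
converge on: replace each 3D expert weight Parameter with an nn.ModuleList of
per-expert nn.Linear modules, then bind a replacement forward that calls those
modules. Everything below is an assumed toy reconstruction; ToyParallelExperts,
replace_3d_parameters, patched_forward and the sizes are illustrative stand-ins for
GraniteMoeParallelExperts and the helpers added in the patches, not the shipped code.

from types import MethodType

import torch
import torch.nn as nn


class ToyParallelExperts(nn.Module):
    """Stand-in for an expert bank that stores all experts in one 3D Parameter."""

    def __init__(self, num_experts, input_size, output_size):
        super().__init__()
        self.num_experts = num_experts
        # (num_experts, output_size, input_size): one weight matrix per expert.
        self.weight = nn.Parameter(torch.randn(num_experts, output_size, input_size))

    def forward(self, inputs, expert_size):
        # Original-style forward: index the 3D Parameter directly.
        chunks = inputs.split(expert_size, dim=0)
        return torch.cat(
            [nn.functional.linear(chunks[i], self.weight[i]) for i in range(self.num_experts)],
            dim=0,
        )


def replace_3d_parameters(model):
    """Swap every 3D Parameter for an nn.ModuleList of per-expert nn.Linear modules."""
    for module in list(model.modules()):
        for param_name, param in list(module.named_parameters(recurse=False)):
            if param.dim() != 3:
                continue
            num_experts, out_features, in_features = param.shape
            experts = []
            for i in range(num_experts):
                linear = nn.Linear(in_features, out_features, bias=False,
                                   device=param.device, dtype=param.dtype)
                linear.weight.data = param.data[i]
                experts.append(linear)
            delattr(module, param_name)                           # drop the raw Parameter
            setattr(module, param_name, nn.ModuleList(experts))   # register per-expert submodules


def patched_forward(self, inputs, expert_size):
    # After the swap, self.weight is a ModuleList, so each expert is called as a module.
    chunks = inputs.split(expert_size, dim=0)
    return torch.cat([self.weight[i](chunks[i]) for i in range(self.num_experts)], dim=0)


experts = ToyParallelExperts(num_experts=4, input_size=16, output_size=8)
x = torch.randn(12, 16)
sizes = [3, 3, 3, 3]
ref = experts(x, sizes)

replace_3d_parameters(experts)
experts.forward = MethodType(patched_forward, experts)

assert torch.allclose(ref, experts(x, sizes), atol=1e-6)
# Prints ['weight.0', 'weight.1', 'weight.2', 'weight.3']: each expert is now an
# ordinary nn.Linear that find_layers-style discovery can enumerate and quantize.
print([n for n, m in experts.named_modules() if isinstance(m, nn.Linear)])

Exposing each expert as an nn.Linear is what lets the unmodified GPTQ path, which
filters on nn.Linear, hook and quantize experts one at a time; it is also why the
final granitemoe.py patch enumerates block_sparse_moe.input_linear.weight.{i} and
block_sparse_moe.output_linear.weight.{i} entries in layer_modules.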