From cba3127c707e7432913b619297047f1e8c02e423 Mon Sep 17 00:00:00 2001 From: Abhishek Date: Tue, 7 Jan 2025 16:23:07 -0500 Subject: [PATCH 01/13] test adddition of granite moe Signed-off-by: Abhishek --- .../gptqmodel/models/__init__.py | 1 + .../gptqmodel/models/_const.py | 1 + .../gptqmodel/models/auto.py | 2 + .../gptqmodel/models/granitemoe.py | 49 +++++++++++++++++++ 4 files changed, 53 insertions(+) create mode 100644 plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/granitemoe.py diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/__init__.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/__init__.py index 72383c1d..6c0e3c65 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/__init__.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/__init__.py @@ -22,6 +22,7 @@ from .gpt_bigcode import GPTBigCodeGPTQ from .gpt_neox import GPTNeoXGPTQ from .granite import GraniteGPTQ +from .granitemoe import GraniteMoeGPTQ from .llama import LlamaGPTQ from .mistral import MistralGPTQ from .mixtral import MixtralGPTQ diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/_const.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/_const.py index 23c4baa3..14e10054 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/_const.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/_const.py @@ -28,6 +28,7 @@ "granite", "gemma", "dbrx_converted", + "granitemoe" ] EXLLAMA_DEFAULT_MAX_INPUT_LENGTH = 2048 diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/auto.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/auto.py index 23a61a87..0b1be8fc 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/auto.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/auto.py @@ -29,6 +29,7 @@ from .gpt_bigcode import GPTBigCodeGPTQ from .gpt_neox import GPTNeoXGPTQ from .granite import GraniteGPTQ +from .granitemoe import GraniteMoeGPTQ from .llama import LlamaGPTQ from .mistral import MistralGPTQ from .mixtral import MixtralGPTQ @@ -43,6 +44,7 @@ "granite": GraniteGPTQ, "dbrx": DbrxGPTQ, "dbrx_converted": DbrxConvertedGPTQ, + "granitemoe": GraniteMoeGPTQ } at_least_one_cuda_v6 = any( diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/granitemoe.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/granitemoe.py new file mode 100644 index 00000000..fe101295 --- /dev/null +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/granitemoe.py @@ -0,0 +1,49 @@ +############################################################################### +# Adapted from https://github.com/ModelCloud/GPTQModel +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+############################################################################### +# Local +from .base import BaseGPTQModel + + +class GraniteMoeGPTQ(BaseGPTQModel): + base_modules = ["model.embed_tokens", "model.norm"] + + layers_node = "model.layers" + layer_type = "GraniteMoeDecoderLayer" + layer_modules = [ + ["self_attn.k_proj", "self_attn.v_proj", "self_attn.q_proj"], + ["self_attn.o_proj"], + [ + "block_sparse_moe.input_linear.experts.0", + "block_sparse_moe.input_linear.experts.1", + "block_sparse_moe.input_linear.experts.2", + "block_sparse_moe.input_linear.experts.3", + "block_sparse_moe.input_linear.experts.4", + "block_sparse_moe.input_linear.experts.5", + "block_sparse_moe.input_linear.experts.6", + "block_sparse_moe.input_linear.experts.7", + ], + [ + "block_sparse_moe.output_linear.experts.0", + "block_sparse_moe.output_linear.experts.1", + "block_sparse_moe.output_linear.experts.2", + "block_sparse_moe.output_linear.experts.3", + "block_sparse_moe.output_linear.experts.4", + "block_sparse_moe.output_linear.experts.5", + "block_sparse_moe.output_linear.experts.6", + "block_sparse_moe.output_linear.experts.7", + ], + ["block_sparse_moe.router.layer"], + ] From a103f5d55190b16da3d74a40a2a886a7c8f79212 Mon Sep 17 00:00:00 2001 From: Abhishek Date: Tue, 7 Jan 2025 16:25:20 -0500 Subject: [PATCH 02/13] test adddition of granite moe Signed-off-by: Abhishek --- .../src/fms_acceleration_peft/gptqmodel/models/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/base.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/base.py index 85c17ee8..b3921ff2 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/base.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/base.py @@ -558,7 +558,7 @@ def save_quantized( self.quantize_config.meta_set_versionable( key=META_FIELD_QUANTIZER, value=META_QUANTIZER_GPTQMODEL, - version=__version__, + version="1.0.0", ) # The config, quantize_config and model may be edited in place in save_quantized. 
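The patches that follow revolve around one structural detail of GraniteMoE: unlike Mixtral-style models, its experts are not exposed as separate nn.Linear modules. GraniteMoeParallelExperts holds a single 3D parameter of shape [num_experts, out_features, in_features], while GPTQ operates on 2D weight matrices. Below is a minimal sketch of the flatten/restore round trip this implies; the sizes are illustrative toy values and the snippet is not part of the patch series.

    # Illustrative only: flatten a 3D expert weight into the 2D layout GPTQ expects,
    # then restore the original [num_experts, out_features, in_features] layout.
    import torch

    num_experts, out_features, in_features = 8, 16, 32  # assumed toy sizes
    w3d = torch.randn(num_experts, out_features, in_features)

    # rows = num_experts * out_features, cols = in_features
    w2d = w3d.reshape(num_experts * out_features, in_features)

    # ... per-column quantization would operate on w2d here ...

    restored = w2d.reshape(num_experts, out_features, in_features)
    assert torch.equal(restored, w3d)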
From 3fe665ec61fb2696edf07bfa0c236b6d759b6e3e Mon Sep 17 00:00:00 2001 From: Abhishek Date: Tue, 7 Jan 2025 18:57:26 -0500 Subject: [PATCH 03/13] test adddition of granite moe Signed-off-by: Abhishek --- .../gptqmodel/models/granitemoe.py | 18 ++---------------- 1 file changed, 2 insertions(+), 16 deletions(-) diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/granitemoe.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/granitemoe.py index fe101295..6423163e 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/granitemoe.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/granitemoe.py @@ -26,24 +26,10 @@ class GraniteMoeGPTQ(BaseGPTQModel): ["self_attn.k_proj", "self_attn.v_proj", "self_attn.q_proj"], ["self_attn.o_proj"], [ - "block_sparse_moe.input_linear.experts.0", - "block_sparse_moe.input_linear.experts.1", - "block_sparse_moe.input_linear.experts.2", - "block_sparse_moe.input_linear.experts.3", - "block_sparse_moe.input_linear.experts.4", - "block_sparse_moe.input_linear.experts.5", - "block_sparse_moe.input_linear.experts.6", - "block_sparse_moe.input_linear.experts.7", + "block_sparse_moe.input_linear", ], [ - "block_sparse_moe.output_linear.experts.0", - "block_sparse_moe.output_linear.experts.1", - "block_sparse_moe.output_linear.experts.2", - "block_sparse_moe.output_linear.experts.3", - "block_sparse_moe.output_linear.experts.4", - "block_sparse_moe.output_linear.experts.5", - "block_sparse_moe.output_linear.experts.6", - "block_sparse_moe.output_linear.experts.7", + "block_sparse_moe.output_linear", ], ["block_sparse_moe.router.layer"], ] From 775e9fb5e2b19c227c1df903f3632cb9e8d07c4d Mon Sep 17 00:00:00 2001 From: Abhishek Date: Tue, 7 Jan 2025 20:15:27 -0500 Subject: [PATCH 04/13] test adddition of granite moe Signed-off-by: Abhishek --- .../fms_acceleration_peft/gptqmodel/models/granitemoe.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/granitemoe.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/granitemoe.py index 6423163e..a15318c3 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/granitemoe.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/granitemoe.py @@ -25,11 +25,6 @@ class GraniteMoeGPTQ(BaseGPTQModel): layer_modules = [ ["self_attn.k_proj", "self_attn.v_proj", "self_attn.q_proj"], ["self_attn.o_proj"], - [ - "block_sparse_moe.input_linear", - ], - [ - "block_sparse_moe.output_linear", - ], - ["block_sparse_moe.router.layer"], + ["block_sparse_moe.input_linear", "block_sparse_moe.output_linear", "block_sparse_moe.router.layer"], + ["input_layernorm", "post_attention_layernorm"] ] From 56dfbefccaff023ca9626002a8d61482a437e2f6 Mon Sep 17 00:00:00 2001 From: Abhishek Date: Tue, 7 Jan 2025 20:55:21 -0500 Subject: [PATCH 05/13] test adddition of granite moe Signed-off-by: Abhishek --- .../src/fms_acceleration_peft/gptqmodel/models/granitemoe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/granitemoe.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/granitemoe.py index a15318c3..6552376a 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/granitemoe.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/granitemoe.py @@ -25,6 
+25,6 @@ class GraniteMoeGPTQ(BaseGPTQModel): layer_modules = [ ["self_attn.k_proj", "self_attn.v_proj", "self_attn.q_proj"], ["self_attn.o_proj"], - ["block_sparse_moe.input_linear", "block_sparse_moe.output_linear", "block_sparse_moe.router.layer"], + ["block_sparse_moe.input_linear", "block_sparse_moe.output_linear"], ["input_layernorm", "post_attention_layernorm"] ] From c23816b2446aeb83c98ffa6e1cc65fb95b2ff2c6 Mon Sep 17 00:00:00 2001 From: Abhishek Date: Thu, 23 Jan 2025 23:23:41 -0500 Subject: [PATCH 06/13] Merge and test Signed-off-by: Abhishek --- .../framework_plugin_scattermoe.py | 35 ++++++++++--------- plugins/accelerated-peft/pyproject.toml | 2 +- .../framework_plugin_autogptq.py | 2 +- .../framework_plugin_bnb.py | 2 +- .../gptqmodel/quantization/gptq.py | 22 ++++++++++++ .../gptqmodel/utils/model.py | 6 ++++ .../tests/test_peft_plugins.py | 8 ++--- .../pyproject.toml | 2 +- .../framework_plugin_multipack.py | 2 +- .../framework_plugin_padding_free.py | 2 +- plugins/framework/README.md | 2 +- plugins/framework/pyproject.toml | 2 +- .../src/fms_acceleration/framework.py | 8 ++--- .../src/fms_acceleration/framework_plugin.py | 2 +- .../src/fms_acceleration/utils/test_utils.py | 10 +++--- plugins/framework/tests/test_framework.py | 34 +++++++++--------- plugins/fused-ops-and-kernels/pyproject.toml | 2 +- .../framework_plugin_fast_kernels.py | 2 +- .../tests/test_foak_plugins.py | 2 +- 19 files changed, 88 insertions(+), 59 deletions(-) diff --git a/plugins/accelerated-moe/src/fms_acceleration_moe/framework_plugin_scattermoe.py b/plugins/accelerated-moe/src/fms_acceleration_moe/framework_plugin_scattermoe.py index 148a5488..528693ea 100644 --- a/plugins/accelerated-moe/src/fms_acceleration_moe/framework_plugin_scattermoe.py +++ b/plugins/accelerated-moe/src/fms_acceleration_moe/framework_plugin_scattermoe.py @@ -13,11 +13,12 @@ # limitations under the License. # Standard -from typing import Dict +from typing import Dict, Tuple # Third Party from fms_acceleration import AccelerationPlugin -from transformers import AutoModelForCausalLM +from peft import LoraConfig +from transformers import TrainingArguments import torch # Local @@ -52,21 +53,27 @@ def __init__(self, configurations: Dict[str, Dict]): ) @property - def requires_custom_loading(self): + def requires_augmentation(self): return True - def model_loader(self, model_name: str, **kwargs): - - # load the model - model = AutoModelForCausalLM.from_pretrained(model_name, **kwargs) - + def augmentation( + self, + model, + train_args: TrainingArguments, + modifiable_args: Tuple[LoraConfig], + ): rank, world_size = 0, 1 if torch.distributed.is_initialized(): world_size = torch.distributed.get_world_size() rank = torch.distributed.get_rank() - # shard the MOE, and store the component names, eventually needed - # to configure the FSDP + if not hasattr(model.config, "name_or_path") or not model.config.name_or_path: + raise ValueError( + "The model configuration is missing the 'name_or_path' attribute." + ) + + model_name = model.config.name_or_path + self._moe_component_module_names = prepare_scattermoe( model, checkpoint_name_or_path=model_name, @@ -75,13 +82,7 @@ def model_loader(self, model_name: str, **kwargs): ep_degree=self._ep_degree, mixed_precision=False, # Currently this is hardcoded to OFF ) - - # NOTE: there is currently no good way to get the mixed precision - # flag from train_args. It will be better to handle this if - # when we move the sharding to augmentation. 
- # https://github.com/foundation-model-stack/fms-acceleration/issues/103 - - return model + return model, modifiable_args def get_callbacks_and_ready_for_train( self, model: torch.nn.Module = None, accelerator=None diff --git a/plugins/accelerated-peft/pyproject.toml b/plugins/accelerated-peft/pyproject.toml index e6e545c7..a7dc4464 100644 --- a/plugins/accelerated-peft/pyproject.toml +++ b/plugins/accelerated-peft/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "fms-acceleration-peft" -version = '0.3.5.dev' +version = '0.4.0.dev' description = "FMS Acceleration for PeFT" authors = [ {name = "Fabian Lim", email = "flim@sg.ibm.com"}, diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/framework_plugin_autogptq.py b/plugins/accelerated-peft/src/fms_acceleration_peft/framework_plugin_autogptq.py index 41ea2d6f..99568a94 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/framework_plugin_autogptq.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/framework_plugin_autogptq.py @@ -241,7 +241,7 @@ def requires_custom_loading(self): return True @property - def requires_agumentation(self): + def requires_augmentation(self): return True def augmentation( diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/framework_plugin_bnb.py b/plugins/accelerated-peft/src/fms_acceleration_peft/framework_plugin_bnb.py index b7202add..1900002d 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/framework_plugin_bnb.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/framework_plugin_bnb.py @@ -185,7 +185,7 @@ def requires_custom_loading(self): return True @property - def requires_agumentation(self): + def requires_augmentation(self): # will skip the augmentation if _no_peft_model == True return not self._no_peft_model diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/quantization/gptq.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/quantization/gptq.py index 470ed3fb..70e1d505 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/quantization/gptq.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/quantization/gptq.py @@ -30,6 +30,19 @@ def __init__(self, layer): W = W.flatten(1) if isinstance(self.layer, transformers.pytorch_utils.Conv1D): W = W.t() + + # Suppose your layer weight is [num_experts, out_features, in_features]. + original_shape = layer.weight.shape # e.g. 
(num_experts, out_features, in_features) + + if len(original_shape) == 3: + # Flatten to 2D so GPTQ can treat it as rows × cols + # rows = num_experts * out_features, cols = in_features + W = W.reshape(original_shape[0] * original_shape[1], original_shape[2]) + self._is_3d = True + self._original_shape = original_shape + else: + self._is_3d = False # 2D + self.rows = W.shape[0] self.columns = W.shape[1] self.H = torch.zeros((self.columns, self.columns), device=self.dev) @@ -196,6 +209,15 @@ def fasterquant( self.layer.weight.data = Q.reshape(self.layer.weight.shape).type_as( self.layer.weight.data ) + + # Q is 2D after the Cholesky-based quantization step + if self._is_3d: + # Reshape Q back to [num_experts, out_features, in_features] + Q = Q.reshape(self._original_shape) + + # Now assign it back to the parameter + self.layer.weight.data = Q.type_as(self.layer.weight.data) + if os.environ.get("DEBUG"): logger.debug(torch.sum((self.layer(self.inp1) - self.out1) ** 2)) diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/model.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/model.py index d51e0e60..d5bcc15c 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/model.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/model.py @@ -106,6 +106,12 @@ def find_layers(module, layers=None, name=""): for layer in layers: if isinstance(module, layer): return {name: module} + + # ADD FOR module GraniteMoeParallelExperts: https://github.com/huggingface/transformers/blob/b5aaf875090388e2bbdbf2d8641ed7967365f435/src/transformers/models/granitemoe/modeling_granitemoe.py#L258C7-L258C32 + if hasattr(module, "weight") and isinstance(module.weight, torch.nn.Parameter): + if module.weight.ndim == 3: + return {name: module} + res = {} for name1, child in module.named_children(): res.update( diff --git a/plugins/accelerated-peft/tests/test_peft_plugins.py b/plugins/accelerated-peft/tests/test_peft_plugins.py index 38534d5d..d36b3ce8 100644 --- a/plugins/accelerated-peft/tests/test_peft_plugins.py +++ b/plugins/accelerated-peft/tests/test_peft_plugins.py @@ -54,7 +54,7 @@ def test_configure_gptq_plugin(): # check flags and callbacks assert framework.requires_custom_loading - assert framework.requires_agumentation + assert framework.requires_augmentation assert len(framework.get_callbacks_and_ready_for_train()) == 0 # attempt to activate plugin with configuration pointing to wrong path @@ -171,7 +171,7 @@ def test_configure_bnb_plugin(): # check flags and callbacks assert framework.requires_custom_loading - assert framework.requires_agumentation + assert framework.requires_augmentation assert len(framework.get_callbacks_and_ready_for_train()) == 0 # test valid combinatinos @@ -187,7 +187,7 @@ def test_configure_bnb_plugin(): ): # check flags and callbacks assert framework.requires_custom_loading - assert framework.requires_agumentation + assert framework.requires_augmentation assert len(framework.get_callbacks_and_ready_for_train()) == 0 # test no_peft_model is true skips plugin.augmentation @@ -202,7 +202,7 @@ def test_configure_bnb_plugin(): require_packages_check=False, ): # check flags and callbacks - assert (not correct_value) == framework.requires_agumentation + assert (not correct_value) == framework.requires_augmentation # attempt to activate plugin with configuration pointing to wrong path # - raise with message that no plugins can be configured diff --git 
a/plugins/attention-and-distributed-packing/pyproject.toml b/plugins/attention-and-distributed-packing/pyproject.toml index fdbb3ac1..e755ac56 100644 --- a/plugins/attention-and-distributed-packing/pyproject.toml +++ b/plugins/attention-and-distributed-packing/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "fms-acceleration-aadp" -version = '0.1.1.dev' +version = '0.2.0.dev' description = "FMS Acceleration Plugin for Attention and Distributed Packing Optimizations" authors = [ {name = "Fabian Lim", email = "flim@sg.ibm.com"}, diff --git a/plugins/attention-and-distributed-packing/src/fms_acceleration_aadp/framework_plugin_multipack.py b/plugins/attention-and-distributed-packing/src/fms_acceleration_aadp/framework_plugin_multipack.py index aa9134a6..391743c6 100644 --- a/plugins/attention-and-distributed-packing/src/fms_acceleration_aadp/framework_plugin_multipack.py +++ b/plugins/attention-and-distributed-packing/src/fms_acceleration_aadp/framework_plugin_multipack.py @@ -61,7 +61,7 @@ def __init__( assert self._pad_token_id is not None, "need to get pad token id" @property - def requires_agumentation(self): + def requires_augmentation(self): return True def augmentation( diff --git a/plugins/attention-and-distributed-packing/src/fms_acceleration_aadp/framework_plugin_padding_free.py b/plugins/attention-and-distributed-packing/src/fms_acceleration_aadp/framework_plugin_padding_free.py index 0e4e5ef9..596b5600 100644 --- a/plugins/attention-and-distributed-packing/src/fms_acceleration_aadp/framework_plugin_padding_free.py +++ b/plugins/attention-and-distributed-packing/src/fms_acceleration_aadp/framework_plugin_padding_free.py @@ -41,7 +41,7 @@ def __init__(self, configurations: Dict[str, Dict]): ) @property - def requires_agumentation(self): + def requires_augmentation(self): return True def augmentation( diff --git a/plugins/framework/README.md b/plugins/framework/README.md index 5b3cbfd7..4895f322 100644 --- a/plugins/framework/README.md +++ b/plugins/framework/README.md @@ -45,7 +45,7 @@ model, (peft_config,) = framework.augmentation( ) ``` -We also provide `framework.requires_agumentation` to check if augumentation is required by the plugins. +We also provide `framework.requires_augmentation` to check if augumentation is required by the plugins. Finally pass the model to train: diff --git a/plugins/framework/pyproject.toml b/plugins/framework/pyproject.toml index c57e5f01..bb481c50 100644 --- a/plugins/framework/pyproject.toml +++ b/plugins/framework/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "fms-acceleration" -version = '0.5.0.dev' +version = '0.6.0.dev' description = "FMS Acceleration Plugin Framework" authors = [ {name = "Fabian Lim", email = "flim@sg.ibm.com"}, diff --git a/plugins/framework/src/fms_acceleration/framework.py b/plugins/framework/src/fms_acceleration/framework.py index 3a393815..75b436c9 100644 --- a/plugins/framework/src/fms_acceleration/framework.py +++ b/plugins/framework/src/fms_acceleration/framework.py @@ -199,10 +199,10 @@ def augmentation( x in model_archs for x in plugin.restricted_model_archs ): raise ValueError( - f"Model architectures in '{model_archs}' are supported for '{plugin_name}'." + f"Model architectures in '{model_archs}' are not supported for '{plugin_name}'." 
) - if plugin.requires_agumentation: + if plugin.requires_augmentation: model, modifiable_args = plugin.augmentation( model, train_args, modifiable_args=modifiable_args ) @@ -214,8 +214,8 @@ def requires_custom_loading(self): return len(self.plugins_require_custom_loading) > 0 @property - def requires_agumentation(self): - return any(x.requires_agumentation for _, x in self.active_plugins) + def requires_augmentation(self): + return any(x.requires_augmentation for _, x in self.active_plugins) def get_callbacks_and_ready_for_train( self, model: torch.nn.Module = None, accelerator: Accelerator = None diff --git a/plugins/framework/src/fms_acceleration/framework_plugin.py b/plugins/framework/src/fms_acceleration/framework_plugin.py index 28fecebf..94ea4ffa 100644 --- a/plugins/framework/src/fms_acceleration/framework_plugin.py +++ b/plugins/framework/src/fms_acceleration/framework_plugin.py @@ -171,7 +171,7 @@ def requires_custom_loading(self): return False @property - def requires_agumentation(self): + def requires_augmentation(self): return False def model_loader(self, model_name: str, **kwargs): diff --git a/plugins/framework/src/fms_acceleration/utils/test_utils.py b/plugins/framework/src/fms_acceleration/utils/test_utils.py index b1f731d1..6a3bc123 100644 --- a/plugins/framework/src/fms_acceleration/utils/test_utils.py +++ b/plugins/framework/src/fms_acceleration/utils/test_utils.py @@ -159,8 +159,8 @@ def create_plugin_cls( restricted_models: Set = None, require_pkgs: Set = None, requires_custom_loading: bool = False, - requires_agumentation: bool = False, - agumentation: Callable = None, + requires_augmentation: bool = False, + augmentation: Callable = None, model_loader: Callable = None, ): "helper function to create plugin class" @@ -174,11 +174,11 @@ def create_plugin_cls( "restricted_model_archs": restricted_models, "require_packages": require_pkgs, "requires_custom_loading": requires_custom_loading, - "requires_agumentation": requires_agumentation, + "requires_augmentation": requires_augmentation, } - if agumentation is not None: - attributes["augmentation"] = agumentation + if augmentation is not None: + attributes["augmentation"] = augmentation if model_loader is not None: attributes["model_loader"] = model_loader diff --git a/plugins/framework/tests/test_framework.py b/plugins/framework/tests/test_framework.py index 4fd43eb2..b3f4eb9e 100644 --- a/plugins/framework/tests/test_framework.py +++ b/plugins/framework/tests/test_framework.py @@ -68,7 +68,7 @@ def test_model_with_no_config_raises(): # create model and (incomplete) plugin with requires_augmentation = True model_no_config = torch.nn.Module() # empty model - incomplete_plugin = create_plugin_cls(requires_agumentation=True) + incomplete_plugin = create_plugin_cls(requires_augmentation=True) # register and activate 1 incomplete plugin, and: # 1. test correct plugin registration and activation. 
@@ -104,13 +104,13 @@ def test_single_plugin(): empty_plugin = create_plugin_cls() incomplete_plugin = create_plugin_cls( restricted_models={"CausalLM"}, - requires_agumentation=True, + requires_augmentation=True, ) plugin = create_plugin_cls( restricted_models={"CausalLM"}, - requires_agumentation=True, + requires_augmentation=True, requires_custom_loading=True, - agumentation=dummy_augmentation, + augmentation=dummy_augmentation, model_loader=dummy_custom_loader, ) train_args = None # dummy for now @@ -175,32 +175,32 @@ def test_two_plugins(): model = create_noop_model_with_archs(archs=["CausalLM"]) incomp_plugin1 = create_plugin_cls( - restricted_models={"CausalLM"}, requires_agumentation=True + restricted_models={"CausalLM"}, requires_augmentation=True ) - incomp_plugin2 = create_plugin_cls(requires_agumentation=True) + incomp_plugin2 = create_plugin_cls(requires_augmentation=True) incomp_plugin3 = create_plugin_cls( - class_name="PluginNoop2", requires_agumentation=True + class_name="PluginNoop2", requires_augmentation=True ) plugin1 = create_plugin_cls( restricted_models={"CausalLM"}, - requires_agumentation=True, + requires_augmentation=True, requires_custom_loading=True, - agumentation=dummy_augmentation, + augmentation=dummy_augmentation, model_loader=dummy_custom_loader, ) plugin2 = create_plugin_cls( class_name="PluginNoop2", restricted_models={"CausalLM"}, - requires_agumentation=True, + requires_augmentation=True, requires_custom_loading=True, - agumentation=dummy_augmentation, + augmentation=dummy_augmentation, model_loader=dummy_custom_loader, ) plugin3_no_loader = create_plugin_cls( class_name="PluginNoop2", restricted_models={"CausalLM"}, - requires_agumentation=True, - agumentation=dummy_augmentation, + requires_augmentation=True, + augmentation=dummy_augmentation, ) train_args = None # dummy for now @@ -299,8 +299,8 @@ def _hook( for class_name in ["PluginDEF", "PluginABC"]: plugin = create_plugin_cls( class_name=class_name, - requires_agumentation=True, - agumentation=hook_builder(act_order=plugin_activation_order), + requires_augmentation=True, + augmentation=hook_builder(act_order=plugin_activation_order), ) plugins_to_be_installed.append((class_name, plugin)) @@ -319,8 +319,8 @@ def test_plugin_registration_combination_logic(): plugin = create_plugin_cls( restricted_models={"CausalLM"}, - requires_agumentation=True, - agumentation=dummy_augmentation, + requires_augmentation=True, + augmentation=dummy_augmentation, ) configuration_contents = {"existing1": {"key1": 1}, "existing2": {"key1": 1}} diff --git a/plugins/fused-ops-and-kernels/pyproject.toml b/plugins/fused-ops-and-kernels/pyproject.toml index 5a003712..5bb23e18 100644 --- a/plugins/fused-ops-and-kernels/pyproject.toml +++ b/plugins/fused-ops-and-kernels/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "fms-acceleration-foak" -version = '0.4.0.dev' +version = '0.5.0.dev' description = "FMS Acceleration using Fused Operations and Kernels" authors = [ {name = "Fabian Lim", email = "flim@sg.ibm.com"}, diff --git a/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/framework_plugin_fast_kernels.py b/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/framework_plugin_fast_kernels.py index df21fd5c..0d7ce802 100644 --- a/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/framework_plugin_fast_kernels.py +++ b/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/framework_plugin_fast_kernels.py @@ -128,7 +128,7 @@ def __init__(self, configurations: Dict[str, Dict]): 
) @property - def requires_agumentation(self): + def requires_augmentation(self): return True def augmentation( diff --git a/plugins/fused-ops-and-kernels/tests/test_foak_plugins.py b/plugins/fused-ops-and-kernels/tests/test_foak_plugins.py index 11e91ff6..9d1c0c97 100644 --- a/plugins/fused-ops-and-kernels/tests/test_foak_plugins.py +++ b/plugins/fused-ops-and-kernels/tests/test_foak_plugins.py @@ -47,7 +47,7 @@ def test_configure_gptq_foak_plugin(): # check flags and callbacks assert framework.requires_custom_loading is False - assert framework.requires_agumentation + assert framework.requires_augmentation assert len(framework.get_callbacks_and_ready_for_train()) == 0 # attempt to activate plugin with configuration pointing to wrong path From cff9a59c74f4b4a6873f8e77d29d14688f8351ae Mon Sep 17 00:00:00 2001 From: Abhishek Date: Mon, 27 Jan 2025 22:29:29 -0500 Subject: [PATCH 07/13] Changes for __init__ and find_layers Signed-off-by: Abhishek --- .../gptqmodel/quantization/gptq.py | 436 ++++++++++-------- .../gptqmodel/utils/model.py | 17 +- 2 files changed, 263 insertions(+), 190 deletions(-) diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/quantization/gptq.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/quantization/gptq.py index 70e1d505..0ac9e203 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/quantization/gptq.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/quantization/gptq.py @@ -11,6 +11,7 @@ import torch import torch.nn as nn import transformers +import transformers.models.granitemoe.modeling_granitemoe as MOE # Local from .quantizer import Quantizer @@ -25,59 +26,112 @@ class GPTQ: def __init__(self, layer): self.layer = layer self.dev = self.layer.weight.device - W = layer.weight.data.clone() - if isinstance(self.layer, nn.Conv2d): - W = W.flatten(1) - if isinstance(self.layer, transformers.pytorch_utils.Conv1D): - W = W.t() - - # Suppose your layer weight is [num_experts, out_features, in_features]. - original_shape = layer.weight.shape # e.g. 
(num_experts, out_features, in_features) - - if len(original_shape) == 3: - # Flatten to 2D so GPTQ can treat it as rows × cols - # rows = num_experts * out_features, cols = in_features - W = W.reshape(original_shape[0] * original_shape[1], original_shape[2]) - self._is_3d = True - self._original_shape = original_shape - else: - self._is_3d = False # 2D - - self.rows = W.shape[0] - self.columns = W.shape[1] - self.H = torch.zeros((self.columns, self.columns), device=self.dev) - self.nsamples = 0 self.quantizer = Quantizer() + # print("GPTQ INIT: layer, type(layer)", layer, type(layer)) + # print("GPTQ INIT: layer.weight.data", layer.weight.data) + + + if isinstance(layer, MOE.GraniteMoeParallelExperts): + self.is_moe = True + self.num_experts = layer.num_experts + self.out_features = layer.output_size + self.in_features = layer.input_size + # print("GPTQ INIT: self.num_experts, self.out_features, self.in_features", self.num_experts, self.out_features, self.in_features) + + # Separate W for each expert + self.W_list, self.H_list, self.nsamples_list = [], [], [] + for i in range(self.num_experts): + # Each expert slice is of shape [out_features, in_features] + self.W_list.append(layer.weight.data[i].clone()) + + # For each expert param, we have a Hessian and sample count + self.H_list.append(torch.zeros((self.in_features, self.in_features), device=self.dev)) + self.nsamples_list.append(0) + + # print("GPTQ INIT: self.W_list, len(self.W_list), type(self.W_list[0]), self.W_list[0].size(), self.W_list[0].dim()", self.W_list, len(self.W_list), type(self.W_list[0]), self.W_list[0].size(), self.W_list[0].dim()) + # print("GPTQ INIT: self.H_list, len(self.H_list), type(self.H_list[0]), self.H_list[0].size(), self.H_list[0].dim()", self.H_list, len(self.H_list), type(self.H_list[0]), self.H_list[0].size(), self.H_list[0].dim()) + + else: + # For 2D layer (linear, conv, etc.), we have a single Hessian and sample count + self.is_moe = False + W = layer.weight.data.clone() + if isinstance(layer, nn.Conv2d): + W = W.flatten(1) + if isinstance(layer, transformers.pytorch_utils.Conv1D): + W = W.t() + # print("GPTQ INIT: W, type(W), W.size(), W.dim()", W, type(W), W.size(), W.dim()) + + self.rows = W.shape[0] + self.columns = W.shape[1] + self.H = torch.zeros((self.columns, self.columns), device=self.dev) + self.nsamples = 0 + # print("GPTQ INIT: self.H, type(self.H), self.H.size(), self.H.dim()", self.H, type(self.H), self.H.size(), self.H.dim()) + # print("GPTQ INIT: self.rows, self.columns", self.rows, self.columns) + def add_batch(self, inp, out): if os.environ.get("DEBUG"): self.inp1 = inp self.out1 = out - if len(inp.shape) == 2: - inp = inp.unsqueeze(0) - tmp = inp.shape[0] - if isinstance(self.layer, nn.Linear) or isinstance( - self.layer, transformers.Conv1D - ): - if len(inp.shape) == 3: - inp = inp.reshape((-1, inp.shape[-1])) - inp = inp.t() - if isinstance(self.layer, nn.Conv2d): - unfold = nn.Unfold( - self.layer.kernel_size, - dilation=self.layer.dilation, - padding=self.layer.padding, - stride=self.layer.stride, - ) - inp = unfold(inp) - inp = inp.permute([1, 0, 2]) - inp = inp.flatten(1) - self.H *= self.nsamples / (self.nsamples + tmp) - self.nsamples += tmp - # inp = inp.float() - inp = math.sqrt(2 / self.nsamples) * inp.float() - # self.H += 2 / self.nsamples * inp.matmul(inp.t()) - self.H += inp.matmul(inp.t()) + # Update entire H_list and nsamples_list + if self.is_moe: + # print("INSIDE ADD_BATCH FOR MOE: inp, type(inp), inp.shape", inp, type(inp), inp.shape) + for expert_idx in 
range(self.num_experts): + H = self.H_list[expert_idx] + nsamples = self.nsamples_list[expert_idx] + + # if len(inp.shape) == 2: + # inp = inp.unsqueeze(0) + # tmp = inp.shape[0] + # print("INSIDE ADD_BATCH FOR MOE 2: inp, inp.shape, tmp", inp, inp.shape, tmp) + + # Below is doing reverse of above + # if len(inp.shape) == 3: + # inp = inp.reshape((-1, inp.shape[-1])) + # print("INSIDE ADD_BATCH FOR MOE 3: inp, inp.shape, tmp", inp, inp.shape, tmp) + + tmp = 1 + # len(inp.shape) == 2 in this case + mod_inp = inp.t() + # print("INSIDE ADD_BATCH FOR MOE 4: inp, inp.shape, tmp", inp, inp.shape, tmp) + + H *= nsamples / (nsamples + tmp) + nsamples += tmp + mod_inp = math.sqrt(2 / nsamples) * mod_inp.float() + H += mod_inp.matmul(mod_inp.t()) + + self.H_list[expert_idx] = H + self.nsamples_list[expert_idx] = nsamples + else: + # print("INSIDE ADD_BATCH FOR 2D") + # print("INSIDE ADD_BATCH 1: inp", inp, type(inp), inp.shape) + if len(inp.shape) == 2: + inp = inp.unsqueeze(0) + tmp = inp.shape[0] + if isinstance(self.layer, nn.Linear) or isinstance( + self.layer, transformers.Conv1D + ): + if len(inp.shape) == 3: + inp = inp.reshape((-1, inp.shape[-1])) + inp = inp.t() + if isinstance(self.layer, nn.Conv2d): + unfold = nn.Unfold( + self.layer.kernel_size, + dilation=self.layer.dilation, + padding=self.layer.padding, + stride=self.layer.stride, + ) + inp = unfold(inp) + inp = inp.permute([1, 0, 2]) + inp = inp.flatten(1) + # print("INSIDE ADD_BATCH 2: BEFORE tmp, self.H, self.nsamples", tmp, self.H, self.nsamples) + self.H *= self.nsamples / (self.nsamples + tmp) + self.nsamples += tmp + # print("INSIDE ADD_BATCH 3: AFTER tmp, self.H, self.nsamples", tmp, self.H, self.nsamples) + # inp = inp.float() + inp = math.sqrt(2 / self.nsamples) * inp.float() + # self.H += 2 / self.nsamples * inp.matmul(inp.t()) + self.H += inp.matmul(inp.t()) def fasterquant( self, @@ -87,146 +141,162 @@ def fasterquant( actorder=False, static_groups=False, ): - W = self.layer.weight.data.clone() - if isinstance(self.layer, nn.Conv2d): - W = W.flatten(1) - if isinstance(self.layer, transformers.Conv1D): - W = W.t() - W = W.float() - - tick = time.time() - - if not self.quantizer.ready(): - self.quantizer.find_params(W, weight=True) - - H = self.H - del self.H - dead = torch.diag(H) == 0 - H[dead, dead] = 1 - W[:, dead] = 0 - - g_idx = [] - scale = [] - zero = [] - now_idx = 1 - - if static_groups: - # Standard - import copy - - groups = [] - for i in range(0, self.columns, group_size): - quantizer = copy.deepcopy(self.quantizer) - quantizer.find_params(W[:, i : (i + group_size)], weight=True) - scale.append(quantizer.scale) - zero.append(quantizer.zero) - groups.append(quantizer) - - if actorder: - perm = torch.argsort(torch.diag(H), descending=True) - W = W[:, perm] - H = H[perm][:, perm] - invperm = torch.argsort(perm) - - Losses = torch.zeros_like(W) - Q = torch.zeros_like(W) - - damp = percdamp * torch.mean(torch.diag(H)) - diag = torch.arange(self.columns, device=self.dev) - H[diag, diag] += damp - H = torch.linalg.cholesky(H) - H = torch.cholesky_inverse(H) - H = torch.linalg.cholesky(H, upper=True) - Hinv = H - - for i1 in range(0, self.columns, blocksize): - i2 = min(i1 + blocksize, self.columns) - count = i2 - i1 - - W1 = W[:, i1:i2].clone() - Q1 = torch.zeros_like(W1) - Err1 = torch.zeros_like(W1) - Losses1 = torch.zeros_like(W1) - Hinv1 = Hinv[i1:i2, i1:i2] - - for i in range(count): - w = W1[:, i] - d = Hinv1[i, i] - - if group_size != -1: - if not static_groups: - if (i1 + i) % group_size == 0: - 
self.quantizer.find_params( - W[:, (i1 + i) : (i1 + i + group_size)], weight=True - ) - - if ((i1 + i) // group_size) - now_idx == -1: - scale.append(self.quantizer.scale) - zero.append(self.quantizer.zero) - now_idx += 1 - else: - idx = i1 + i - if actorder: - idx = perm[idx] - self.quantizer = groups[idx // group_size] - - q = self.quantizer.quantize(w.unsqueeze(1)).flatten() - Q1[:, i] = q - Losses1[:, i] = (w - q) ** 2 / d**2 - - err1 = (w - q) / d - W1[:, i:] -= err1.unsqueeze(1).matmul(Hinv1[i, i:].unsqueeze(0)) - Err1[:, i] = err1 - - Q[:, i1:i2] = Q1 - Losses[:, i1:i2] = Losses1 / 2 - - W[:, i2:] -= Err1.matmul(Hinv[i1:i2, i2:]) + if self.is_moe: + # For MoE model + # Loop over each expert param and quantize it separately + t_start = time.time() + scale_list = [] + zero_list = [] + gidx_list = [] + loss_list = [] + + for i in range(self.num_experts): + W = self.W_list[i] # shape [out_features, in_features] + H = self.H_list[i] + nsamples = self.nsamples_list[i] + + ##### TODO: QUANTIZATION FOR MOE LAYER ##### + + duration = time.time() - t_start + final_scale = torch.cat(scale_list, dim=1) if scale_list else torch.tensor([], device=self.dev) + final_zero = torch.cat(zero_list, dim=1) if zero_list else torch.tensor([], device=self.dev) + final_gidx = torch.cat(gidx_list, dim=0) if gidx_list else torch.tensor([], device=self.dev) + avg_loss = sum(loss_list) / (len(loss_list) + 1e-9) + + return final_scale, final_zero, final_gidx, duration, avg_loss + else: + W = self.layer.weight.data.clone() + if isinstance(self.layer, nn.Conv2d): + W = W.flatten(1) + if isinstance(self.layer, transformers.Conv1D): + W = W.t() + W = W.float() + + tick = time.time() + + if not self.quantizer.ready(): + self.quantizer.find_params(W, weight=True) + + H = self.H + del self.H + dead = torch.diag(H) == 0 + H[dead, dead] = 1 + W[:, dead] = 0 + + g_idx = [] + scale = [] + zero = [] + now_idx = 1 + + if static_groups: + # Standard + import copy + + groups = [] + for i in range(0, self.columns, group_size): + quantizer = copy.deepcopy(self.quantizer) + quantizer.find_params(W[:, i : (i + group_size)], weight=True) + scale.append(quantizer.scale) + zero.append(quantizer.zero) + groups.append(quantizer) + + if actorder: + perm = torch.argsort(torch.diag(H), descending=True) + W = W[:, perm] + H = H[perm][:, perm] + invperm = torch.argsort(perm) + + Losses = torch.zeros_like(W) + Q = torch.zeros_like(W) + + damp = percdamp * torch.mean(torch.diag(H)) + diag = torch.arange(self.columns, device=self.dev) + H[diag, diag] += damp + H = torch.linalg.cholesky(H) + H = torch.cholesky_inverse(H) + H = torch.linalg.cholesky(H, upper=True) + Hinv = H + + for i1 in range(0, self.columns, blocksize): + i2 = min(i1 + blocksize, self.columns) + count = i2 - i1 + + W1 = W[:, i1:i2].clone() + Q1 = torch.zeros_like(W1) + Err1 = torch.zeros_like(W1) + Losses1 = torch.zeros_like(W1) + Hinv1 = Hinv[i1:i2, i1:i2] + + for i in range(count): + w = W1[:, i] + d = Hinv1[i, i] + + if group_size != -1: + if not static_groups: + if (i1 + i) % group_size == 0: + self.quantizer.find_params( + W[:, (i1 + i) : (i1 + i + group_size)], weight=True + ) + + if ((i1 + i) // group_size) - now_idx == -1: + scale.append(self.quantizer.scale) + zero.append(self.quantizer.zero) + now_idx += 1 + else: + idx = i1 + i + if actorder: + idx = perm[idx] + self.quantizer = groups[idx // group_size] + + q = self.quantizer.quantize(w.unsqueeze(1)).flatten() + Q1[:, i] = q + Losses1[:, i] = (w - q) ** 2 / d**2 + + err1 = (w - q) / d + W1[:, i:] -= 
err1.unsqueeze(1).matmul(Hinv1[i, i:].unsqueeze(0)) + Err1[:, i] = err1 + + Q[:, i1:i2] = Q1 + Losses[:, i1:i2] = Losses1 / 2 + + W[:, i2:] -= Err1.matmul(Hinv[i1:i2, i2:]) + + if os.environ.get("DEBUG"): + self.layer.weight.data[:, :i2] = Q[:, :i2] + self.layer.weight.data[:, i2:] = W[:, i2:] + logger.debug(torch.sum((self.layer(self.inp1) - self.out1) ** 2)) + logger.debug(torch.sum(Losses)) + + torch.cuda.synchronize() + + duration = time.time() - tick + avg_loss = torch.sum(Losses).item() / self.nsamples + + group_size = group_size if group_size != -1 else self.columns + if static_groups and actorder: + g_idx = [perm[i] // group_size for i in range(self.columns)] + else: + g_idx = [i // group_size for i in range(self.columns)] + g_idx = torch.tensor(g_idx, dtype=torch.int32, device=Q.device) + if actorder: + Q = Q[:, invperm] + g_idx = g_idx[invperm] + + if isinstance(self.layer, transformers.Conv1D): + Q = Q.t() + self.layer.weight.data = Q.reshape(self.layer.weight.shape).type_as( + self.layer.weight.data + ) if os.environ.get("DEBUG"): - self.layer.weight.data[:, :i2] = Q[:, :i2] - self.layer.weight.data[:, i2:] = W[:, i2:] logger.debug(torch.sum((self.layer(self.inp1) - self.out1) ** 2)) - logger.debug(torch.sum(Losses)) - - torch.cuda.synchronize() - - duration = time.time() - tick - avg_loss = torch.sum(Losses).item() / self.nsamples - group_size = group_size if group_size != -1 else self.columns - if static_groups and actorder: - g_idx = [perm[i] // group_size for i in range(self.columns)] - else: - g_idx = [i // group_size for i in range(self.columns)] - g_idx = torch.tensor(g_idx, dtype=torch.int32, device=Q.device) - if actorder: - Q = Q[:, invperm] - g_idx = g_idx[invperm] - - if isinstance(self.layer, transformers.Conv1D): - Q = Q.t() - self.layer.weight.data = Q.reshape(self.layer.weight.shape).type_as( - self.layer.weight.data - ) - - # Q is 2D after the Cholesky-based quantization step - if self._is_3d: - # Reshape Q back to [num_experts, out_features, in_features] - Q = Q.reshape(self._original_shape) - - # Now assign it back to the parameter - self.layer.weight.data = Q.type_as(self.layer.weight.data) - - if os.environ.get("DEBUG"): - logger.debug(torch.sum((self.layer(self.inp1) - self.out1) ** 2)) - - if scale == []: - scale.append(self.quantizer.scale) - zero.append(self.quantizer.zero) - scale = torch.cat(scale, dim=1) - zero = torch.cat(zero, dim=1) - return scale, zero, g_idx, duration, avg_loss + if scale == []: + scale.append(self.quantizer.scale) + zero.append(self.quantizer.zero) + scale = torch.cat(scale, dim=1) + zero = torch.cat(zero, dim=1) + return scale, zero, g_idx, duration, avg_loss def free(self): if os.environ.get("DEBUG"): diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/model.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/model.py index d5bcc15c..39715177 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/model.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/model.py @@ -31,6 +31,7 @@ import torch import torch.nn as nn import transformers +import transformers.models.granitemoe.modeling_granitemoe as MOE # Local from ..models._const import ( @@ -101,17 +102,19 @@ def nested_move_to(v, device): def find_layers(module, layers=None, name=""): + # print("1- INSIDE find_layers module", module) if not layers: - layers = [transformers.pytorch_utils.Conv1D, nn.Conv2d, nn.Linear] + # Can add MOE.GraniteMoeRMSNorm here if want to include Linear 
Norm layer ["input_layernorm", "post_attention_layernorm"] + # MOE.GraniteMoeParallelExperts is torch.nn.Module for layer ["block_sparse_moe.input_linear", "block_sparse_moe.output_linear"] + layers = [transformers.pytorch_utils.Conv1D, nn.Conv2d, nn.Linear, MOE.GraniteMoeParallelExperts] + + # print("2- LAYERS, type(module), name", layers, type(module), name) + # if hasattr(module, "weight"): + # print("3- module.weight, type(module.weight), module.weight.shape, module.weight.ndim", module.weight, type(module.weight), module.weight.shape, module.weight.ndim) for layer in layers: if isinstance(module, layer): return {name: module} - - # ADD FOR module GraniteMoeParallelExperts: https://github.com/huggingface/transformers/blob/b5aaf875090388e2bbdbf2d8641ed7967365f435/src/transformers/models/granitemoe/modeling_granitemoe.py#L258C7-L258C32 - if hasattr(module, "weight") and isinstance(module.weight, torch.nn.Parameter): - if module.weight.ndim == 3: - return {name: module} - + res = {} for name1, child in module.named_children(): res.update( From 3cd53eb7bd1e646f06b924e7b7b2a74b2d8242d4 Mon Sep 17 00:00:00 2001 From: Abhishek Date: Mon, 27 Jan 2025 22:33:41 -0500 Subject: [PATCH 08/13] Changes for __init__ and find_layers Signed-off-by: Abhishek --- .../gptqmodel/quantization/gptq.py | 107 +++++++++--------- 1 file changed, 54 insertions(+), 53 deletions(-) diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/quantization/gptq.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/quantization/gptq.py index 0ac9e203..63b47b5c 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/quantization/gptq.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/quantization/gptq.py @@ -74,35 +74,7 @@ def add_batch(self, inp, out): self.inp1 = inp self.out1 = out # Update entire H_list and nsamples_list - if self.is_moe: - # print("INSIDE ADD_BATCH FOR MOE: inp, type(inp), inp.shape", inp, type(inp), inp.shape) - for expert_idx in range(self.num_experts): - H = self.H_list[expert_idx] - nsamples = self.nsamples_list[expert_idx] - - # if len(inp.shape) == 2: - # inp = inp.unsqueeze(0) - # tmp = inp.shape[0] - # print("INSIDE ADD_BATCH FOR MOE 2: inp, inp.shape, tmp", inp, inp.shape, tmp) - - # Below is doing reverse of above - # if len(inp.shape) == 3: - # inp = inp.reshape((-1, inp.shape[-1])) - # print("INSIDE ADD_BATCH FOR MOE 3: inp, inp.shape, tmp", inp, inp.shape, tmp) - - tmp = 1 - # len(inp.shape) == 2 in this case - mod_inp = inp.t() - # print("INSIDE ADD_BATCH FOR MOE 4: inp, inp.shape, tmp", inp, inp.shape, tmp) - - H *= nsamples / (nsamples + tmp) - nsamples += tmp - mod_inp = math.sqrt(2 / nsamples) * mod_inp.float() - H += mod_inp.matmul(mod_inp.t()) - - self.H_list[expert_idx] = H - self.nsamples_list[expert_idx] = nsamples - else: + if not self.is_moe: # print("INSIDE ADD_BATCH FOR 2D") # print("INSIDE ADD_BATCH 1: inp", inp, type(inp), inp.shape) if len(inp.shape) == 2: @@ -132,6 +104,35 @@ def add_batch(self, inp, out): inp = math.sqrt(2 / self.nsamples) * inp.float() # self.H += 2 / self.nsamples * inp.matmul(inp.t()) self.H += inp.matmul(inp.t()) + else: + # print("INSIDE ADD_BATCH FOR MOE: inp, type(inp), inp.shape", inp, type(inp), inp.shape) + for expert_idx in range(self.num_experts): + H = self.H_list[expert_idx] + nsamples = self.nsamples_list[expert_idx] + + # if len(inp.shape) == 2: + # inp = inp.unsqueeze(0) + # tmp = inp.shape[0] + # print("INSIDE ADD_BATCH FOR MOE 2: inp, inp.shape, tmp", inp, inp.shape, 
tmp) + + # Below is doing reverse of above + # if len(inp.shape) == 3: + # inp = inp.reshape((-1, inp.shape[-1])) + # print("INSIDE ADD_BATCH FOR MOE 3: inp, inp.shape, tmp", inp, inp.shape, tmp) + + tmp = 1 + # len(inp.shape) == 2 in this case + mod_inp = inp.t() + # print("INSIDE ADD_BATCH FOR MOE 4: inp, inp.shape, tmp", inp, inp.shape, tmp) + + H *= nsamples / (nsamples + tmp) + nsamples += tmp + mod_inp = math.sqrt(2 / nsamples) * mod_inp.float() + H += mod_inp.matmul(mod_inp.t()) + + self.H_list[expert_idx] = H + self.nsamples_list[expert_idx] = nsamples + def fasterquant( self, @@ -141,30 +142,7 @@ def fasterquant( actorder=False, static_groups=False, ): - if self.is_moe: - # For MoE model - # Loop over each expert param and quantize it separately - t_start = time.time() - scale_list = [] - zero_list = [] - gidx_list = [] - loss_list = [] - - for i in range(self.num_experts): - W = self.W_list[i] # shape [out_features, in_features] - H = self.H_list[i] - nsamples = self.nsamples_list[i] - - ##### TODO: QUANTIZATION FOR MOE LAYER ##### - - duration = time.time() - t_start - final_scale = torch.cat(scale_list, dim=1) if scale_list else torch.tensor([], device=self.dev) - final_zero = torch.cat(zero_list, dim=1) if zero_list else torch.tensor([], device=self.dev) - final_gidx = torch.cat(gidx_list, dim=0) if gidx_list else torch.tensor([], device=self.dev) - avg_loss = sum(loss_list) / (len(loss_list) + 1e-9) - - return final_scale, final_zero, final_gidx, duration, avg_loss - else: + if not self.is_moe: W = self.layer.weight.data.clone() if isinstance(self.layer, nn.Conv2d): W = W.flatten(1) @@ -297,6 +275,29 @@ def fasterquant( scale = torch.cat(scale, dim=1) zero = torch.cat(zero, dim=1) return scale, zero, g_idx, duration, avg_loss + else: + # For MoE model + # Loop over each expert param and quantize it separately + t_start = time.time() + scale_list = [] + zero_list = [] + gidx_list = [] + loss_list = [] + + for i in range(self.num_experts): + W = self.W_list[i] # shape [out_features, in_features] + H = self.H_list[i] + nsamples = self.nsamples_list[i] + + ##### TODO: QUANTIZATION FOR MOE LAYER ##### + + duration = time.time() - t_start + final_scale = torch.cat(scale_list, dim=1) if scale_list else torch.tensor([], device=self.dev) + final_zero = torch.cat(zero_list, dim=1) if zero_list else torch.tensor([], device=self.dev) + final_gidx = torch.cat(gidx_list, dim=0) if gidx_list else torch.tensor([], device=self.dev) + avg_loss = sum(loss_list) / (len(loss_list) + 1e-9) + + return final_scale, final_zero, final_gidx, duration, avg_loss def free(self): if os.environ.get("DEBUG"): From 12b206a185953185f61c34ef9c5659f85e82b467 Mon Sep 17 00:00:00 2001 From: Abhishek Date: Thu, 30 Jan 2025 00:27:34 -0500 Subject: [PATCH 09/13] Incremental change of ModuleList Signed-off-by: Abhishek --- .../gptqmodel/models/granitemoe.py | 5 +- .../gptqmodel/quantization/gptq.py | 415 +++++++----------- .../gptqmodel/utils/model.py | 47 +- 3 files changed, 206 insertions(+), 261 deletions(-) diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/granitemoe.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/granitemoe.py index 6552376a..22c21773 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/granitemoe.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/granitemoe.py @@ -19,12 +19,13 @@ class GraniteMoeGPTQ(BaseGPTQModel): base_modules = ["model.embed_tokens", "model.norm"] + 
convert3dToModuleList = ["block_sparse_moe.input_linear", "block_sparse_moe.output_linear"] layers_node = "model.layers" layer_type = "GraniteMoeDecoderLayer" layer_modules = [ ["self_attn.k_proj", "self_attn.v_proj", "self_attn.q_proj"], ["self_attn.o_proj"], - ["block_sparse_moe.input_linear", "block_sparse_moe.output_linear"], - ["input_layernorm", "post_attention_layernorm"] + [f"block_sparse_moe.input_linear.{i}" for i in range(40)], + [f"block_sparse_moe.output_linear.{i}" for i in range(40)], ] diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/quantization/gptq.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/quantization/gptq.py index 63b47b5c..c96a6d79 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/quantization/gptq.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/quantization/gptq.py @@ -11,7 +11,6 @@ import torch import torch.nn as nn import transformers -import transformers.models.granitemoe.modeling_granitemoe as MOE # Local from .quantizer import Quantizer @@ -26,113 +25,46 @@ class GPTQ: def __init__(self, layer): self.layer = layer self.dev = self.layer.weight.device + W = layer.weight.data.clone() + if isinstance(self.layer, nn.Conv2d): + W = W.flatten(1) + if isinstance(self.layer, transformers.pytorch_utils.Conv1D): + W = W.t() + self.rows = W.shape[0] + self.columns = W.shape[1] + self.H = torch.zeros((self.columns, self.columns), device=self.dev) + self.nsamples = 0 self.quantizer = Quantizer() - # print("GPTQ INIT: layer, type(layer)", layer, type(layer)) - # print("GPTQ INIT: layer.weight.data", layer.weight.data) - - - if isinstance(layer, MOE.GraniteMoeParallelExperts): - self.is_moe = True - self.num_experts = layer.num_experts - self.out_features = layer.output_size - self.in_features = layer.input_size - # print("GPTQ INIT: self.num_experts, self.out_features, self.in_features", self.num_experts, self.out_features, self.in_features) - - # Separate W for each expert - self.W_list, self.H_list, self.nsamples_list = [], [], [] - for i in range(self.num_experts): - # Each expert slice is of shape [out_features, in_features] - self.W_list.append(layer.weight.data[i].clone()) - - # For each expert param, we have a Hessian and sample count - self.H_list.append(torch.zeros((self.in_features, self.in_features), device=self.dev)) - self.nsamples_list.append(0) - - # print("GPTQ INIT: self.W_list, len(self.W_list), type(self.W_list[0]), self.W_list[0].size(), self.W_list[0].dim()", self.W_list, len(self.W_list), type(self.W_list[0]), self.W_list[0].size(), self.W_list[0].dim()) - # print("GPTQ INIT: self.H_list, len(self.H_list), type(self.H_list[0]), self.H_list[0].size(), self.H_list[0].dim()", self.H_list, len(self.H_list), type(self.H_list[0]), self.H_list[0].size(), self.H_list[0].dim()) - - else: - # For 2D layer (linear, conv, etc.), we have a single Hessian and sample count - self.is_moe = False - W = layer.weight.data.clone() - if isinstance(layer, nn.Conv2d): - W = W.flatten(1) - if isinstance(layer, transformers.pytorch_utils.Conv1D): - W = W.t() - # print("GPTQ INIT: W, type(W), W.size(), W.dim()", W, type(W), W.size(), W.dim()) - - self.rows = W.shape[0] - self.columns = W.shape[1] - self.H = torch.zeros((self.columns, self.columns), device=self.dev) - self.nsamples = 0 - # print("GPTQ INIT: self.H, type(self.H), self.H.size(), self.H.dim()", self.H, type(self.H), self.H.size(), self.H.dim()) - # print("GPTQ INIT: self.rows, self.columns", self.rows, self.columns) - def 
add_batch(self, inp, out): if os.environ.get("DEBUG"): self.inp1 = inp self.out1 = out - # Update entire H_list and nsamples_list - if not self.is_moe: - # print("INSIDE ADD_BATCH FOR 2D") - # print("INSIDE ADD_BATCH 1: inp", inp, type(inp), inp.shape) - if len(inp.shape) == 2: - inp = inp.unsqueeze(0) - tmp = inp.shape[0] - if isinstance(self.layer, nn.Linear) or isinstance( - self.layer, transformers.Conv1D - ): - if len(inp.shape) == 3: - inp = inp.reshape((-1, inp.shape[-1])) - inp = inp.t() - if isinstance(self.layer, nn.Conv2d): - unfold = nn.Unfold( - self.layer.kernel_size, - dilation=self.layer.dilation, - padding=self.layer.padding, - stride=self.layer.stride, - ) - inp = unfold(inp) - inp = inp.permute([1, 0, 2]) - inp = inp.flatten(1) - # print("INSIDE ADD_BATCH 2: BEFORE tmp, self.H, self.nsamples", tmp, self.H, self.nsamples) - self.H *= self.nsamples / (self.nsamples + tmp) - self.nsamples += tmp - # print("INSIDE ADD_BATCH 3: AFTER tmp, self.H, self.nsamples", tmp, self.H, self.nsamples) - # inp = inp.float() - inp = math.sqrt(2 / self.nsamples) * inp.float() - # self.H += 2 / self.nsamples * inp.matmul(inp.t()) - self.H += inp.matmul(inp.t()) - else: - # print("INSIDE ADD_BATCH FOR MOE: inp, type(inp), inp.shape", inp, type(inp), inp.shape) - for expert_idx in range(self.num_experts): - H = self.H_list[expert_idx] - nsamples = self.nsamples_list[expert_idx] - - # if len(inp.shape) == 2: - # inp = inp.unsqueeze(0) - # tmp = inp.shape[0] - # print("INSIDE ADD_BATCH FOR MOE 2: inp, inp.shape, tmp", inp, inp.shape, tmp) - - # Below is doing reverse of above - # if len(inp.shape) == 3: - # inp = inp.reshape((-1, inp.shape[-1])) - # print("INSIDE ADD_BATCH FOR MOE 3: inp, inp.shape, tmp", inp, inp.shape, tmp) - - tmp = 1 - # len(inp.shape) == 2 in this case - mod_inp = inp.t() - # print("INSIDE ADD_BATCH FOR MOE 4: inp, inp.shape, tmp", inp, inp.shape, tmp) - - H *= nsamples / (nsamples + tmp) - nsamples += tmp - mod_inp = math.sqrt(2 / nsamples) * mod_inp.float() - H += mod_inp.matmul(mod_inp.t()) - - self.H_list[expert_idx] = H - self.nsamples_list[expert_idx] = nsamples - + if len(inp.shape) == 2: + inp = inp.unsqueeze(0) + tmp = inp.shape[0] + if isinstance(self.layer, nn.Linear) or isinstance( + self.layer, transformers.Conv1D + ): + if len(inp.shape) == 3: + inp = inp.reshape((-1, inp.shape[-1])) + inp = inp.t() + if isinstance(self.layer, nn.Conv2d): + unfold = nn.Unfold( + self.layer.kernel_size, + dilation=self.layer.dilation, + padding=self.layer.padding, + stride=self.layer.stride, + ) + inp = unfold(inp) + inp = inp.permute([1, 0, 2]) + inp = inp.flatten(1) + self.H *= self.nsamples / (self.nsamples + tmp) + self.nsamples += tmp + # inp = inp.float() + inp = math.sqrt(2 / self.nsamples) * inp.float() + # self.H += 2 / self.nsamples * inp.matmul(inp.t()) + self.H += inp.matmul(inp.t()) def fasterquant( self, @@ -142,162 +74,137 @@ def fasterquant( actorder=False, static_groups=False, ): - if not self.is_moe: - W = self.layer.weight.data.clone() - if isinstance(self.layer, nn.Conv2d): - W = W.flatten(1) - if isinstance(self.layer, transformers.Conv1D): - W = W.t() - W = W.float() - - tick = time.time() - - if not self.quantizer.ready(): - self.quantizer.find_params(W, weight=True) - - H = self.H - del self.H - dead = torch.diag(H) == 0 - H[dead, dead] = 1 - W[:, dead] = 0 - - g_idx = [] - scale = [] - zero = [] - now_idx = 1 - - if static_groups: - # Standard - import copy - - groups = [] - for i in range(0, self.columns, group_size): - quantizer = 
copy.deepcopy(self.quantizer) - quantizer.find_params(W[:, i : (i + group_size)], weight=True) - scale.append(quantizer.scale) - zero.append(quantizer.zero) - groups.append(quantizer) - - if actorder: - perm = torch.argsort(torch.diag(H), descending=True) - W = W[:, perm] - H = H[perm][:, perm] - invperm = torch.argsort(perm) - - Losses = torch.zeros_like(W) - Q = torch.zeros_like(W) - - damp = percdamp * torch.mean(torch.diag(H)) - diag = torch.arange(self.columns, device=self.dev) - H[diag, diag] += damp - H = torch.linalg.cholesky(H) - H = torch.cholesky_inverse(H) - H = torch.linalg.cholesky(H, upper=True) - Hinv = H - - for i1 in range(0, self.columns, blocksize): - i2 = min(i1 + blocksize, self.columns) - count = i2 - i1 - - W1 = W[:, i1:i2].clone() - Q1 = torch.zeros_like(W1) - Err1 = torch.zeros_like(W1) - Losses1 = torch.zeros_like(W1) - Hinv1 = Hinv[i1:i2, i1:i2] - - for i in range(count): - w = W1[:, i] - d = Hinv1[i, i] - - if group_size != -1: - if not static_groups: - if (i1 + i) % group_size == 0: - self.quantizer.find_params( - W[:, (i1 + i) : (i1 + i + group_size)], weight=True - ) - - if ((i1 + i) // group_size) - now_idx == -1: - scale.append(self.quantizer.scale) - zero.append(self.quantizer.zero) - now_idx += 1 - else: - idx = i1 + i - if actorder: - idx = perm[idx] - self.quantizer = groups[idx // group_size] - - q = self.quantizer.quantize(w.unsqueeze(1)).flatten() - Q1[:, i] = q - Losses1[:, i] = (w - q) ** 2 / d**2 - - err1 = (w - q) / d - W1[:, i:] -= err1.unsqueeze(1).matmul(Hinv1[i, i:].unsqueeze(0)) - Err1[:, i] = err1 - - Q[:, i1:i2] = Q1 - Losses[:, i1:i2] = Losses1 / 2 - - W[:, i2:] -= Err1.matmul(Hinv[i1:i2, i2:]) - - if os.environ.get("DEBUG"): - self.layer.weight.data[:, :i2] = Q[:, :i2] - self.layer.weight.data[:, i2:] = W[:, i2:] - logger.debug(torch.sum((self.layer(self.inp1) - self.out1) ** 2)) - logger.debug(torch.sum(Losses)) - - torch.cuda.synchronize() - - duration = time.time() - tick - avg_loss = torch.sum(Losses).item() / self.nsamples - - group_size = group_size if group_size != -1 else self.columns - if static_groups and actorder: - g_idx = [perm[i] // group_size for i in range(self.columns)] - else: - g_idx = [i // group_size for i in range(self.columns)] - g_idx = torch.tensor(g_idx, dtype=torch.int32, device=Q.device) - if actorder: - Q = Q[:, invperm] - g_idx = g_idx[invperm] - - if isinstance(self.layer, transformers.Conv1D): - Q = Q.t() - self.layer.weight.data = Q.reshape(self.layer.weight.shape).type_as( - self.layer.weight.data - ) + W = self.layer.weight.data.clone() + if isinstance(self.layer, nn.Conv2d): + W = W.flatten(1) + if isinstance(self.layer, transformers.Conv1D): + W = W.t() + W = W.float() + + tick = time.time() + + if not self.quantizer.ready(): + self.quantizer.find_params(W, weight=True) + + H = self.H + del self.H + dead = torch.diag(H) == 0 + H[dead, dead] = 1 + W[:, dead] = 0 + + g_idx = [] + scale = [] + zero = [] + now_idx = 1 + + if static_groups: + # Standard + import copy + + groups = [] + for i in range(0, self.columns, group_size): + quantizer = copy.deepcopy(self.quantizer) + quantizer.find_params(W[:, i : (i + group_size)], weight=True) + scale.append(quantizer.scale) + zero.append(quantizer.zero) + groups.append(quantizer) + + if actorder: + perm = torch.argsort(torch.diag(H), descending=True) + W = W[:, perm] + H = H[perm][:, perm] + invperm = torch.argsort(perm) + + Losses = torch.zeros_like(W) + Q = torch.zeros_like(W) + + damp = percdamp * torch.mean(torch.diag(H)) + diag = torch.arange(self.columns, 
device=self.dev) + H[diag, diag] += damp + H = torch.linalg.cholesky(H) + H = torch.cholesky_inverse(H) + H = torch.linalg.cholesky(H, upper=True) + Hinv = H + + for i1 in range(0, self.columns, blocksize): + i2 = min(i1 + blocksize, self.columns) + count = i2 - i1 + + W1 = W[:, i1:i2].clone() + Q1 = torch.zeros_like(W1) + Err1 = torch.zeros_like(W1) + Losses1 = torch.zeros_like(W1) + Hinv1 = Hinv[i1:i2, i1:i2] + + for i in range(count): + w = W1[:, i] + d = Hinv1[i, i] + + if group_size != -1: + if not static_groups: + if (i1 + i) % group_size == 0: + self.quantizer.find_params( + W[:, (i1 + i) : (i1 + i + group_size)], weight=True + ) + + if ((i1 + i) // group_size) - now_idx == -1: + scale.append(self.quantizer.scale) + zero.append(self.quantizer.zero) + now_idx += 1 + else: + idx = i1 + i + if actorder: + idx = perm[idx] + self.quantizer = groups[idx // group_size] + + q = self.quantizer.quantize(w.unsqueeze(1)).flatten() + Q1[:, i] = q + Losses1[:, i] = (w - q) ** 2 / d**2 + + err1 = (w - q) / d + W1[:, i:] -= err1.unsqueeze(1).matmul(Hinv1[i, i:].unsqueeze(0)) + Err1[:, i] = err1 + + Q[:, i1:i2] = Q1 + Losses[:, i1:i2] = Losses1 / 2 + + W[:, i2:] -= Err1.matmul(Hinv[i1:i2, i2:]) if os.environ.get("DEBUG"): + self.layer.weight.data[:, :i2] = Q[:, :i2] + self.layer.weight.data[:, i2:] = W[:, i2:] logger.debug(torch.sum((self.layer(self.inp1) - self.out1) ** 2)) + logger.debug(torch.sum(Losses)) + + torch.cuda.synchronize() + + duration = time.time() - tick + avg_loss = torch.sum(Losses).item() / self.nsamples - if scale == []: - scale.append(self.quantizer.scale) - zero.append(self.quantizer.zero) - scale = torch.cat(scale, dim=1) - zero = torch.cat(zero, dim=1) - return scale, zero, g_idx, duration, avg_loss + group_size = group_size if group_size != -1 else self.columns + if static_groups and actorder: + g_idx = [perm[i] // group_size for i in range(self.columns)] else: - # For MoE model - # Loop over each expert param and quantize it separately - t_start = time.time() - scale_list = [] - zero_list = [] - gidx_list = [] - loss_list = [] - - for i in range(self.num_experts): - W = self.W_list[i] # shape [out_features, in_features] - H = self.H_list[i] - nsamples = self.nsamples_list[i] - - ##### TODO: QUANTIZATION FOR MOE LAYER ##### - - duration = time.time() - t_start - final_scale = torch.cat(scale_list, dim=1) if scale_list else torch.tensor([], device=self.dev) - final_zero = torch.cat(zero_list, dim=1) if zero_list else torch.tensor([], device=self.dev) - final_gidx = torch.cat(gidx_list, dim=0) if gidx_list else torch.tensor([], device=self.dev) - avg_loss = sum(loss_list) / (len(loss_list) + 1e-9) - - return final_scale, final_zero, final_gidx, duration, avg_loss + g_idx = [i // group_size for i in range(self.columns)] + g_idx = torch.tensor(g_idx, dtype=torch.int32, device=Q.device) + if actorder: + Q = Q[:, invperm] + g_idx = g_idx[invperm] + + if isinstance(self.layer, transformers.Conv1D): + Q = Q.t() + self.layer.weight.data = Q.reshape(self.layer.weight.shape).type_as( + self.layer.weight.data + ) + if os.environ.get("DEBUG"): + logger.debug(torch.sum((self.layer(self.inp1) - self.out1) ** 2)) + + if scale == []: + scale.append(self.quantizer.scale) + zero.append(self.quantizer.zero) + scale = torch.cat(scale, dim=1) + zero = torch.cat(zero, dim=1) + return scale, zero, g_idx, duration, avg_loss def free(self): if os.environ.get("DEBUG"): @@ -309,4 +216,4 @@ def free(self): torch.cuda.empty_cache() -__all__ = ["GPTQ"] +__all__ = ["GPTQ"] \ No newline at end of file diff 
--git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/model.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/model.py index 39715177..d7c67bab 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/model.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/model.py @@ -30,6 +30,7 @@ import threadpoolctl as tctl import torch import torch.nn as nn +import torch.nn.functional as F import transformers import transformers.models.granitemoe.modeling_granitemoe as MOE @@ -53,6 +54,24 @@ logger.addHandler(handler) logger.setLevel(logging.INFO) +class ThreeDTensorModuleList(nn.ModuleList): + def forward(self, inputs: torch.Tensor) -> torch.Tensor: + # Shape of input: (num_selected_experts * batch_size (expert_size), input_features_size) + expert_size = len(self) + input_list = inputs.split(expert_size, dim=0) + output_list = [] + + # Iterate over the number of selected experts and apply each expert to the corresponding input + for i in range(len(self)): + # Shape of input_list[i]: (batch_size, input_features_size); Shape of self[i]: (output_features_size, input_features_size) + # Shape of output: (batch_size, output_features_size); + expert_output = F.linear(input_list[i], self[i]) + output_list.append(expert_output) + + # Concatenate the outputs along the first dimension + results = torch.cat(output_list, dim=0) # Shape: (num_selected_experts * batch_size, output_features_size) + return results + def recurse_getattr(obj, attr: str): """ @@ -101,22 +120,40 @@ def nested_move_to(v, device): return v +def check3DTensor(module, name, convert3dToModuleList=["block_sparse_moe.input_linear", "block_sparse_moe.output_linear"]): + if convert3dToModuleList and name in convert3dToModuleList: + # print("INSIDE check3DTensor module, name, convert3dToModuleList", module, name, convert3dToModuleList) + num_experts = module.num_experts + input_size = module.input_size + output_size = module.output_size + module = ThreeDTensorModuleList([ + nn.Linear(input_size, output_size, bias=False) for _ in range(num_experts) + ]) + + return module + + def find_layers(module, layers=None, name=""): # print("1- INSIDE find_layers module", module) + module = check3DTensor(module, name) + # print("2- AFTER check3DTensor module", module) if not layers: - # Can add MOE.GraniteMoeRMSNorm here if want to include Linear Norm layer ["input_layernorm", "post_attention_layernorm"] - # MOE.GraniteMoeParallelExperts is torch.nn.Module for layer ["block_sparse_moe.input_linear", "block_sparse_moe.output_linear"] - layers = [transformers.pytorch_utils.Conv1D, nn.Conv2d, nn.Linear, MOE.GraniteMoeParallelExperts] + layers = [transformers.pytorch_utils.Conv1D, nn.Conv2d, nn.Linear] - # print("2- LAYERS, type(module), name", layers, type(module), name) + # print("2- INFO: type(module), name", type(module), name) # if hasattr(module, "weight"): - # print("3- module.weight, type(module.weight), module.weight.shape, module.weight.ndim", module.weight, type(module.weight), module.weight.shape, module.weight.ndim) + # print("3- type(module.weight), module.weight.shape, module.weight.ndim", type(module.weight), module.weight.shape, module.weight.ndim) for layer in layers: if isinstance(module, layer): return {name: module} res = {} + # if isinstance(module, MOE.GraniteMoeParallelExperts): + # print("Print GraniteMoeParallelExperts Layer children") + # for name1, child in module.named_children(): + # print("4- name1, child", name1, child) for name1, child in 
module.named_children(): + # print("PROCESS- name, name1, child", name, name1, child) res.update( find_layers( child, layers=layers, name=name + "." + name1 if name != "" else name1 From b0f92720b0ffcebdfb5f7282050d5fa00685bc45 Mon Sep 17 00:00:00 2001 From: Abhishek Date: Thu, 30 Jan 2025 00:31:26 -0500 Subject: [PATCH 10/13] Incremental change of ModuleList Signed-off-by: Abhishek --- .../src/fms_acceleration_peft/gptqmodel/quantization/gptq.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/quantization/gptq.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/quantization/gptq.py index c96a6d79..470ed3fb 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/quantization/gptq.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/quantization/gptq.py @@ -216,4 +216,4 @@ def free(self): torch.cuda.empty_cache() -__all__ = ["GPTQ"] \ No newline at end of file +__all__ = ["GPTQ"] From 58d27227d67d8d99483703ed1153223e6c354371 Mon Sep 17 00:00:00 2001 From: Abhishek Date: Thu, 30 Jan 2025 20:36:03 -0500 Subject: [PATCH 11/13] Suggested changes Signed-off-by: Abhishek --- .../gptqmodel/models/_const.py | 2 +- .../gptqmodel/models/auto.py | 2 +- .../gptqmodel/models/base.py | 64 +++++++++++++++++ .../gptqmodel/models/granitemoe.py | 5 +- .../gptqmodel/utils/model.py | 69 ++++++------------- 5 files changed, 91 insertions(+), 51 deletions(-) diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/_const.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/_const.py index 14e10054..087dd034 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/_const.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/_const.py @@ -28,7 +28,7 @@ "granite", "gemma", "dbrx_converted", - "granitemoe" + "granitemoe", ] EXLLAMA_DEFAULT_MAX_INPUT_LENGTH = 2048 diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/auto.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/auto.py index 0b1be8fc..d0caf2a4 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/auto.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/auto.py @@ -44,7 +44,7 @@ "granite": GraniteGPTQ, "dbrx": DbrxGPTQ, "dbrx_converted": DbrxConvertedGPTQ, - "granitemoe": GraniteMoeGPTQ + "granitemoe": GraniteMoeGPTQ, } at_least_one_cuda_v6 = any( diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/base.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/base.py index b3921ff2..1dc07d86 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/base.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/base.py @@ -41,6 +41,7 @@ import accelerate import torch import torch.nn as nn +import torch.nn.functional as F import transformers # Local @@ -61,6 +62,7 @@ convert_gptq_v1_to_v2_format, convert_gptq_v2_to_v1_format, find_layers, + get_all_modules_by_name_suffix, get_checkpoints, get_device, get_module_by_name_prefix, @@ -91,6 +93,9 @@ class BaseGPTQModel(nn.Module): # does not include the node which holds all the repeating layers base_modules: List[str] = None + # 3D Module to be converted to ModuleList + convert_3d_modulelist: List[str] = None + # name of lm_head lm_head: str = "lm_head" @@ -223,6 +228,22 @@ def quantize( if len(calibration_dataset) 
== 0: raise ValueError("Calibration dataset must not be empty.") + ##### SWAP 3D MODULES TO MODULELIST ##### + if self.convert_3d_modulelist: + for name in self.convert_3d_modulelist: + matches = get_all_modules_by_name_suffix(self.model, name) + for parent, module, full_name in matches: + + # Modify the matched module + if parent is not None: + new_module = self.swap_3d_tensors(module) + + # Replace the old module with the new one + # Derive the child attribute name from the tail of full_name + child_name = full_name.split(".")[-1] + + setattr(parent, child_name, new_module) + min_calibration_dataset_size = 256 min_calibration_dataset_input_ids_avg_length = 256 @@ -1211,5 +1232,48 @@ def __getattr__(self, item): except Exception: return getattr(self.model, item) + def swap_3d_tensors(self, module: nn.Module) -> nn.ModuleList: + """Swap 3D Parameters to ModuleList of 3D Parameters.""" + + num_experts = module.num_experts + input_size = module.input_size + output_size = module.output_size + module = MoE3DModuleList( + [nn.Linear(input_size, output_size, bias=False) for _ in range(num_experts)] + ) + return module + + +class MoE3DModuleList(nn.ModuleList): + def forward(self, inputs: torch.Tensor, expert_size: int) -> torch.Tensor: + """ + Forward pass of the MoE3DModuleList module. + Args: + inputs (Tensor): + Input tensor. + expert_size: + Expert size information. + Returns: + Tensor: Output tensor. + """ + input_list = inputs.split(expert_size, dim=0) + output_list = [] + + # Iterate over the number of selected experts and apply each expert to the corresponding input + for i in range(len(expert_size)): + # Extract weight and bias from the Linear module + weight = self[i].weight.to(device=inputs.device, dtype=inputs.dtype) + bias = ( + self[i].bias.to(device=inputs.device, dtype=inputs.dtype) + if self[i].bias is not None + else None + ) + expert_output = F.linear(input_list[i], weight, bias) + output_list.append(expert_output) + + # Concatenate the outputs along the first dimension + results = torch.cat(output_list, dim=0) + return results + __all__ = ["BaseGPTQModel"] diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/granitemoe.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/granitemoe.py index 22c21773..1f360d27 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/granitemoe.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/granitemoe.py @@ -19,7 +19,10 @@ class GraniteMoeGPTQ(BaseGPTQModel): base_modules = ["model.embed_tokens", "model.norm"] - convert3dToModuleList = ["block_sparse_moe.input_linear", "block_sparse_moe.output_linear"] + convert_3d_modulelist = [ + "block_sparse_moe.input_linear", + "block_sparse_moe.output_linear", + ] layers_node = "model.layers" layer_type = "GraniteMoeDecoderLayer" diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/model.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/model.py index d7c67bab..53f01843 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/model.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/model.py @@ -15,7 +15,7 @@ ############################################################################### # Standard from logging import getLogger -from typing import List, Optional, Union +from typing import List, Optional, Tuple, Union import functools import hashlib import json @@ -30,9 +30,7 @@ import threadpoolctl as tctl 
import torch import torch.nn as nn -import torch.nn.functional as F import transformers -import transformers.models.granitemoe.modeling_granitemoe as MOE # Local from ..models._const import ( @@ -54,24 +52,6 @@ logger.addHandler(handler) logger.setLevel(logging.INFO) -class ThreeDTensorModuleList(nn.ModuleList): - def forward(self, inputs: torch.Tensor) -> torch.Tensor: - # Shape of input: (num_selected_experts * batch_size (expert_size), input_features_size) - expert_size = len(self) - input_list = inputs.split(expert_size, dim=0) - output_list = [] - - # Iterate over the number of selected experts and apply each expert to the corresponding input - for i in range(len(self)): - # Shape of input_list[i]: (batch_size, input_features_size); Shape of self[i]: (output_features_size, input_features_size) - # Shape of output: (batch_size, output_features_size); - expert_output = F.linear(input_list[i], self[i]) - output_list.append(expert_output) - - # Concatenate the outputs along the first dimension - results = torch.cat(output_list, dim=0) # Shape: (num_selected_experts * batch_size, output_features_size) - return results - def recurse_getattr(obj, attr: str): """ @@ -120,40 +100,14 @@ def nested_move_to(v, device): return v -def check3DTensor(module, name, convert3dToModuleList=["block_sparse_moe.input_linear", "block_sparse_moe.output_linear"]): - if convert3dToModuleList and name in convert3dToModuleList: - # print("INSIDE check3DTensor module, name, convert3dToModuleList", module, name, convert3dToModuleList) - num_experts = module.num_experts - input_size = module.input_size - output_size = module.output_size - module = ThreeDTensorModuleList([ - nn.Linear(input_size, output_size, bias=False) for _ in range(num_experts) - ]) - - return module - - def find_layers(module, layers=None, name=""): - # print("1- INSIDE find_layers module", module) - module = check3DTensor(module, name) - # print("2- AFTER check3DTensor module", module) if not layers: - layers = [transformers.pytorch_utils.Conv1D, nn.Conv2d, nn.Linear] - - # print("2- INFO: type(module), name", type(module), name) - # if hasattr(module, "weight"): - # print("3- type(module.weight), module.weight.shape, module.weight.ndim", type(module.weight), module.weight.shape, module.weight.ndim) + layers = [transformers.pytorch_utils.Conv1D, nn.Conv2d, nn.Linear] for layer in layers: if isinstance(module, layer): return {name: module} - res = {} - # if isinstance(module, MOE.GraniteMoeParallelExperts): - # print("Print GraniteMoeParallelExperts Layer children") - # for name1, child in module.named_children(): - # print("4- name1, child", name1, child) for name1, child in module.named_children(): - # print("PROCESS- name, name1, child", name, name1, child) res.update( find_layers( child, layers=layers, name=name + "." 
+ name1 if name != "" else name1 @@ -174,6 +128,25 @@ def get_module_by_name_suffix(model, module_name: str): return module +def get_all_modules_by_name_suffix( + model: nn.Module, target_suffix: str +) -> List[Tuple[Optional[nn.Module], nn.Module, str]]: + """Find all modules in the model whose names end with the given suffix, along with their parent modules.""" + name_to_module = dict(model.named_modules()) + results = [] + for full_name, mod in name_to_module.items(): + if full_name.endswith(target_suffix): + split_name = full_name.split(".") + if len(split_name) > 1: + parent_name = ".".join(split_name[:-1]) + else: + parent_name = "" + + parent_module = name_to_module.get(parent_name, None) + results.append((parent_module, mod, full_name)) + return results + + def make_quant( module, names, From 74d18b7fd824eef312a81a771223851c6639c702 Mon Sep 17 00:00:00 2001 From: Abhishek Date: Tue, 18 Feb 2025 19:23:43 -0500 Subject: [PATCH 12/13] Test Fabian's code Signed-off-by: Abhishek --- .../gptqmodel/models/base.py | 79 ++++--------------- .../gptqmodel/models/granitemoe.py | 26 +++++- .../gptqmodel/utils/model.py | 51 +++++++----- 3 files changed, 69 insertions(+), 87 deletions(-) diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/base.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/base.py index 29a74b47..1863c4d0 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/base.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/base.py @@ -15,7 +15,8 @@ ############################################################################### # Standard from os.path import isfile, join -from typing import Dict, List, Optional, Union +from types import MethodType +from typing import Callable, Dict, List, Optional, Tuple, Union import copy import json import logging @@ -44,7 +45,6 @@ import accelerate import torch import torch.nn as nn -import torch.nn.functional as F import transformers # Local @@ -65,7 +65,6 @@ convert_gptq_v1_to_v2_format, convert_gptq_v2_to_v1_format, find_layers, - get_all_modules_by_name_suffix, get_checkpoints, get_device, get_module_by_name_prefix, @@ -79,6 +78,7 @@ simple_dispatch_model, verify_model_hash, verify_sharded_model_hashes, + replace_3d_parameters_with_module_list, ) from ._const import CPU, CUDA_0, SUPPORTED_MODELS @@ -96,8 +96,11 @@ class BaseGPTQModel(nn.Module): # does not include the node which holds all the repeating layers base_modules: List[str] = None - # 3D Module to be converted to ModuleList - convert_3d_modulelist: List[str] = None + # If 3D Parameters to be converted + convert3dparameters: bool = False + + # User provided forward pass to replace the existing forward pass + update_forwards: List[Tuple[str, Callable]] = None # name of lm_head lm_head: str = "lm_head" @@ -133,6 +136,13 @@ def __init__( super().__init__() self.model = model + if self.convert3dparameters: + model = replace_3d_parameters_with_module_list(model) + for mod in model.modules(): + forward = self.update_forwards.get(mod.__class__.__name__) + if forward is not None: + mod.forward = MethodType(forward, mod) + self.model_type = self.model.config.model_type self._quantized = quantized self.quantize_config = quantize_config @@ -231,22 +241,6 @@ def quantize( if len(calibration_dataset) == 0: raise ValueError("Calibration dataset must not be empty.") - ##### SWAP 3D MODULES TO MODULELIST ##### - if self.convert_3d_modulelist: - for name in self.convert_3d_modulelist: - matches = 
get_all_modules_by_name_suffix(self.model, name) - for parent, module, full_name in matches: - - # Modify the matched module - if parent is not None: - new_module = self.swap_3d_tensors(module) - - # Replace the old module with the new one - # Derive the child attribute name from the tail of full_name - child_name = full_name.split(".")[-1] - - setattr(parent, child_name, new_module) - min_calibration_dataset_size = 256 min_calibration_dataset_input_ids_avg_length = 256 @@ -1335,48 +1329,5 @@ def __getattr__(self, item): except Exception: return getattr(self.model, item) - def swap_3d_tensors(self, module: nn.Module) -> nn.ModuleList: - """Swap 3D Parameters to ModuleList of 3D Parameters.""" - - num_experts = module.num_experts - input_size = module.input_size - output_size = module.output_size - module = MoE3DModuleList( - [nn.Linear(input_size, output_size, bias=False) for _ in range(num_experts)] - ) - return module - - -class MoE3DModuleList(nn.ModuleList): - def forward(self, inputs: torch.Tensor, expert_size: int) -> torch.Tensor: - """ - Forward pass of the MoE3DModuleList module. - Args: - inputs (Tensor): - Input tensor. - expert_size: - Expert size information. - Returns: - Tensor: Output tensor. - """ - input_list = inputs.split(expert_size, dim=0) - output_list = [] - - # Iterate over the number of selected experts and apply each expert to the corresponding input - for i in range(len(expert_size)): - # Extract weight and bias from the Linear module - weight = self[i].weight.to(device=inputs.device, dtype=inputs.dtype) - bias = ( - self[i].bias.to(device=inputs.device, dtype=inputs.dtype) - if self[i].bias is not None - else None - ) - expert_output = F.linear(input_list[i], weight, bias) - output_list.append(expert_output) - - # Concatenate the outputs along the first dimension - results = torch.cat(output_list, dim=0) - return results - __all__ = ["BaseGPTQModel"] diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/granitemoe.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/granitemoe.py index 1f360d27..451a1a8e 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/granitemoe.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/granitemoe.py @@ -15,13 +15,33 @@ ############################################################################### # Local from .base import BaseGPTQModel +import torch + +def new_forward(self, inputs, expert_size): + """ + Forward pass of the GraniteMoeParallelExperts module. + Args: + inputs (Tensor): + Input tensor. + expert_size: + Expert size information. + Returns: + Tensor: Output tensor. 
+ """ + input_list = inputs.split(expert_size, dim=0) + output_list = [] + for i in range(self.num_experts): + # the key is we need to use call the module + output_list.append(self.weight[i](input_list[i])) + results = torch.cat(output_list, dim=0) + return results class GraniteMoeGPTQ(BaseGPTQModel): base_modules = ["model.embed_tokens", "model.norm"] - convert_3d_modulelist = [ - "block_sparse_moe.input_linear", - "block_sparse_moe.output_linear", + convert3dparameters = True + update_forwards = [ + ("GraniteMoeParallelExperts", new_forward) ] layers_node = "model.layers" diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/model.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/model.py index 53f01843..2c75eef5 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/model.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/model.py @@ -15,7 +15,7 @@ ############################################################################### # Standard from logging import getLogger -from typing import List, Optional, Tuple, Union +from typing import List, Optional, Union import functools import hashlib import json @@ -128,25 +128,6 @@ def get_module_by_name_suffix(model, module_name: str): return module -def get_all_modules_by_name_suffix( - model: nn.Module, target_suffix: str -) -> List[Tuple[Optional[nn.Module], nn.Module, str]]: - """Find all modules in the model whose names end with the given suffix, along with their parent modules.""" - name_to_module = dict(model.named_modules()) - results = [] - for full_name, mod in name_to_module.items(): - if full_name.endswith(target_suffix): - split_name = full_name.split(".") - if len(split_name) > 1: - parent_name = ".".join(split_name[:-1]) - else: - parent_name = "" - - parent_module = name_to_module.get(parent_name, None) - results.append((parent_module, mod, full_name)) - return results - - def make_quant( module, names, @@ -734,3 +715,33 @@ def get_moe_layer_modules(layer_modules: List, num_experts: int) -> List: new_inside_layer_modules[-1].append(n) return new_inside_layer_modules + + +def replace_3d_parameters_with_module_list( + model: torch.nn.Module, +): + + for name, module in model.named_modules(): + for param_name, param in module.named_parameters(recurse=False): + if len(param.shape) == 3: + device = param.device + dtype = param.dtype + num, in_features, out_features = param.shape + + module_list = [] + for i in range(num): + linear = torch.nn.Linear( + in_features=in_features, + out_features=out_features, + device=device, + dtype=dtype, + bias=None, # FIXME: how to support bias? 
+ ) + linear.weight.data = param.data[i] + module_list.append(linear) + + module_list = torch.nn.ModuleList(module_list) + + # replace + delattr(module, param_name) + setattr(module, param_name, module_list) \ No newline at end of file From f98e85dadfc4c6d6f61c4f18aedc9ea1c165ac5d Mon Sep 17 00:00:00 2001 From: Abhishek Date: Tue, 18 Feb 2025 20:57:40 -0500 Subject: [PATCH 13/13] Test Fabian's code Signed-off-by: Abhishek --- .../fms_acceleration_peft/gptqmodel/models/base.py | 6 +++--- .../gptqmodel/models/granitemoe.py | 13 +++++++------ .../fms_acceleration_peft/gptqmodel/utils/model.py | 4 ++-- 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/base.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/base.py index 1863c4d0..3ee3f4fb 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/base.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/base.py @@ -75,10 +75,10 @@ move_to, nested_move_to, pack_model, + replace_3d_parameters_with_module_list, simple_dispatch_model, verify_model_hash, verify_sharded_model_hashes, - replace_3d_parameters_with_module_list, ) from ._const import CPU, CUDA_0, SUPPORTED_MODELS @@ -99,7 +99,7 @@ class BaseGPTQModel(nn.Module): # If 3D Parameters to be converted convert3dparameters: bool = False - # User provided forward pass to replace the existing forward pass + # User provided forward pass to replace the existing forward pass update_forwards: List[Tuple[str, Callable]] = None # name of lm_head @@ -137,7 +137,7 @@ def __init__( self.model = model if self.convert3dparameters: - model = replace_3d_parameters_with_module_list(model) + replace_3d_parameters_with_module_list(model) for mod in model.modules(): forward = self.update_forwards.get(mod.__class__.__name__) if forward is not None: diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/granitemoe.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/granitemoe.py index 451a1a8e..87eb3103 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/granitemoe.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/granitemoe.py @@ -13,9 +13,12 @@ # See the License for the specific language governing permissions and # limitations under the License. 
############################################################################### +# Third Party +import torch + # Local from .base import BaseGPTQModel -import torch + def new_forward(self, inputs, expert_size): """ @@ -40,15 +43,13 @@ def new_forward(self, inputs, expert_size): class GraniteMoeGPTQ(BaseGPTQModel): base_modules = ["model.embed_tokens", "model.norm"] convert3dparameters = True - update_forwards = [ - ("GraniteMoeParallelExperts", new_forward) - ] + update_forwards = {"GraniteMoeParallelExperts": new_forward} layers_node = "model.layers" layer_type = "GraniteMoeDecoderLayer" layer_modules = [ ["self_attn.k_proj", "self_attn.v_proj", "self_attn.q_proj"], ["self_attn.o_proj"], - [f"block_sparse_moe.input_linear.{i}" for i in range(40)], - [f"block_sparse_moe.output_linear.{i}" for i in range(40)], + [f"block_sparse_moe.input_linear.weight.{i}" for i in range(40)], + [f"block_sparse_moe.output_linear.weight.{i}" for i in range(40)], ] diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/model.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/model.py index 2c75eef5..e6b0c289 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/model.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/model.py @@ -735,7 +735,7 @@ def replace_3d_parameters_with_module_list( out_features=out_features, device=device, dtype=dtype, - bias=None, # FIXME: how to support bias? + bias=None, # FIXME: how to support bias? ) linear.weight.data = param.data[i] module_list.append(linear) @@ -744,4 +744,4 @@ def replace_3d_parameters_with_module_list( # replace delattr(module, param_name) - setattr(module, param_name, module_list) \ No newline at end of file + setattr(module, param_name, module_list)
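
For reference, the dense (non-MoE) add_batch path restored in gptq.py keeps H as a
running estimate of (2 / nsamples) * sum_i x_i x_i^T: it first rescales the old
accumulator by nsamples / (nsamples + tmp), then adds the sqrt(2 / nsamples)-scaled
inputs. The snippet below is a standalone numeric check of that identity; the sizes
and batch split are assumed for illustration and are not taken from the patches.

import torch

torch.manual_seed(0)
cols, total = 8, 32
X = torch.randn(total, cols)          # calibration rows for a Linear with `cols` inputs

H = torch.zeros(cols, cols)
nsamples = 0
for batch in X.split(4, dim=0):       # stream batches the way add_batch receives them
    inp = batch.t()                   # (cols, batch_size), matching the Linear branch
    tmp = batch.shape[0]
    H *= nsamples / (nsamples + tmp)  # downweight the previous accumulator
    nsamples += tmp
    inp = (2 / nsamples) ** 0.5 * inp.float()
    H += inp.matmul(inp.t())          # add the rescaled outer product

# The streamed accumulator matches the closed form over the whole calibration set.
assert torch.allclose(H, 2 / total * X.t().matmul(X), atol=1e-5)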
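
For reference, a minimal standalone sketch of the approach the final two patches
converge on: replace each 3D expert weight Parameter with an nn.ModuleList of
per-expert nn.Linear modules, then bind a replacement forward that calls those
modules. Everything below is an assumed toy reconstruction; ToyParallelExperts,
replace_3d_parameters, patched_forward and the sizes are illustrative stand-ins for
GraniteMoeParallelExperts and the helpers added in the patches, not the shipped code.

from types import MethodType

import torch
import torch.nn as nn


class ToyParallelExperts(nn.Module):
    """Stand-in for an expert bank that stores all experts in one 3D Parameter."""

    def __init__(self, num_experts, input_size, output_size):
        super().__init__()
        self.num_experts = num_experts
        # (num_experts, output_size, input_size): one weight matrix per expert.
        self.weight = nn.Parameter(torch.randn(num_experts, output_size, input_size))

    def forward(self, inputs, expert_size):
        # Original-style forward: index the 3D Parameter directly.
        chunks = inputs.split(expert_size, dim=0)
        return torch.cat(
            [nn.functional.linear(chunks[i], self.weight[i]) for i in range(self.num_experts)],
            dim=0,
        )


def replace_3d_parameters(model):
    """Swap every 3D Parameter for an nn.ModuleList of per-expert nn.Linear modules."""
    for module in list(model.modules()):
        for param_name, param in list(module.named_parameters(recurse=False)):
            if param.dim() != 3:
                continue
            num_experts, out_features, in_features = param.shape
            experts = []
            for i in range(num_experts):
                linear = nn.Linear(in_features, out_features, bias=False,
                                   device=param.device, dtype=param.dtype)
                linear.weight.data = param.data[i]
                experts.append(linear)
            delattr(module, param_name)                           # drop the raw Parameter
            setattr(module, param_name, nn.ModuleList(experts))   # register per-expert submodules


def patched_forward(self, inputs, expert_size):
    # After the swap, self.weight is a ModuleList, so each expert is called as a module.
    chunks = inputs.split(expert_size, dim=0)
    return torch.cat([self.weight[i](chunks[i]) for i in range(self.num_experts)], dim=0)


experts = ToyParallelExperts(num_experts=4, input_size=16, output_size=8)
x = torch.randn(12, 16)
sizes = [3, 3, 3, 3]
ref = experts(x, sizes)

replace_3d_parameters(experts)
experts.forward = MethodType(patched_forward, experts)

assert torch.allclose(ref, experts(x, sizes), atol=1e-6)
# Prints ['weight.0', 'weight.1', 'weight.2', 'weight.3']: each expert is now an
# ordinary nn.Linear that find_layers-style discovery can enumerate and quantize.
print([n for n, m in experts.named_modules() if isinstance(m, nn.Linear)])

Exposing each expert as an nn.Linear is what lets the unmodified GPTQ path, which
filters on nn.Linear, hook and quantize experts one at a time; it is also why the
final granitemoe.py patch enumerates block_sparse_moe.input_linear.weight.{i} and
block_sparse_moe.output_linear.weight.{i} entries in layer_modules.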