From 31c4e2f3d067f06487f87b6d9a4aa2788d722acb Mon Sep 17 00:00:00 2001
From: Iqbal Saraf <iqbal.saraf@ibm.com>
Date: Mon, 16 Jun 2025 17:29:19 +0000
Subject: [PATCH 1/3] Add QMaxDynamic to unify Qmax, Qminmax, pertokenmax

Signed-off-by: Iqbal Saraf <iqbal.saraf@ibm.com>
---
 fms_mo/quant/quantizers.py | 70 ++++++++++++++++++++++++++++++--------
 1 file changed, 55 insertions(+), 15 deletions(-)

diff --git a/fms_mo/quant/quantizers.py b/fms_mo/quant/quantizers.py
index 0d77f8b8..4e3220b1 100644
--- a/fms_mo/quant/quantizers.py
+++ b/fms_mo/quant/quantizers.py
@@ -123,12 +123,24 @@ def get_activation_quantizer(
             )
         elif qa_mode == "dorefa":
             act_quantizer = dorefa_quantize_activation
-        elif (
-            qa_mode == "max"
-        ):  # NOTE Need to be careful using this for activation, particular to 1 sided.
-            act_quantizer = Qmax(nbits, align_zero=align_zero, minmax=False)
-        elif qa_mode == "minmax":
-            act_quantizer = Qmax(nbits, align_zero=align_zero, minmax=True)
+
+        elif "max" in qa_mode:
+            # NOTE Need to be careful using this for activation, particular to 1 sided.
+            if "min" in qa_mode:
+                act_quantizer = Qmax(nbits, align_zero=align_zero, minmax=True)
+            elif "pertoken" in qa_mode or "perToken" in qa_mode:
+                act_quantizer = QMaxDynamic(nbits, dim=-1)
+            elif "per_channel" in qa_mode or "perCh" in qa_mode:
+                act_quantizer = QMaxDynamic(nbits, dim=-2)
+            elif "sym" in qa_mode:
+                act_quantizer = Qmax(
+                    nbits,
+                    align_zero=True,
+                    minmax=False,
+                    extend_act_range=extend_act_range,
+                )
+            else:
+                act_quantizer = Qmax(nbits, align_zero=align_zero, minmax=False)
         elif qa_mode == "fix":
             act_quantizer = QFixSymmetric(
                 nbits, init_clip_val=clip_val, align_zero=align_zero
@@ -140,13 +152,7 @@ def get_activation_quantizer(
                 minmax=False,
                 extend_act_range=extend_act_range,
             )
-        elif qa_mode == "pactsym":
-            act_quantizer = PACT2Sym(
-                nbits,
-                init_clip_val=clip_val,
-                dequantize=True,
-                inplace=False,
-            )
+
         elif qa_mode == "pactsym+":
             act_quantizer = PACTplusSym(
                 nbits,
@@ -179,8 +185,6 @@ def get_activation_quantizer(
                     perToken=perToken,
                     emulate=True,
                 )
-        elif qa_mode == "pertokenmax":
-            act_quantizer = PerTokenMax(nbits)
         else:
             raise ValueError(f"unrecognized activation quantization mode {qa_mode}")
     else:  # swcap-compatible activation quantizers
@@ -3488,6 +3492,42 @@ def __repr__(self):
         return f"{self.__class__.__name__}(num_bits={self.num_bits}, quantizer=)"
 
 
+class QMaxDynamic(nn.Module):
+    def __init__(self, num_bits, dim=-1):
+        """
+        Per-token or per-channel dynamic quantizer using abs().max() as scale, usually for
+        activation, and could be used for Qbmm M2 as well.
+        (reduce) dim = -1 or 'perToken'/'per_token' -> abs().max() is a column vector (2D input),
+            i.e. per token; dim = -2 or 'perCh'/'per_channel' -> per channel; else ValueError.
+        Zero is aligned so that the levels are symmetric around zero (losing one level).
+        Token length is unknown before running, so scales are computed dynamically at run time and
+        no trainable quantization scale is allowed (unless the input seq length is truly fixed).
+        """
+        super().__init__()
+        self.num_bits = num_bits
+        self.levels = 2 ** (self.num_bits - 1) - 1
+        if isinstance(dim, str):  # map a mode string to its reduce dim
+            if "perCh" in dim or "per_channel" in dim:
+                dim = -2
+            elif "perToken" in dim or "per_token" in dim or "per_Token" in dim:
+                dim = -1
+        if dim not in (-1, -2):  # also catches unrecognized strings
+            raise ValueError(
+                f"Reduce dim can only be [-1, -2] or ['perCh', 'perToken'] but found {dim}"
+            )
+        # Set on every valid path (int or string spec); forward() relies on it.
+        self.reduce_dim = dim
+
+    def forward(self, input_tensor):
+        """Fake-quantize: scale = clamp(amax, 1e-5) / levels; return round(x / scale) * scale."""
+        amax_dim = input_tensor.abs().max(dim=self.reduce_dim, keepdim=True)[0]
+        scales = amax_dim.clamp(min=1e-5).div(self.levels)
+        return input_tensor.div(scales).round().mul(scales)
+
+    def __repr__(self):
+        return f"{self.__class__.__name__}(num_bits={self.num_bits}, quantizer=)"
+
+
 class Qdynamic(nn.Module):
     def __init__(
         self,

From 22a81bfc51183472d219def3c84d480332d47775 Mon Sep 17 00:00:00 2001
From: Iqbal Saraf <iqbal.saraf@ibm.com>
Date: Wed, 16 Jul 2025 19:24:50 +0000
Subject: [PATCH 2/3] Re-add pactsym branch dropped in previous patch

---
 fms_mo/quant/quantizers.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/fms_mo/quant/quantizers.py b/fms_mo/quant/quantizers.py
index 4e3220b1..3af4f45b 100644
--- a/fms_mo/quant/quantizers.py
+++ b/fms_mo/quant/quantizers.py
@@ -152,7 +152,13 @@ def get_activation_quantizer(
                 minmax=False,
                 extend_act_range=extend_act_range,
             )
-
+        elif qa_mode == "pactsym":
+            act_quantizer = PACT2Sym(
+                nbits,
+                init_clip_val=clip_val,
+                dequantize=True,
+                inplace=False,
+            )
         elif qa_mode == "pactsym+":
             act_quantizer = PACTplusSym(
                 nbits,

From 343abc769e6664399fee3d1b6eeb0e425595faeb Mon Sep 17 00:00:00 2001
From: Iqbal Saraf <iqbal.saraf@ibm.com>
Date: Wed, 16 Jul 2025 19:55:09 +0000
Subject: [PATCH 3/3] Remove maxsym branch now covered by the generic max branch

---
 fms_mo/quant/quantizers.py | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/fms_mo/quant/quantizers.py b/fms_mo/quant/quantizers.py
index 3af4f45b..4b61eb91 100644
--- a/fms_mo/quant/quantizers.py
+++ b/fms_mo/quant/quantizers.py
@@ -145,13 +145,6 @@ def get_activation_quantizer(
             act_quantizer = QFixSymmetric(
                 nbits, init_clip_val=clip_val, align_zero=align_zero
             )
-        elif qa_mode == "maxsym":
-            act_quantizer = Qmax(
-                nbits,
-                align_zero=True,
-                minmax=False,
-                extend_act_range=extend_act_range,
-            )
         elif qa_mode == "pactsym":
             act_quantizer = PACT2Sym(
                 nbits,