foundation-model-stack · chichun-charlie-liu · Oct 17, 2025 · Oct 14, 2025 · Oct 15, 2025 · Oct 15, 2025
@@ -13,9 +13,6 @@
 # limitations under the License.
 """Torch registration of FP8xFP8 operation for attention BMMs."""
 
-# Standard
-from typing import Optional
-
 # Third Party
 from torch import Tensor
 import torch
@@ -29,61 +26,6 @@
 # open issue in PyLint: https://github.com/pytorch/pytorch/issues/119482
 
 
-def _scaled_mm_cpu_out(
-    mat1: Tensor,
-    mat2: Tensor,
-    scale1: Tensor,
-    scale2: Tensor,
-    bias: Optional[Tensor] = None,
-    scale_result: Optional[Tensor] = None,
-    out_dtype: Optional[torch.dtype] = None,
-    use_fast_accum: bool = False,
-    *,
-    out: Optional[Tensor] = None,
-) -> Tensor:
-    if out_dtype is None:
-        out_dtype = torch.float32
-    mat1 = (mat1.to(dtype=out_dtype) * scale1).to(dtype=out_dtype)
-    mat2 = (mat2.to(dtype=out_dtype) * scale2).to(dtype=out_dtype)
-
-    if bias is not None:
-        ret = torch.addmm(bias, mat1, mat2).to(dtype=out_dtype)
-    else:
-        ret = torch.mm(mat1, mat2).to(dtype=out_dtype)
-
-    if out is not None:
-        out.copy_(ret)
-        return out
-    return ret
-
-
-torch.library.register_kernel(torch.ops.aten._scaled_mm.out, "cpu", _scaled_mm_cpu_out)
-
-
-@torch.library.register_kernel("aten::_scaled_mm", "cpu")
-def _scaled_mm_cpu(
-    mat1: Tensor,
-    mat2: Tensor,
-    scale1: Tensor,
-    scale2: Tensor,
-    bias: Optional[Tensor] = None,
-    scale_result: Optional[Tensor] = None,
-    out_dtype: Optional[torch.dtype] = None,
-    use_fast_accum: bool = False,
-) -> Tensor:
-    return _scaled_mm_cpu_out(
-        mat1,
-        mat2,
-        scale1,
-        scale2,
-        bias,
-        scale_result,
-        out_dtype,
-        use_fast_accum,
-        out=None,
-    )
-
-
 @torch.library.custom_op("spyre::scaled_bmm", mutates_args=())
 def spyre_scaled_bmm(
     mat1: Tensor,

@@ -51,9 +51,9 @@ def test_fp8_op() -> None:
     # Local
     from fms_mo.aiu_addons.fp8.fp8_attn import _math_fp8_compute_op
 
-    query = torch.randn((1, 32, 64, 128), dtype=torch.bfloat16, device="cuda")
-    key = torch.randn((1, 32, 64, 128), dtype=torch.bfloat16, device="cuda")
-    value = torch.randn((1, 32, 64, 128), dtype=torch.bfloat16, device="cuda")
+    query = torch.randn((1, 64, 32, 128), dtype=torch.bfloat16, device="cuda")
+    key = torch.randn((1, 64, 32, 128), dtype=torch.bfloat16, device="cuda")
+    value = torch.randn((1, 64, 32, 128), dtype=torch.bfloat16, device="cuda")
 
     out = _math_fp8_compute_op(query, key, value, 32, 32, 0.0, None)
     assert out.size() == query.size()