Commit d79bf21

greptile fixes
Signed-off-by: Alp Dener <adener@nvidia.com>
1 parent dd8eaf3 commit d79bf21

5 files changed: 20 additions & 15 deletions


tests/pytorch/distributed/run_gemm_with_overlap.py
Lines changed: 0 additions & 2 deletions

@@ -408,7 +408,6 @@ def dist_print(msg, src=None, info=False, error=False, section=False, group=None
     if opts.comm_type == tex.CommOverlapType.AG:
         # (M/P, N) -> overlapped AG -> (M, N) x (K/P, N)^T = (M, K/P)
         local_kernel_t_shape = (ffn_hidden_size // tp_size, hidden_size)
-        local_kernel2_t_shape = (0, )
     local_inp_shape = (outer_size // tp_size, hidden_size)
     if ub_obj2 is not None:
         local_kernel2_t_shape = (hidden_size, ffn_hidden_size // tp_size)

@@ -479,7 +478,6 @@ def dist_print(msg, src=None, info=False, error=False, section=False, group=None
         ref_g = torch.stack(bulk_inp_list).sum(dim=0)
     else:
         ref_g = torch.matmul(inp_g, ker_g)
-    ref2_g = (0, )
     if ub_obj2 is not None:
         inp2_g = torch.nn.functional.gelu(ref_g)  # pylint: disable=not-callable
         ref2_g = torch.matmul(inp2_g, ker2_g)

transformer_engine/common/comm_gemm_overlap/comm_gemm_overlap.cpp
Lines changed: 1 addition & 1 deletion

@@ -332,7 +332,7 @@ void CommOverlapCore::cublasmp_gemm_rs(const TensorWrapper &A, bool transa, cons
   int64_t m = transa ? A.size(0) : A.size(1);
   int64_t n = transb ? B.size(1) : B.size(0);
   int64_t k_local = transa ? A.size(1) : A.size(0);
-  int64_t k = k * _tp_size;
+  int64_t k = k_local * _tp_size;

   nvte_gemm_reduce_scatter(_cublasmp_ctx, m, n, k, A.data(), B.data(), D.data(), bias.data(),
                            pre_gelu_out.data(), transa, transb, grad, accumulate, _num_comm_sm,
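
The removed line initialized `k` from itself before it had a value, so it read an indeterminate value instead of scaling the per-rank contraction dimension `k_local` by the tensor-parallel group size. A minimal standalone sketch of the corrected arithmetic, with made-up sizes and outside of Transformer Engine:

    // Standalone sketch (hypothetical sizes, not TE code). The buggy line read
    // `k` in its own initializer; the fix derives the global contraction
    // dimension from the per-rank slice.
    #include <cstdint>
    #include <cstdio>

    int main() {
      const int64_t tp_size = 4;      // assumed tensor-parallel group size
      const int64_t k_local = 1024;   // per-rank slice of the contraction dimension

      // int64_t k = k * tp_size;     // bug: `k` is used before it is initialized
      int64_t k = k_local * tp_size;  // fix: global K = local slice * TP size

      std::printf("global k = %lld\n", static_cast<long long>(k));
      return 0;
    }

Most compilers can warn about the self-initializing form (for example Clang's -Wuninitialized).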

transformer_engine/jax/csrc/extensions/cgemm_helper.cpp
Lines changed: 4 additions & 3 deletions

@@ -132,7 +132,7 @@ void CommunicatorHandler::init(int num_total_devices, int num_devices_per_proces
   NVTE_CHECK_NCCL(ncclGroupEnd());

   // Allocate device memory for barrier operations
-  NVTE_CHECK_CUDA(cudaMalloc(&reinterpret_cast<int>(handler._device_barrier), sizeof(int)));
+  NVTE_CHECK_CUDA(cudaMalloc(&handler._device_barrier, sizeof(int)));

   handler._initialize = true;
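
The deleted call took the address of a `reinterpret_cast` result, which is not an lvalue; `cudaMalloc` wants the address of the pointer variable itself. A standalone sketch of the corrected call, assuming a plain `int *` device flag rather than TE's `CommunicatorHandler` member:

    // Minimal CUDA runtime sketch (hypothetical names, not the TE handler).
    #include <cuda_runtime.h>
    #include <cstdio>

    int main() {
      int *device_barrier = nullptr;  // device-side barrier flag

      // Pass the address of the pointer variable; the typed cudaMalloc overload
      // in cuda_runtime.h accepts int ** directly.
      cudaError_t err = cudaMalloc(&device_barrier, sizeof(int));
      if (err != cudaSuccess) {
        std::fprintf(stderr, "cudaMalloc failed: %s\n", cudaGetErrorString(err));
        return 1;
      }

      cudaMemset(device_barrier, 0, sizeof(int));  // initialize the flag to zero
      cudaFree(device_barrier);
      return 0;
    }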

@@ -195,8 +195,9 @@ CommOverlapCore *CollectiveGemmPlanRegistry::get_executor(std::vector<size_t> bu
   std::unique_ptr<CommOverlapCore> executor;
   if (use_cublasmp) {
     executor = std::make_unique<CommOverlapP2PBase>(
-        reinterpret_cast<int64_t>(comm_handler.get_comm_for_current_device()), comm_handler.tp_size,
-        comm_handler.get_tp_domain_id(), cgemm_config.num_comm_sm, cgemm_config.aggregate_ag);
+        reinterpret_cast<int64_t>(comm_handler.get_comm_for_current_device()),
+        comm_handler.get_tp_domain_id(), comm_handler.tp_size, cgemm_config.num_comm_sm,
+        cgemm_config.aggregate_ag);
   } else {
     executor = std::make_unique<CommOverlapP2PBase>(
         buffer_shape, dtype, comm_handler.get_global_rank(), comm_handler.num_total_devices,
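
The reorder above matters because `comm_handler.get_tp_domain_id()` and `comm_handler.tp_size` are both plain integers, so passing them in the wrong order compiles cleanly and only misbehaves at runtime. A generic sketch of that pitfall, using a hypothetical function rather than the real `CommOverlapP2PBase` constructor:

    // Hypothetical signature: the rank within the TP domain comes before the
    // domain size, mirroring the corrected call above.
    #include <cassert>

    void configure_overlap(int tp_rank, int tp_size) {
      // A swapped call site usually trips this check at runtime, never at compile time.
      assert(tp_rank >= 0 && tp_rank < tp_size);
    }

    int main() {
      const int tp_rank = 7, tp_size = 8;
      // configure_overlap(tp_size, tp_rank);  // compiles, but passes 8 as the rank
      configure_overlap(tp_rank, tp_size);     // matches the declared parameter order
      return 0;
    }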

transformer_engine/pytorch/csrc/extensions.h
Lines changed: 13 additions & 7 deletions

@@ -525,13 +525,19 @@ class CommOverlapHelper : torch::CustomClassHolder {
   void ub_allgather(void *globaldata, size_t globalbytes, void *localdata, size_t localbytes,
                     ExtComm comm);

-  void ub_barrier(ExtComm comm);a
+  void ub_barrier(ExtComm comm);

   int64_t get_nccl_comm_ptr(std::string comm_name) {
+#ifdef USE_C10_NCCL
     NVTE_CHECK(backend_is_nccl,
                "Comm+GEMM overlap with cuBLASMp backend requires a tensor-parallel process ",
                "group with NCCL backend.");
-    return reinterpret_cast<c10d::ProcessGroupNCCL *>(pgs[comm_name])->getCommPtr();
+    c10d::ProcessGroupNCCL *nccl_pg = reinterpret_cast<c10d::ProcessGroupNCCL *>(pgs[comm_name]);
+    return nccl_pg->getCommPtr();
+#else
+    NVTE_ERROR("Internal TE Error: CommOverlapHelper::get_nccl_comm_ptr() is an internal API that ",
+               "should only be used when TE is built with the NVTE_WITH_CUBLASMP=1 flag.");
+#endif
   }
 };
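
The new `#ifdef USE_C10_NCCL` guard compiles the NCCL-dependent lookup only when that backend is available and turns any other use of the accessor into a hard error. A generic sketch of the same guard pattern, with hypothetical macro and function names rather than TE's:

    // Compile-time feature gate (names are illustrative only).
    #include <cstdint>
    #include <stdexcept>

    // #define WITH_NCCL_BACKEND  // normally defined by the build system

    int64_t get_backend_comm_ptr() {
    #ifdef WITH_NCCL_BACKEND
      // Feature build: return the stored communicator handle.
      static int64_t comm_handle = 0;  // stand-in for a real communicator pointer
      return comm_handle;
    #else
      // Fallback build: fail loudly instead of returning a bogus pointer.
      throw std::runtime_error(
          "get_backend_comm_ptr() requires a build with WITH_NCCL_BACKEND defined");
    #endif
    }

    int main() {
      try {
        (void)get_backend_comm_ptr();
      } catch (const std::exception &) {
        // Expected when the feature flag is not defined.
      }
      return 0;
    }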

@@ -542,11 +548,11 @@ class CommOverlap : torch::CustomClassHolder, public transformer_engine::CommOve
               int num_max_streams = NVTE_COMM_OVERLAP_MAX_STREAMS, int comm_cga_size = 2,
               int gemm_priority = 0, int comm_priority = 0, int num_comm_sm = 16,
               bool set_sm_margin = true, bool atomic_gemm = false,
-              bool rs_overlap_first_gemm = false);
+              bool rs_overlap_first_gemm= false);

-  CommOverlap(CommOverlapHelper *helper, int tp_size, int tp_rank, int num_comm_sm = 16,
+  CommOverlap(CommOverlapHelper *helper, int tp_rank, int tp_size, int num_comm_sm = 16,
               bool atomic_gemm = false)
-      : CommOverlapBase(helper->get_nccl_comm_ptr("intra"), tp_size, tp_rank, num_comm_sm,
+      : CommOverlapBase(helper->get_nccl_comm_ptr("intra"), tp_rank, tp_size, num_comm_sm,
                         atomic_gemm) {}

   ~CommOverlap() {}

@@ -570,9 +576,9 @@ class CommOverlapP2P : torch::CustomClassHolder, public transformer_engine::Comm
               bool set_sm_margin = true, bool atomic_gemm = false, bool use_ce = true,
               bool aggregate = false);

-  CommOverlapP2P(CommOverlapHelper *helper, int tp_size, int tp_rank, int num_comm_sm = 16,
+  CommOverlapP2P(CommOverlapHelper *helper, int tp_rank, int tp_size, int num_comm_sm = 16,
                  bool atomic_gemm = false)
-      : CommOverlapP2PBase(helper->get_nccl_comm_ptr("intra"), tp_size, tp_rank, num_comm_sm,
+      : CommOverlapP2PBase(helper->get_nccl_comm_ptr("intra"), tp_rank, tp_size, num_comm_sm,
                            atomic_gemm) {}

   ~CommOverlapP2P() {}

transformer_engine/pytorch/csrc/extensions/pybind.cpp
Lines changed: 2 additions & 2 deletions

@@ -491,7 +491,7 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
            py::arg("num_comm_sm") = 16, py::arg("set_sm_margin") = true,
            py::arg("atomic_gemm") = false, py::arg("rs_overlap_first_gemm") = false)
       .def(py::init<CommOverlapHelper *, int, int, int, bool>(), py::arg("helper"),
-           py::arg("tp_size"), py::arg("tp_rank"), py::arg("num_comm_sm") = 0,
+           py::arg("tp_rank"), py::arg("tp_size"), py::arg("num_comm_sm") = 0,
            py::arg("atomic_gemm") = false, py::call_guard<py::gil_scoped_release>())
       .def("copy_into_buffer", &CommOverlap::copy_into_buffer, py::arg("input"),
            py::arg("local_chunk") = false)

@@ -512,7 +512,7 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
            py::arg("set_sm_margin") = false, py::arg("atomic_gemm") = false,
            py::arg("use_ce") = true, py::arg("aggregate") = false)
       .def(py::init<CommOverlapHelper *, int, int, int, bool>(), py::arg("helper"),
-           py::arg("tp_size"), py::arg("tp_rank"), py::arg("num_comm_sm") = 0,
+           py::arg("tp_rank"), py::arg("tp_size"), py::arg("num_comm_sm") = 0,
            py::arg("atomic_gemm") = false, py::call_guard<py::gil_scoped_release>())
       .def("copy_into_buffer", &CommOverlapP2P::copy_into_buffer, py::arg("input"),
            py::arg("local_chunk") = false)
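
Swapping the `py::arg` labels keeps the keyword names aligned with the reordered C++ constructors in extensions.h: in pybind11, `py::arg` entries label the bound callable's positional parameters in declaration order. A toy sketch of the idea (not the real TE bindings):

    // Toy pybind11 module (class and module names are made up).
    #include <pybind11/pybind11.h>
    namespace py = pybind11;

    struct Overlap {
      int rank, size;
      Overlap(int tp_rank, int tp_size) : rank(tp_rank), size(tp_size) {}
    };

    PYBIND11_MODULE(overlap_demo, m) {
      py::class_<Overlap>(m, "Overlap")
          // Keyword names follow the constructor's parameter order: rank, then size.
          .def(py::init<int, int>(), py::arg("tp_rank"), py::arg("tp_size"));
    }

From Python, `Overlap(tp_rank=3, tp_size=8)` and `Overlap(3, 8)` then build the same object; with the labels transposed, the keyword form would silently feed the group size into the rank parameter.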
