Skip to content

Commit 36f2dfd

Browse files
fix ce loss calculation when some tokens are ignored (#2476)
* fix ce loss with ignore idx Signed-off-by: ykarnati <ykarnati@nvidia.com> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Signed-off-by: ykarnati <ykarnati@nvidia.com> * remove fix comments Signed-off-by: ykarnati <ykarnati@nvidia.com> * fallback divisor to 1 Signed-off-by: ykarnati <ykarnati@nvidia.com> * have arg for n_rows and n_non_ignore Signed-off-by: ykarnati <ykarnati@nvidia.com> * fuse n_non_ignore to softmax kernel Signed-off-by: ykarnati <ykarnati@nvidia.com> * fix incorrect arg Signed-off-by: ykarnati <ykarnati@nvidia.com> --------- Signed-off-by: ykarnati <ykarnati@nvidia.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent 8c9f7c2 commit 36f2dfd

File tree

3 files changed

+37
-5
lines changed

3 files changed

+37
-5
lines changed

tests/pytorch/test_parallel_cross_entropy.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,7 @@ def one_iteration_test(
8989
# Check that loss and grad input match
9090
tols = dtype_tols(dtype)
9191
test_loss = test_loss.to(dtype=torch.float64, device="cpu")
92-
ref_loss = test_loss.to(dtype=torch.float64, device="cpu")
92+
ref_loss = ref_loss.to(dtype=torch.float64, device="cpu")
9393
ref_loss = ref_loss.reshape(test_loss.size())
9494
test_grad_input = self.input_test.grad.to(dtype=torch.float64, device="cpu")
9595
ref_grad_input = self.input_ref.grad.to(dtype=torch.float64, device="cpu")
@@ -154,3 +154,16 @@ def test_ignore_idx(self):
154154
reduce_loss=False,
155155
ignore_idx=True,
156156
)
157+
158+
def test_ignore_idx_reduced_loss(self):
159+
"""Test ignore_idx with reduce_loss=True"""
160+
self.generate_iters(5)
161+
self.generate_infra(True, 0) # reduce_loss=True
162+
for i in range(self.iters):
163+
self.one_iteration_test(
164+
dtype=torch.float32,
165+
swap_dim=random.choice([True, False]),
166+
label_smoothing=0,
167+
reduce_loss=True,
168+
ignore_idx=True,
169+
)

transformer_engine/common/triton/cross_entropy.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,8 @@ def online_softmax_kernel(
1818
m_d_X_y_stride,
1919
rank,
2020
n_cols,
21+
ignore_idx,
22+
n_non_ignore,
2123
BLOCK_SIZE: tl.constexpr,
2224
):
2325
"""
@@ -32,6 +34,8 @@ def online_softmax_kernel(
3234
m_d_X_y_stride (int): The stride of the m/d/X_y tensor.
3335
rank (int): The rank of this device in the TP group.
3436
n_cols (int): The number of columns in the input tensor.
37+
ignore_idx (int): The index to ignore for loss calculation.
38+
n_non_ignore: The number of non-ignored elements in the batch.
3539
BLOCK_SIZE (int): The block size for Triton operations.
3640
"""
3741

@@ -44,6 +48,9 @@ def online_softmax_kernel(
4448
Y_ptr += program_id * Y_stride
4549
y = tl.load(Y_ptr)
4650

51+
if y != ignore_idx:
52+
tl.atomic_add(n_non_ignore, 1)
53+
4754
vocab_start_idx = rank * n_cols
4855
vocab_end_idx = (rank + 1) * n_cols
4956
if y >= vocab_start_idx:
@@ -89,6 +96,7 @@ def cross_entropy_kernel(
8996
world_size,
9097
ignore_idx,
9198
n_cols,
99+
n_rows,
92100
n_non_ignore,
93101
reduce_loss: tl.constexpr,
94102
label_smoothing: tl.constexpr,
@@ -110,12 +118,14 @@ def cross_entropy_kernel(
110118
world_size (int): The size of world involved in this distributed loss calculation.
111119
ignore_idx (int): Tokens to be ignored for loss and gradient calculation.
112120
n_cols (int): The number of columns in the input tensor.
113-
n_non_ignore (int): The number of non-ignored elements in the batch.
121+
n_rows (int): The number of rows in the batch (B * SQ), used for buffer indexing.
122+
n_non_ignore: The number of non-ignored elements in the batch.
114123
label_smoothing (float): The amount of smoothing when computing the loss, where 0.0 means no smoothing.
115124
BLOCK_SIZE (int): The block size for Triton operations.
116125
"""
117126

118127
program_id = tl.program_id(0).to(tl.int64)
128+
n_non_ignore = tl.load(n_non_ignore)
119129

120130
# locate the start index
121131
X_ptr += program_id * X_stride
@@ -140,7 +150,7 @@ def cross_entropy_kernel(
140150
ori_X_y = tl.load(m_d_X_y_ptr + (2 * m_d_X_y_stride))
141151

142152
for i in range(1, world_size):
143-
offset = i * 3 * n_non_ignore * m_d_X_y_stride
153+
offset = i * 3 * n_rows * m_d_X_y_stride
144154
access_ptr = m_d_X_y_ptr + offset
145155
m_new = tl.load(access_ptr)
146156
d_new = tl.load(access_ptr + m_d_X_y_stride)

transformer_engine/pytorch/triton/cross_entropy.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,8 @@ def cross_entropy_forward(
4646
# tensor to hold this rank's m/d/X_y values
4747
m_d_X_y = torch.zeros(n_rows * 3, dtype=torch.float32, device=_input.device)
4848

49+
n_non_ignore = torch.zeros(1, dtype=torch.int64, device=_input.device)
50+
4951
# ensure _input and target are contiguous in the last dimension
5052
if _input.stride(-1) != 1:
5153
_input = _input.contiguous()
@@ -63,10 +65,14 @@ def cross_entropy_forward(
6365
m_d_X_y_stride=m_d_X_y.stride(-1),
6466
rank=rank,
6567
n_cols=V,
68+
ignore_idx=ignore_idx,
69+
n_non_ignore=n_non_ignore,
6670
BLOCK_SIZE=BLOCK_SIZE,
6771
num_warps=32,
6872
)
6973

74+
n_non_ignore = torch.clamp(n_non_ignore, min=1)
75+
7076
world_size = 1 if dist_process_group is None else dist.get_world_size(dist_process_group)
7177

7278
if world_size > 1:
@@ -90,14 +96,17 @@ def cross_entropy_forward(
9096
world_size=world_size,
9197
ignore_idx=ignore_idx,
9298
n_cols=V,
93-
n_non_ignore=n_rows,
99+
n_rows=n_rows,
100+
n_non_ignore=n_non_ignore,
94101
reduce_loss=reduce_loss,
95102
label_smoothing=label_smoothing,
96103
BLOCK_SIZE=BLOCK_SIZE,
97104
num_warps=32,
98105
)
99106

100-
loss = torch.reshape(loss_1d, (B, SQ)) if not reduce_loss else (torch.sum(loss_1d) / n_rows)
107+
loss = (
108+
torch.reshape(loss_1d, (B, SQ)) if not reduce_loss else (torch.sum(loss_1d) / n_non_ignore)
109+
)
101110

102111
return loss, _input
103112

0 commit comments

Comments (0)