Fix overflow of padding/unpadding kernel (#2548)

adamantboy · ksivaman · web-flow · commit 697b52cbde6a · 2025-12-31T17:21:24.000+05:30
Signed-off-by: fuyue.lj <fuyue.lj@antgroup.com>
Co-authored-by: Kirthi Shankar Sivamani <ksivamani@nvidia.com>
diff --git a/transformer_engine/common/util/padding.cu b/transformer_engine/common/util/padding.cu
@@ -94,14 +94,15 @@ __global__ void __launch_bounds__(threads_per_block) multi_padding_kernel(MultiP
 #pragma unroll
     for (int i2 = 0; i2 < nvec; ++i2) {
       const int row = tile_row + i1 * nvec + i2;
+      size_t row_offset = static_cast<size_t>(row) * row_length;
       const int col = tile_col + j1 * nvec;
       Vec local_input;
       Vec local_output;
       local_input.clear();
       if (row < num_rows) {
         for (int j2 = 0; j2 < nvec; ++j2) {
           if (col + j2 < row_length) {
-            local_input.data.elt[j2] = input[row * row_length + col + j2];
+            local_input.data.elt[j2] = input[row_offset + col + j2];
           }
         }
       }
@@ -112,14 +113,14 @@ __global__ void __launch_bounds__(threads_per_block) multi_padding_kernel(MultiP
       if (row < num_rows) {
         for (int j2 = 0; j2 < nvec; ++j2) {
           if (col + j2 < row_length) {
-            output[row * row_length + col + j2] = local_output.data.elt[j2];
+            output[row_offset + col + j2] = local_output.data.elt[j2];
           }
         }
       } else if (row < padded_num_rows) {
         // padding
         for (int j2 = 0; j2 < nvec; ++j2) {
           if (col + j2 < row_length) {
-            output[row * row_length + col + j2] = local_zero;
+            output[row_offset + col + j2] = local_zero;
           }
         }
       }
@@ -178,14 +179,15 @@ __global__ void __launch_bounds__(threads_per_block) multi_unpadding_kernel(Mult
 #pragma unroll
     for (int i2 = 0; i2 < nvec; ++i2) {
       const int row = tile_row + i1 * nvec + i2;
+      size_t row_offset = static_cast<size_t>(row) * row_length;
       const int col = tile_col + j1 * nvec;
       Vec local_input;
       Vec local_output;
       local_input.clear();
       if (row < num_rows) {
         for (int j2 = 0; j2 < nvec; ++j2) {
           if (col + j2 < row_length) {
-            local_input.data.elt[j2] = input[row * row_length + col + j2];
+            local_input.data.elt[j2] = input[row_offset + col + j2];
           }
         }
       }
@@ -196,7 +198,7 @@ __global__ void __launch_bounds__(threads_per_block) multi_unpadding_kernel(Mult
       if (row < num_rows) {
         for (int j2 = 0; j2 < nvec; ++j2) {
           if (col + j2 < row_length) {
-            output[row * row_length + col + j2] = local_output.data.elt[j2];
+            output[row_offset + col + j2] = local_output.data.elt[j2];
           }
         }
       }

Original file line number	Diff line number	Diff line change
`@@ -94,14 +94,15 @@ __global__ void __launch_bounds__(threads_per_block) multi_padding_kernel(MultiP`
`94`	`94`	`#pragma unroll`
`95`	`95`	`for (int i2 = 0; i2 < nvec; ++i2) {`
`96`	`96`	`const int row = tile_row + i1 * nvec + i2;`
	`97`	`+ size_t row_offset = static_cast<size_t>(row) * row_length;`
`97`	`98`	`const int col = tile_col + j1 * nvec;`
`98`	`99`	`Vec local_input;`
`99`	`100`	`Vec local_output;`
`100`	`101`	`local_input.clear();`
`101`	`102`	`if (row < num_rows) {`
`102`	`103`	`for (int j2 = 0; j2 < nvec; ++j2) {`
`103`	`104`	`if (col + j2 < row_length) {`
`104`		`- local_input.data.elt[j2] = input[row * row_length + col + j2];`
	`105`	`+ local_input.data.elt[j2] = input[row_offset + col + j2];`
`105`	`106`	`}`
`106`	`107`	`}`
`107`	`108`	`}`
`@@ -112,14 +113,14 @@ __global__ void __launch_bounds__(threads_per_block) multi_padding_kernel(MultiP`
`112`	`113`	`if (row < num_rows) {`
`113`	`114`	`for (int j2 = 0; j2 < nvec; ++j2) {`
`114`	`115`	`if (col + j2 < row_length) {`
`115`		`- output[row * row_length + col + j2] = local_output.data.elt[j2];`
	`116`	`+ output[row_offset + col + j2] = local_output.data.elt[j2];`
`116`	`117`	`}`
`117`	`118`	`}`
`118`	`119`	`} else if (row < padded_num_rows) {`
`119`	`120`	`// padding`
`120`	`121`	`for (int j2 = 0; j2 < nvec; ++j2) {`
`121`	`122`	`if (col + j2 < row_length) {`
`122`		`- output[row * row_length + col + j2] = local_zero;`
	`123`	`+ output[row_offset + col + j2] = local_zero;`
`123`	`124`	`}`
`124`	`125`	`}`
`125`	`126`	`}`
`@@ -178,14 +179,15 @@ __global__ void __launch_bounds__(threads_per_block) multi_unpadding_kernel(Mult`
`178`	`179`	`#pragma unroll`
`179`	`180`	`for (int i2 = 0; i2 < nvec; ++i2) {`
`180`	`181`	`const int row = tile_row + i1 * nvec + i2;`
	`182`	`+ size_t row_offset = static_cast<size_t>(row) * row_length;`
`181`	`183`	`const int col = tile_col + j1 * nvec;`
`182`	`184`	`Vec local_input;`
`183`	`185`	`Vec local_output;`
`184`	`186`	`local_input.clear();`
`185`	`187`	`if (row < num_rows) {`
`186`	`188`	`for (int j2 = 0; j2 < nvec; ++j2) {`
`187`	`189`	`if (col + j2 < row_length) {`
`188`		`- local_input.data.elt[j2] = input[row * row_length + col + j2];`
	`190`	`+ local_input.data.elt[j2] = input[row_offset + col + j2];`
`189`	`191`	`}`
`190`	`192`	`}`
`191`	`193`	`}`
`@@ -196,7 +198,7 @@ __global__ void __launch_bounds__(threads_per_block) multi_unpadding_kernel(Mult`
`196`	`198`	`if (row < num_rows) {`
`197`	`199`	`for (int j2 = 0; j2 < nvec; ++j2) {`
`198`	`200`	`if (col + j2 < row_length) {`
`199`		`- output[row * row_length + col + j2] = local_output.data.elt[j2];`
	`201`	`+ output[row_offset + col + j2] = local_output.data.elt[j2];`
`200`	`202`	`}`
`201`	`203`	`}`
`202`	`204`	`}`