[PyTorch debug] Fix test for debug tools (#2507)

pggPL · timmoon10 · web-flow · commit 2886cbce11ae · 2025-12-15T15:33:37.000-08:00
* Skip delayed wgrad tests in distributed numerics when debug mode is enabled

Signed-off-by: Pawel Gadzinski <pgadzinski@nvidia.com>

* fix

Signed-off-by: Pawel Gadzinski <pgadzinski@nvidia.com>

---------

Signed-off-by: Pawel Gadzinski <pgadzinski@nvidia.com>
Co-authored-by: Tim Moon <4406448+timmoon10@users.noreply.github.com>
diff --git a/qa/L1_pytorch_distributed_unittest/test.sh b/qa/L1_pytorch_distributed_unittest/test.sh
@@ -44,7 +44,7 @@ python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_cast_master_weights_
 
 pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_distributed.xml $TE_PATH/tests/pytorch/debug/test_distributed.py --feature_dirs=$NVTE_TEST_NVINSPECT_FEATURE_DIRS || test_fail "debug test_distributed.py"
 # standard numerics tests with initialized debug
-NVTE_TEST_NVINSPECT_ENABLED=True NVTE_TEST_NVINSPECT_CONFIG_FILE=$NVTE_TEST_NVINSPECT_DUMMY_CONFIG_FILE NVTE_TEST_NVINSPECT_FEATURE_DIRS=$NVTE_TEST_NVINSPECT_FEATURE_DIRS pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_numerics_2.xml $TE_PATH/tests/pytorch/distributed/test_numerics.py || test_fail "debug test_numerics.py"
+NVTE_TEST_NVINSPECT_ENABLED=1 NVTE_TEST_NVINSPECT_CONFIG_FILE=$NVTE_TEST_NVINSPECT_DUMMY_CONFIG_FILE NVTE_TEST_NVINSPECT_FEATURE_DIRS=$NVTE_TEST_NVINSPECT_FEATURE_DIRS pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_numerics_2.xml $TE_PATH/tests/pytorch/distributed/test_numerics.py || test_fail "debug test_numerics.py"
 
 if [ "$RET" -ne 0 ]; then
     echo "Error in the following test cases:$FAILED_CASES"
diff --git a/tests/pytorch/distributed/run_numerics.py b/tests/pytorch/distributed/run_numerics.py
@@ -38,8 +38,9 @@
 NCCL_WORLD = None
 LOSS_FN = nn.MSELoss()
 QUANTIZATION = None
+NVTE_TEST_NVINSPECT_ENABLED = int(os.environ.get("NVTE_TEST_NVINSPECT_ENABLED") or "0")
 
-if os.environ.get("NVTE_TEST_NVINSPECT_ENABLED", False):
+if NVTE_TEST_NVINSPECT_ENABLED:
     # The numerics of all the layers should work the same,
     # when debug=True. I fed them with dummy feature
     # to prevent switching off debug, which can happen if
@@ -745,6 +746,8 @@ def test_linear():
     for kwargs in kwargs_list:
         if kwargs.get("save_original_input", False) and QUANTIZATION == "fp8":
             continue
+        if kwargs.get("delay_wgrad_compute", False) and NVTE_TEST_NVINSPECT_ENABLED:
+            continue
         for parallel_mode in ["column", "row"]:
             for sequence_parallel in [False, True]:
                 _test_linear(parallel_mode, sequence_parallel, **kwargs)
@@ -924,6 +927,8 @@ def test_layernorm_linear():
     ]
 
     for kwargs in kwargs_list:
+        if kwargs.get("delay_wgrad_compute", False) and NVTE_TEST_NVINSPECT_ENABLED:
+            continue
         for parallel_mode in ["column"]:
             for sequence_parallel in [False, True]:
                 _test_layernorm_linear(parallel_mode, sequence_parallel, **kwargs)
@@ -1034,6 +1039,8 @@ def test_layernorm_mlp():
     ]
 
     for kwargs in kwargs_list:
+        if kwargs.get("delay_wgrad_compute", False) and NVTE_TEST_NVINSPECT_ENABLED:
+            continue
         for set_parallel_mode in [True]:
             for sequence_parallel in [False, True]:
                 _test_layernorm_mlp(set_parallel_mode, sequence_parallel, **kwargs)