From 41b64cdcb03b120684d7b6ea1a6d5a43d200254f Mon Sep 17 00:00:00 2001
From: Bo Li
Date: Sat, 7 Feb 2026 19:17:46 +0800
Subject: [PATCH 1/2] perf: use zero_grad(set_to_none=True) to reduce memory
 bandwidth

---
 training/train.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/training/train.py b/training/train.py
index d3139f77..873b5fcf 100644
--- a/training/train.py
+++ b/training/train.py
@@ -664,7 +664,7 @@ def wrap_ddp(model):
         for pfc in list_module_pfc:
             clip_grad_norm_(pfc.parameters(), max_norm=5, norm_type=2)
         opt.step()
-        opt.zero_grad()
+        opt.zero_grad(set_to_none=True)
         lr_scheduler.step()



From e2be8ff591751d8b1c61914851d208a66d667b13 Mon Sep 17 00:00:00 2001
From: Bo Li
Date: Sat, 7 Feb 2026 19:18:03 +0800
Subject: [PATCH 2/2] perf: only save HuggingFace checkpoint at final step to
 reduce I/O stalls

---
 training/train.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/training/train.py b/training/train.py
index 873b5fcf..c8e66c28 100644
--- a/training/train.py
+++ b/training/train.py
@@ -692,8 +692,6 @@ def wrap_ddp(model):
                 list_head_names=args.list_head_names,
                 keep_num=20,
             )
-            # Also save in HuggingFace format
-            save_hf_checkpoint(args.output, backbone, global_step=global_step, image_size=args.image_size[0])

             if global_step > args.total_steps:
                 save_checkpoint(
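
Note on PATCH 1/2: opt.zero_grad(set_to_none=True) makes PyTorch drop each
parameter's .grad tensor (setting it to None) instead of overwriting it with
zeros, so the per-step zero-fill memory traffic disappears and the next
backward pass writes fresh gradient tensors rather than accumulating into
zeroed buffers. A minimal, self-contained sketch of the observable difference
(the toy model and optimizer here are illustrative, not from train.py):

    import torch

    model = torch.nn.Linear(4, 2)
    opt = torch.optim.SGD(model.parameters(), lr=0.1)

    model(torch.randn(8, 4)).sum().backward()
    opt.step()
    # set_to_none=False keeps the gradient buffers and memsets them to zero
    opt.zero_grad(set_to_none=False)
    print(model.weight.grad)   # tensor of zeros, buffer still allocated

    model(torch.randn(8, 4)).sum().backward()
    opt.step()
    # set_to_none=True drops the buffers entirely; no zero-fill traffic
    opt.zero_grad(set_to_none=True)
    print(model.weight.grad)   # None

One subtlety: any code that reads p.grad between zero_grad() and the next
backward() must now handle None instead of a zero tensor.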
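
Note on PATCH 2/2: the diff removes the periodic save_hf_checkpoint call; per
the subject line, the HuggingFace export is intended to happen only once, in
the final-step branch visible in the hunk context (if global_step >
args.total_steps). A hedged sketch of that gating pattern with stubbed save
functions (the stubs and loop are illustrative, not the real train.py
implementations):

    class Args:                     # stand-in for the parsed args object
        total_steps = 3

    def save_checkpoint(step):      # stub for the periodic checkpoint
        print(f"step {step}: save training-state checkpoint")

    def save_hf_checkpoint(step):   # stub for the HuggingFace export
        print(f"step {step}: save HuggingFace checkpoint")

    args = Args()
    for global_step in range(1, args.total_steps + 2):
        save_checkpoint(global_step)         # still runs at every interval
        if global_step > args.total_steps:   # HF export only past the final step
            save_hf_checkpoint(global_step)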