Commit 58e50e0
[Misc] Rename offload_sequential_activations to sequential_offload_device (#2134)
## Purpose ##

* Enable users to offload activations to another GPU
* Because GPU-to-GPU transfer is much faster than GPU-to-CPU transfer, there should theoretically be runtime improvements from this option

## Changes ##

* Rename `offload_sequential_activations` -> `sequential_offload_device`

## TODO ##

* Demonstrate in a test that using `cuda:1` leads to runtime improvements

---------

Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>

1 parent ef45976

File tree

2 files changed: +6 additions, -7 deletions

src/llmcompressor/args/dataset_arguments.py

Lines changed: 5 additions & 5 deletions

```diff
@@ -230,12 +230,12 @@ class DatasetArguments(CustomDatasetArguments):
             "definition"
         },
     )
-    offload_sequential_activations: bool = field(
-        default=True,
+    sequential_offload_device: str = field(
+        default="cpu",
         metadata={
-            "help": "Whether to offload intermediate activations between sequential "
-            "layers to the CPU. Disabling offloading is much faster, but uses "
-            "signficiantly more memory. Default is True."
+            "help": "Device used to offload intermediate activations between "
+            "sequential layers. It is recommended to use `cuda:1` if using more "
+            "than one gpu. Default is cpu."
         },
     )
     quantization_aware_calibration: bool = field(
```
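The shape of the renamed field can be sketched in isolation. The following is a minimal, self-contained approximation using only the standard library — `SequentialArgsSketch` is a hypothetical stand-in, not the real `DatasetArguments` class — showing how a string-valued device option replaces the old boolean flag while keeping CPU offload as the default:

```python
from dataclasses import dataclass, field


# Hypothetical sketch (not the real llmcompressor class): the old boolean
# flag `offload_sequential_activations` becomes a string device spec.
@dataclass
class SequentialArgsSketch:
    # Any device string ("cpu", "cuda:1", ...) is accepted; the default
    # preserves the previous behavior of offloading to the CPU.
    sequential_offload_device: str = field(
        default="cpu",
        metadata={
            "help": "Device used to offload intermediate activations between "
            "sequential layers."
        },
    )


default_args = SequentialArgsSketch()
gpu_args = SequentialArgsSketch(sequential_offload_device="cuda:1")
print(default_args.sequential_offload_device)  # cpu
print(gpu_args.sequential_offload_device)      # cuda:1
```

A string argument also leaves room for future device targets without another rename, which a boolean could not express.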

src/llmcompressor/pipelines/sequential/pipeline.py

Lines changed: 1 addition & 2 deletions

```diff
@@ -89,8 +89,7 @@ def __call__(
             stack.enter_context(DisableQuantization(model))

             # prepare intermediates cache
-            cache_offload = dataset_args.offload_sequential_activations
-            offload_device = torch.device("cpu") if cache_offload else None
+            offload_device = torch.device(dataset_args.sequential_offload_device)
             activations = IntermediatesCache.from_dataloader(
                 dataloader, model_device, offload_device=offload_device
             )
```
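The pipeline-side change collapses the old bool-to-device mapping into a direct conversion of the user-supplied string. A hedged sketch of the before/after logic, with plain strings standing in for `torch.device` objects so it runs without torch:

```python
from typing import Optional


# Old behavior (sketch): a boolean flag chose between CPU offload and
# no offload at all (None).
def old_offload_device(offload_sequential_activations: bool) -> Optional[str]:
    return "cpu" if offload_sequential_activations else None


# New behavior (sketch): the user-supplied device string is used directly,
# so "cuda:1" enables fast GPU-to-GPU offload instead of GPU-to-CPU.
# In the real pipeline this is torch.device(sequential_offload_device).
def new_offload_device(sequential_offload_device: str) -> str:
    return sequential_offload_device


print(old_offload_device(True))      # cpu
print(new_offload_device("cuda:1"))  # cuda:1
```

One observable difference in the diff: the old flag could disable offloading entirely (`False` -> `None`), whereas the new string argument as shown always constructs a device.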
