Skip to content

Commit 665a906

Browse files
committed
[cpu-offloading] Implement prefetching via env var configs
ghstack-source-id: b064bf3
Pull Request resolved: #3166
1 parent 533795e commit 665a906

5 files changed

Lines changed: 546 additions & 69 deletions

File tree

torchtitan/experiments/graph_trainer/configs.py

Lines changed: 17 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -44,14 +44,17 @@ class GraphTrainerCompileConfig(CompileConfig):
4444
debug_graph_passes: bool = False
4545
"""Log timing, op-count diffs, and before/after graphs for each pass to tlparse."""
4646

47-
memory_policy: Literal["default", "eager", "cpu_offload_all"] = "default"
47+
memory_policy: Literal[
48+
"default", "eager", "default_offload", "cpu_offload_all"
49+
] = "default"
4850
"""
4951
Memory optimization policy for activation management (SAC, offload).
50-
default: save all compute-intensive ops and FSDP all_gathers.
51-
eager: alternate mm ops between save/recompute, matching the eager
52-
AC policy in torchtitan.distributed.activation_checkpoint.
53-
cpu_offload_all: offload all eligible activations to CPU.
54-
Work in progress — for development and testing only.
52+
default: SAC — save all compute-intensive ops and FSDP all_gathers.
53+
eager: SAC alternating mm ops between save/recompute, matching the
54+
eager AC policy in torchtitan.distributed.activation_checkpoint.
55+
default_offload: SAC + CPU offload — apply default SAC first, then
56+
offload surviving MUST_SAVE activations to CPU.
57+
cpu_offload_all: offload all eligible activations to CPU (no SAC).
5558
"""
5659

5760
inductor_compilation: Literal["regional", "full"] = "regional"
@@ -67,6 +70,14 @@ class GraphTrainerCompileConfig(CompileConfig):
6770
enable_cudagraph: bool = True
6871
"""When False, skip the cudagraph pass even if the graph is compatible."""
6972

73+
cpu_offload_prefetch_n_layers: int = 1
74+
"""Prefetch reloads this many layers ahead in the backward graph
75+
to overlap H2D transfers with compute."""
76+
77+
cpu_offload_budget_gb: float = 100.0
78+
"""Maximum CPU memory budget (in GB per rank) for offloaded activations.
79+
Tensors are selected largest-first until the budget is exhausted."""
80+
7081
precompile_artifact_dir: str = ""
7182
"""
7283
Directory for precompiled artifacts. Setting this enables precompile:

0 commit comments

Comments
 (0)