@@ -44,14 +44,17 @@ class GraphTrainerCompileConfig(CompileConfig):
4444 debug_graph_passes: bool = False
4545 """Log timing, op-count diffs, and before/after graphs for each pass to tlparse."""
4646
47- memory_policy: Literal["default", "eager", "cpu_offload_all"] = "default"
47+ memory_policy: Literal[
48+     "default", "eager", "default_offload", "cpu_offload_all"
49+ ] = "default"
4850 """
4951 Memory optimization policy for activation management (SAC, offload).
50- default: save all compute-intensive ops and FSDP all_gathers.
51- eager: alternate mm ops between save/recompute, matching the eager
52- AC policy in torchtitan.distributed.activation_checkpoint.
53- cpu_offload_all: offload all eligible activations to CPU.
54- Work in progress — for development and testing only.
52+ default: SAC — save all compute-intensive ops and FSDP all_gathers.
53+ eager: SAC alternating mm ops between save/recompute, matching the
54+ eager AC policy in torchtitan.distributed.activation_checkpoint.
55+ default_offload: SAC + CPU offload — apply default SAC first, then
56+ offload surviving MUST_SAVE activations to CPU.
57+ cpu_offload_all: offload all eligible activations to CPU (no SAC).
5558 """
5659
5760 inductor_compilation: Literal["regional", "full"] = "regional"
@@ -67,6 +70,14 @@ class GraphTrainerCompileConfig(CompileConfig):
6770 enable_cudagraph: bool = True
6871 """When False, skip the cudagraph pass even if the graph is compatible."""
6972
73+ cpu_offload_prefetch_n_layers: int = 1
74+ """Prefetch reloads this many layers ahead in the backward graph
75+ to overlap H2D transfers with compute."""
76+
77+ cpu_offload_budget_gb: float = 100.0
78+ """Maximum CPU memory budget (in GB per rank) for offloaded activations.
79+ Tensors are selected largest-first until the budget is exhausted."""
80+
7081 precompile_artifact_dir: str = ""
7182 """
7283 Directory for precompiled artifacts. Setting this enables precompile:
0 commit comments