pytorch
diff --git a/torchtitan/experiments/graph_trainer/configs.py b/torchtitan/experiments/graph_trainer/configs.py
Lines changed: 17 additions & 6 deletions
@@ -44,14 +44,17 @@ class GraphTrainerCompileConfig(CompileConfig):
     debug_graph_passes: bool = False
     """Log timing, op-count diffs, and before/after graphs for each pass to tlparse."""
 
-    memory_policy: Literal["default", "eager", "cpu_offload_all"] = "default"
+    memory_policy: Literal[
+        "default", "eager", "default_offload", "cpu_offload_all"
+    ] = "default"
     """
     Memory optimization policy for activation management (SAC, offload).
-        default: save all compute-intensive ops and FSDP all_gathers.
-        eager: alternate mm ops between save/recompute, matching the eager
-            AC policy in torchtitan.distributed.activation_checkpoint.
-        cpu_offload_all: offload all eligible activations to CPU.
-            Work in progress — for development and testing only.
+        default: SAC — save all compute-intensive ops and FSDP all_gathers.
+        eager: SAC alternating mm ops between save/recompute, matching the
+            eager AC policy in torchtitan.distributed.activation_checkpoint.
+        default_offload: SAC + CPU offload — apply default SAC first, then
+            offload surviving MUST_SAVE activations to CPU.
+        cpu_offload_all: offload all eligible activations to CPU (no SAC).
     """
 
     inductor_compilation: Literal["regional", "full"] = "regional"
@@ -67,6 +70,14 @@ class GraphTrainerCompileConfig(CompileConfig):
     enable_cudagraph: bool = True
     """When False, skip the cudagraph pass even if the graph is compatible."""
 
+    cpu_offload_prefetch_n_layers: int = 1
+    """Prefetch reloads this many layers ahead in the backward graph
+    to overlap H2D transfers with compute."""
+
+    cpu_offload_budget_gb: float = 100.0
+    """Maximum CPU memory budget (in GB per rank) for offloaded activations.
+    Tensors are selected largest-first until the budget is exhausted."""
+
     precompile_artifact_dir: str = ""
     """
     Directory for precompiled artifacts. Setting this enables precompile: