Commit 5b6e5e9

MetaMathQA: upload checkpoint to bucket after training (#3163)
To do post-training analysis on the checkpoints (e.g., when experimenting with different post-training metrics), it is useful to have the checkpoints available. Since we now have buckets, let's use them. For the Makefile, set the `UPLOAD_BUCKET` environment variable to activate this feature for the whole run.
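The gating this commit adds can be restated from the caller's side. The snippet below is a minimal POSIX-sh sketch that mimics the Makefile's `ifdef UPLOAD_BUCKET` logic; the `run.py` invocation is illustrative, borrowed from the README example, and not a literal transcript of what `make` executes:

```shell
# Mimic the Makefile gating: only pass --bucket_name when UPLOAD_BUCKET is set.
OPTIONAL_FLAGS=""
if [ -n "${UPLOAD_BUCKET:-}" ]; then
    OPTIONAL_FLAGS="--bucket_name ${UPLOAD_BUCKET}"
fi
# With UPLOAD_BUCKET unset this prints the unchanged invocation; with it set,
# every experiment run gains the checkpoint-upload flag.
echo python run.py ${OPTIONAL_FLAGS} -v experiments/lora/llama-3.2-3B-rank32/
```

Exporting `UPLOAD_BUCKET` once before `make` therefore switches the behavior for every experiment in the run, without editing any experiment config.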
1 parent cc89731 commit 5b6e5e9

4 files changed: 31 additions & 3 deletions

method_comparison/MetaMathQA/Makefile

Lines changed: 7 additions & 1 deletion
```diff
@@ -6,6 +6,12 @@ RUN_SCRIPT := run.py
 EXPERIMENTS_DIR := experiments
 RESULTS_DIR := results
 
+OPTIONAL_FLAGS =
+
+ifdef UPLOAD_BUCKET
+OPTIONAL_FLAGS += --bucket_name "${UPLOAD_BUCKET}"
+endif
+
 # --- Automatic Experiment and Result Discovery ---
 
 # 1. Find all experiment directories by looking for adapter_config.json files.
@@ -49,7 +55,7 @@ define EXPERIMENT_template
 $(call exp_to_res,$(1)): $(wildcard $(1)/adapter_config.json) $(wildcard $(1)/training_params.json)
 	@echo "---"
 	@echo "Running experiment: $(1)"
-	$(PYTHON) $(RUN_SCRIPT) -v $(1)
+	$(PYTHON) $(RUN_SCRIPT) $(OPTIONAL_FLAGS) -v $(1)
 	@echo "Finished: $$@"
 	@echo "---"
```

method_comparison/MetaMathQA/README.md

Lines changed: 5 additions & 1 deletion
````diff
@@ -47,6 +47,10 @@ without modifying it. For example:
 
 to run the VBLoRA default experiment again.
 
+If you set `UPLOAD_BUCKET="your_user/bucket_name"` as an environment variable prior to starting experiments
+via `make`, all experiments will be called with the `--bucket_name $UPLOAD_BUCKET` parameter and therefore
+store the checkpoints in that bucket.
+
 ### `adapter_config.json`
 
 This must be a valid PEFT configuration. It is easiest to create it programmatically, e.g.:
@@ -94,7 +98,7 @@ From practical experiments, for a batch size of 4, a bucket size of 80 provides
 
 ### Start a run
 
-Once everything is set up properly, start a run by using the `run.py` script. Pass `-v` for verbose output to the console (recommended if observing the progress is desired). As an example, for `experiments/lora/llama-3.2-3B-rank32/` the invocation would be:
+Once everything is set up properly, start a run by using the `run.py` script. Pass `-v` for verbose output to the console (recommended if observing the progress is desired). To save the resulting experiment checkpoints to a huggingface bucket, you can pass the bucket name via the `--bucket_name` parameter (e.g., `"user/my_bucket_name"`). As an example, for `experiments/lora/llama-3.2-3B-rank32/` the invocation would be:
 
 ```sh
 python run.py -v experiments/lora/llama-3.2-3B-rank32/
````

method_comparison/MetaMathQA/run.py

Lines changed: 7 additions & 1 deletion
```diff
@@ -50,6 +50,7 @@
     get_train_config,
     init_accelerator,
     log_results,
+    upload_checkpoint_to_bucket,
     validate_experiment_path,
 )
 
@@ -405,7 +406,7 @@ def train(
     return train_result
 
 
-def main(*, path_experiment: str, experiment_name: str, clean: bool) -> None:
+def main(*, path_experiment: str, experiment_name: str, clean: bool, bucket_name: Optional[str]) -> None:
     tic_total = time.perf_counter()
     start_date = dt.datetime.now(tz=dt.timezone.utc).replace(microsecond=0).isoformat()
 
@@ -477,6 +478,9 @@ def main(*, path_experiment: str, experiment_name: str, clean: bool) -> None:
         print_fn=print_verbose,
     )
 
+    if bucket_name is not None:
+        upload_checkpoint_to_bucket(model, experiment_name, bucket_name)
+
     time_total = time.perf_counter() - tic_total
     # log results: print and save to file
     log_results(
@@ -503,6 +507,7 @@ def main(*, path_experiment: str, experiment_name: str, clean: bool) -> None:
         action="store_true",
         help="Delete training artifacts after run finishes (logs are still saved)",
     )
+    parser.add_argument("--bucket_name", type=str, help="HF bucket to upload checkpoints to.")
     args = parser.parse_args()
 
     experiment_name = validate_experiment_path(args.path_experiment)
@@ -521,4 +526,5 @@ def print_verbose(*args, **kwargs) -> None:
         path_experiment=args.path_experiment,
         experiment_name=experiment_name,
         clean=args.clean,
+        bucket_name=args.bucket_name,
     )
```

method_comparison/MetaMathQA/utils.py

Lines changed: 12 additions & 0 deletions
```diff
@@ -344,6 +344,18 @@ def __iter__(self):
             yield from self._batch_iterator(bucket)
 
 
+def upload_checkpoint_to_bucket(model: nn.Module, experiment_name: str, bucket_name: str):
+    try:
+        with tempfile.TemporaryDirectory(ignore_cleanup_errors=True, delete=True) as tmp_dir:
+            model.save_pretrained(tmp_dir)
+            huggingface_hub.batch_bucket_files(
+                bucket_name,
+                add=[(os.path.join(tmp_dir, fname), f"{experiment_name}/{fname}") for fname in os.listdir(tmp_dir)],
+            )
+    except Exception as exc:
+        print(f"Failed to upload model checkpoint to hub: {exc}")
+
+
 def get_file_size(
     model: nn.Module, *, peft_config: Optional[PeftConfig], clean: bool, print_fn: Callable[..., None]
 ) -> int:
```
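The interesting part of the new helper is the mapping from files in the temporary checkpoint directory to paths inside the bucket. As a sanity check, that list comprehension can be exercised on its own; `checkpoint_upload_pairs` below is a hypothetical helper extracted for illustration (with a `sorted` added for deterministic output), not part of the commit:

```python
import os
import tempfile


def checkpoint_upload_pairs(tmp_dir, experiment_name):
    # Same (local_path, path_in_bucket) pairing as the `add=` argument above.
    return [
        (os.path.join(tmp_dir, fname), f"{experiment_name}/{fname}")
        for fname in sorted(os.listdir(tmp_dir))
    ]


with tempfile.TemporaryDirectory() as tmp_dir:
    # Stand-ins for the files save_pretrained would write.
    for fname in ("adapter_config.json", "adapter_model.safetensors"):
        open(os.path.join(tmp_dir, fname), "w").close()
    for local, remote in checkpoint_upload_pairs(tmp_dir, "lora/llama-3.2-3B-rank32"):
        print(remote)
# prints:
# lora/llama-3.2-3B-rank32/adapter_config.json
# lora/llama-3.2-3B-rank32/adapter_model.safetensors
```

Prefixing every file with the experiment name means one bucket can hold checkpoints from many experiments without collisions, which matches the Makefile-driven batch runs above.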
