Skip to content

Commit 22911ea

Browse files
authored
Merged PR 1685054: Add more logs and a wait_futures function for easier post-analysis (#1438)
- Add a wait_futures function for easier post-analysis - Use logger instead of print ---- #### AI description (iteration 1) #### PR Classification A code enhancement for debugging asynchronous mlflow logging and improving post-run analysis. #### PR Summary This PR adds detailed debug logging to the mlflow integration and introduces a new `wait_futures` function to streamline the collection of asynchronous task results for improved analysis. - `flaml/fabric/mlflow.py`: Added debug log statements around starting and ending mlflow runs to trace run IDs and execution flow. - `flaml/automl/automl.py`: Implemented the `wait_futures` function to handle asynchronous task results and replaced a print call with `logger.info` for consistent logging. <!-- GitOpsUserAgent=GitOps.Apps.Server.pullrequestcopilot --> Related work items: #4029592
1 parent 12183e5 commit 22911ea

2 files changed

Lines changed: 20 additions & 1 deletion

File tree

flaml/automl/automl.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1732,7 +1732,7 @@ def cv_score_agg_func(val_loss_folds, log_metrics_folds):
17321732
if not (mlflow.active_run() is not None or is_autolog_enabled()):
17331733
self.mlflow_integration.only_history = True
17341734
except KeyError:
1735-
print("Not in Fabric, Skipped")
1735+
logger.info("Not in Fabric, Skipped")
17361736
task.validate_data(
17371737
self,
17381738
self._state,
@@ -2756,6 +2756,9 @@ def _search(self):
27562756
)
27572757
else:
27582758
logger.warning("not retraining because the time budget is too small.")
2759+
self.wait_futures()
2760+
2761+
def wait_futures(self):
27592762
if self.mlflow_integration is not None:
27602763
logger.debug("Collecting results from submitted record_state tasks")
27612764
t1 = time.perf_counter()
@@ -2775,6 +2778,8 @@ def _search(self):
27752778
logger.warning(f"Exception for log_model task {_task}: {e}")
27762779
t2 = time.perf_counter()
27772780
logger.debug(f"Collecting results from tasks submitted to executors costs {t2-t1} seconds.")
2781+
else:
2782+
logger.debug("No futures to wait for.")
27782783

27792784
def __del__(self):
27802785
if (

flaml/fabric/mlflow.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -516,14 +516,19 @@ def log_model(self, model, estimator, signature=None, run_id=None):
516516
)
517517
run = mlflow.active_run()
518518
if run and run.info.run_id == self.parent_run_id:
519+
logger.debug(
520+
f"Current active run_id {run.info.run_id} == parent_run_id {self.parent_run_id}, Starting run_id {run_id}"
521+
)
519522
mlflow.start_run(run_id=run_id, nested=True)
520523
elif run and run.info.run_id != run_id:
521524
ret_message = (
522525
f"Error: Should log_model {estimator} to run_id {run_id}, but logged to run_id {run.info.run_id}"
523526
)
524527
logger.error(ret_message)
525528
else:
529+
logger.debug(f"No active run, start run_id {run_id}")
526530
mlflow.start_run(run_id=run_id)
531+
logger.debug(f"logged model {estimator} to run_id {mlflow.active_run().info.run_id}")
527532
if estimator.endswith("_spark"):
528533
# mlflow.spark.log_model(model, estimator, signature=signature)
529534
mlflow.spark.log_model(model, "model", signature=signature)
@@ -550,6 +555,7 @@ def log_model(self, model, estimator, signature=None, run_id=None):
550555
)
551556
self.futures[future] = f"run_{run_id}_requirements_updated"
552557
if not run or run.info.run_id == self.parent_run_id:
558+
logger.debug(f"Ending current run_id {mlflow.active_run().info.run_id}")
553559
mlflow.end_run()
554560
return ret_message
555561

@@ -575,12 +581,19 @@ def _log_pipeline(self, pipeline, flavor_name, pipeline_name, signature, run_id,
575581
)
576582
run = mlflow.active_run()
577583
if run and run.info.run_id == self.parent_run_id:
584+
logger.debug(
585+
f"Current active run_id {run.info.run_id} == parent_run_id {self.parent_run_id}, Starting run_id {run_id}"
586+
)
578587
mlflow.start_run(run_id=run_id, nested=True)
579588
elif run and run.info.run_id != run_id:
580589
ret_message = f"Error: Should _log_pipeline {flavor_name}:{pipeline_name}:{estimator} model to run_id {run_id}, but logged to run_id {run.info.run_id}"
581590
logger.error(ret_message)
582591
else:
592+
logger.debug(f"No active run, start run_id {run_id}")
583593
mlflow.start_run(run_id=run_id)
594+
logger.debug(
595+
f"logging pipeline {flavor_name}:{pipeline_name}:{estimator} to run_id {mlflow.active_run().info.run_id}"
596+
)
584597
if flavor_name == "sklearn":
585598
mlflow.sklearn.log_model(pipeline, pipeline_name, signature=signature)
586599
elif flavor_name == "spark":
@@ -596,6 +609,7 @@ def _log_pipeline(self, pipeline, flavor_name, pipeline_name, signature, run_id,
596609
)
597610
self.futures[future] = f"run_{run_id}_requirements_updated"
598611
if not run or run.info.run_id == self.parent_run_id:
612+
logger.debug(f"Ending current run_id {mlflow.active_run().info.run_id}")
599613
mlflow.end_run()
600614
return ret_message
601615

0 commit comments

Comments
 (0)