Extract get_heterogeneous_feature_mapping into MultiTaskDataset (meta-pytorch#3285)

Carl Hvarfner · meta-codesync[bot] · commit 9fb1c7a707bc · 2026-04-23T14:48:41.000-07:00
Summary:
Pull Request resolved: meta-pytorch#3285

Extracts the shared feature-ordering and index-mapping logic from `HeterogeneousMTGP.construct_inputs` into `MultiTaskDataset.get_heterogeneous_feature_mapping()`, enabling reuse by `MultiTaskGP.construct_inputs` in the follow-up diff.

Reviewed By: sdaulton

Differential Revision: D101903471

fbshipit-source-id: 5a2c7fd85617d1f0e2eb10c7dd8f1fe0d2765d5a
diff --git a/botorch/models/heterogeneous_mtgp.py b/botorch/models/heterogeneous_mtgp.py
@@ -295,33 +295,21 @@ def construct_inputs(
                 "Heterogeneous MTGP currently only supports output_tasks=[0]. "
                 "The target task will be given the task value of 0."
             )
-        child_datasets = training_data.datasets.copy()
-        target_dataset = child_datasets.pop(training_data.target_outcome_name)
-        all_datasets = [target_dataset] + list(child_datasets.values())
-        # Use target's feature order as canonical (NO alphabetical sort).
-        # Source-only features are appended at the end.
-        all_features: list[str] = list(target_dataset.feature_names[:-1])
-        for ds in all_datasets[1:]:
-            for fn in ds.feature_names[:-1]:
-                if fn not in all_features:
-                    all_features.append(fn)
-        # Get indices mapping the features from a given dataset to all features.
-        feature_indices = [
-            [all_features.index(fn) for fn in ds.feature_names[:-1]]
-            for ds in all_datasets
-        ]
+        all_datasets, feature_indices, full_feature_dim = (
+            training_data.get_heterogeneous_feature_mapping()
+        )
         Xs = [ds.X[..., :-1] for ds in all_datasets]
         Ys = [ds.Y for ds in all_datasets]
         Yvars = (
-            None if target_dataset.Yvar is None else [ds.Yvar for ds in all_datasets]
+            None if all_datasets[0].Yvar is None else [ds.Yvar for ds in all_datasets]
         )
         all_tasks = list(range(len(all_datasets)))
         return {
             "train_Xs": Xs,
             "train_Ys": Ys,
             "train_Yvars": Yvars,
             "feature_indices": feature_indices,
-            "full_feature_dim": len(all_features),
+            "full_feature_dim": full_feature_dim,
             "rank": rank,
             "use_saas_prior": use_saas_prior,
             "use_combinatorial_kernel": use_combinatorial_kernel,
diff --git a/botorch/utils/datasets.py b/botorch/utils/datasets.py
@@ -542,6 +542,46 @@ def __eq__(self, other: Any) -> bool:
             and self.task_feature_index == other.task_feature_index
         )
 
+    def get_heterogeneous_feature_mapping(
+        self,
+    ) -> tuple[list["SupervisedDataset"], list[list[int]], int]:
+        """Compute a canonical feature ordering across heterogeneous datasets.
+
+        The target dataset's features come first, in their original order; features
+        found only in source datasets follow in first-seen order. The last entry
+        of every dataset's ``feature_names`` is skipped as the task column.
+
+        Returns:
+            A 3-tuple of:
+            - The datasets, target first, then sources in ``self.datasets`` order.
+            - One list per dataset giving, for each of its non-task features,
+              that feature's position in the canonical ordering.
+            - The length of the canonical ordering (the full feature dimension).
+
+        Raises:
+            NotImplementedError: If ``task_feature_index`` is not ``-1``.
+        """
+        if self.task_feature_index != -1:
+            raise NotImplementedError(
+                "Heterogeneous feature mapping requires `task_feature_index` to be -1."
+            )
+        child_datasets = self.datasets.copy()  # pop() below must not mutate self
+        target_dataset = child_datasets.pop(self.target_outcome_name)
+        all_datasets = [target_dataset] + list(child_datasets.values())
+
+        # Seed with the target's names, then append unseen source names in order.
+        all_features: list[str] = list(target_dataset.feature_names[:-1])
+        for ds in all_datasets[1:]:
+            for fn in ds.feature_names[:-1]:
+                if fn not in all_features:
+                    all_features.append(fn)
+        # For every dataset, look up where each of its features sits in that order.
+        feature_indices = [
+            [all_features.index(fn) for fn in ds.feature_names[:-1]]
+            for ds in all_datasets
+        ]
+        return all_datasets, feature_indices, len(all_features)
+
     def clone(
         self, deepcopy: bool = False, mask: Tensor | None = None
     ) -> MultiTaskDataset:
diff --git a/test/utils/test_datasets.py b/test/utils/test_datasets.py
@@ -491,6 +491,31 @@ def test_multi_task(self):
             MultiTaskDataset(datasets=[dataset_1, dataset_5], target_outcome_name="z"),
         )
 
+    def test_get_heterogeneous_feature_mapping(self):
+        ds_target = make_dataset(
+            d=3, feature_names=["a", "b", "task"], outcome_names=["y"]
+        )
+        ds_source = make_dataset(
+            d=3, feature_names=["a", "c", "task"], outcome_names=["z"]
+        )
+        mt_err = MultiTaskDataset(
+            datasets=[ds_target, ds_source],
+            target_outcome_name="y",
+            task_feature_index=0,
+        )
+        with self.assertRaises(NotImplementedError):  # only index -1 is supported
+            mt_err.get_heterogeneous_feature_mapping()
+
+        mt = MultiTaskDataset(
+            datasets=[ds_target, ds_source],
+            target_outcome_name="y",
+            task_feature_index=-1,
+        )
+        all_datasets, feature_indices, full_dim = mt.get_heterogeneous_feature_mapping()
+        self.assertEqual(len(all_datasets), 2)  # target plus one source
+        self.assertEqual(full_dim, 3)  # canonical order: a, b, c
+        self.assertEqual(feature_indices, [[0, 1], [0, 2]])  # [a, b] and [a, c]
+
     def test_clone_multitask(self) -> None:
         for has_yvar in [False, True]:
             dataset_1 = make_dataset(outcome_names=["y"], has_yvar=has_yvar)