@@ -118,18 +118,45 @@ def __init__(
         self.output_transform = output_transform
 
     def forward(self, x: Tensor, **kwargs: Any) -> Tensor:
-        blocks = []
-        shape = self.raw_output_shape
-        ndim = len(shape)
+        # Collect the (possibly broadcast) feature tensors in ``blocks``.
+        # ``self.raw_output_shape`` encodes the *final* shape of the concatenated
+        # feature map. For each individual ``feature_map`` we therefore need to
+        # (1) obtain its dense representation, (2) broadcast it so that its
+        # trailing dimensions match ``self.raw_output_shape``, and (3) rescale it
+        # if we replicate the same tensor multiple times (to keep ‖ϕ‖ roughly
+        # invariant).
+
+        blocks: list[Tensor] = []
+
+        shape = self.raw_output_shape  # target output shape
+        ndim = len(shape)  # number of feature dimensions, incl. batch
+
         for feature_map in self:
+            # 1. Evaluate (dense) features for the current sub-map.
             block = feature_map(x, **kwargs).to_dense()
+
             block_ndim = len(feature_map.output_shape)
+
+            # 2. If this map has fewer *feature* dimensions than the direct sum
+            #    (e.g. a vector-valued sub-map in a matrix-valued direct sum), we
+            #    have to *tile* it along the missing leading feature dimensions so
+            #    that shapes line up for concatenation.
             if block_ndim < ndim:
+                # ``tile_shape`` tells us how many copies we need along every
+                # missing feature dimension (can be > 1, e.g. for Kronecker sums).
                 tile_shape = shape[-ndim:-block_ndim]
+
+                # Rescale by 1/√k when we replicate the same block *k* times to
+                # avoid artificially inflating its norm (motivated by the fact
+                # that direct sums of orthogonal features preserve inner products
+                # only up to such a scaling).
                 num_copies = prod(tile_shape)
                 if num_copies > 1:
-                    block = block * (num_copies ** -0.5)
+                    block = block * (num_copies ** -0.5)
 
+                # ``multi_index`` inserts ``None`` entries (i.e. new singleton axes
+                # in slicing syntax) so that broadcasting expands the tensor along
+                # the new axes without additional memory allocations.
                 multi_index = (
                     ...,
                     *repeat(None, ndim - block_ndim),
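The `num_copies ** -0.5` rescaling above can be sanity-checked in isolation. A minimal standalone sketch (not part of this patch; the feature vector and copy count are made up) showing that naive tiling inflates the norm by √k while the rescaled copies keep it unchanged:

```python
import torch

phi = torch.randn(16)                     # one feature block, E[||phi||] ≈ 4
k = 3                                     # number of replicated copies

tiled = torch.cat([phi] * k)              # naive tiling: norm grows by sqrt(k)
scaled = torch.cat([phi * k**-0.5] * k)   # rescaled tiling: norm is preserved

print(torch.linalg.norm(phi))             # e.g. ~4.0
print(torch.linalg.norm(tiled))           # ~sqrt(3) * 4.0
print(torch.linalg.norm(scaled))          # ~4.0 again
```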
@@ -138,12 +165,17 @@ def forward(self, x: Tensor, **kwargs: Any) -> Tensor:
                 block = block[multi_index].expand(
                     *block.shape[:-block_ndim], *tile_shape, *block.shape[-block_ndim:]
                 )
+
+            # 3. Append the (now correctly shaped) block to be concatenated later.
             blocks.append(block)
 
+        # Concatenate along the *last* axis (the feature dimension).
         return torch.concat(blocks, dim=-1)
 
     @property
     def raw_output_shape(self) -> Size:
+        # If the container is empty (e.g. ``DirectSumFeatureMap([])``), treat the
+        # output as 0-dimensional until feature maps are added.
         if not self:
             return Size([])
 
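The `[..., None, :]`-style indexing plus `.expand(...)` in the hunk above only creates broadcast views, so the tiled copies share storage with the original block. A toy illustration with assumed shapes (a batch of 4, a 16-dimensional sub-map, one missing feature dimension tiled three times), not taken from the library itself:

```python
import torch

block = torch.randn(4, 16)                    # (batch, features) from one sub-map
tile_shape = (3,)                             # one missing leading feature dim

view = block[..., None, :]                    # (4, 1, 16): None inserts a singleton axis
expanded = view.expand(4, *tile_shape, 16)    # (4, 3, 16): still a view, no data copied

assert expanded.data_ptr() == block.data_ptr()  # same underlying storage
```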
@@ -204,13 +236,23 @@ def forward(self, x: Tensor, **kwargs: Any) -> Tensor:
             block = feature_map(x, **kwargs)
             block_ndim = len(feature_map.output_shape)
             if block_ndim == ndim:
+                # Case 1: this sub-map already has the *max* feature rank we're
+                # going to emit. We simply make sure it is stored sparsely:
+                #   – Convert ``LinearOperator`` → dense so that ``.to_sparse()``
+                #     is available.
+                #   – If it is still dense, call ``.to_sparse()``; otherwise keep
+                #     the sparse representation it already has.
                 block = block.to_dense() if isinstance(block, LinearOperator) else block
                 block = block if block.is_sparse else block.to_sparse()
             else:
+                # Case 2: a lower-rank feature map. Bring it up to ``ndim`` by
+                # slicing with ``None`` (adds singleton axes) so that broadcasting
+                # can later expand it. We stay dense here because we'll stuff the
+                # result into a block-diagonal sparse matrix at the very end.
                 multi_index = (
                     ...,
-                    *repeat(None, ndim - block_ndim),
-                    *repeat(slice(None), block_ndim),
+                    *repeat(None, ndim - block_ndim),  # adds missing dims
+                    *repeat(slice(None), block_ndim),  # keeps existing dims
                 )
                 block = block.to_dense()[multi_index]
             blocks.append(block)
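For reference, a rough sketch of the case-1 conversion path above. It assumes the `LinearOperator` referenced in the patch comes from the `linear_operator` package and uses a `DiagLinearOperator` as a stand-in for whatever operator type a sub-map actually returns:

```python
import torch
from linear_operator.operators import DiagLinearOperator, LinearOperator

block = DiagLinearOperator(torch.randn(5))     # operator-valued sub-map output
block = block.to_dense() if isinstance(block, LinearOperator) else block
block = block if block.is_sparse else block.to_sparse()   # now a sparse COO tensor

print(block.layout)                            # torch.sparse_coo
```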