
Commit bf3a70e

Clean redo of ProductKernel MTGP adjustments
1 parent ebe03af commit bf3a70e

3 files changed: +64 additions, -23 deletions


botorch/sampling/pathwise/features/maps.py

Lines changed: 26 additions & 4 deletions
@@ -122,28 +122,42 @@ def forward(self, x: Tensor, **kwargs: Any) -> Tensor:
         shape = self.raw_output_shape
         ndim = len(shape)
         for feature_map in self:
+            # Collect/scale individual feature blocks
             block = feature_map(x, **kwargs).to_dense()
             block_ndim = len(feature_map.output_shape)
+
+            # Handle broadcasting for lower-dimensional feature maps
             if block_ndim < ndim:
+                # Determine how the tiling/broadcasting works for lower-dimensional feature maps
                 tile_shape = shape[-ndim:-block_ndim]
                 num_copies = prod(tile_shape)
+
+                # Scale down by sqrt of number of copies to maintain proper variance
                 if num_copies > 1:
                     block = block * (num_copies**-0.5)

+                # Create multi-index for broadcasting: add None dimensions for tiling
+                # This expands the block to match the target dimensionality
                 multi_index = (
                     ...,
-                    *repeat(None, ndim - block_ndim),
-                    *repeat(slice(None), block_ndim),
+                    *repeat(None, ndim - block_ndim),  # Add new axes for tiling
+                    *repeat(slice(None), block_ndim),  # Keep existing dimensions
                 )
+                # Apply the multi-index and expand to tile across the new dimensions
                 block = block[multi_index].expand(
                     *block.shape[:-block_ndim], *tile_shape, *block.shape[-block_ndim:]
                 )
             blocks.append(block)

+        # Concatenate all blocks along the last dimension
         return torch.concat(blocks, dim=-1)

     @property
     def raw_output_shape(self) -> Size:
+        # Handle empty DirectSumFeatureMap case - can occur when:
+        # 1. Purposely start with an empty container and plan to append feature maps later, or
+        # 2. Deleted the last entry and the list is now length-zero.
+        # Returning Size([]) keeps the object in a queryable state until real feature maps are added.
         if not self:
             return Size([])
@@ -203,17 +217,25 @@ def forward(self, x: Tensor, **kwargs: Any) -> Tensor:
         for feature_map in self:
             block = feature_map(x, **kwargs)
             block_ndim = len(feature_map.output_shape)
+
+            # Handle blocks that match the target dimensionality
             if block_ndim == ndim:
+                # Convert LinearOperator to dense tensor if needed
                 block = block.to_dense() if isinstance(block, LinearOperator) else block
+                # Ensure block is in sparse format for efficient block diagonal construction
                 block = block if block.is_sparse else block.to_sparse()
             else:
+                # For lower-dimensional blocks, we need to expand dimensions
+                # but keep them dense since sparse tensor broadcasting is limited
                 multi_index = (
                     ...,
-                    *repeat(None, ndim - block_ndim),
-                    *repeat(slice(None), block_ndim),
+                    *repeat(None, ndim - block_ndim),  # Add new axes for expansion
+                    *repeat(slice(None), block_ndim),  # Keep existing dimensions
                 )
                 block = block.to_dense()[multi_index]
             blocks.append(block)
+
+        # Construct sparse block diagonal matrix from all blocks
         return sparse_block_diag(blocks, base_ndim=ndim)

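For intuition on what this second forward pass assembles: sparse_block_diag places each feature map's block on the diagonal of one larger matrix. A dense stand-in using torch.block_diag (illustrative only; the real helper keeps the result sparse and batched):

import torch

# Two illustrative feature blocks for the same 5 inputs, with different widths.
block_a = torch.randn(5, 3)   # 3 features from the first map
block_b = torch.randn(5, 2)   # 2 features from the second map

# Same layout the sparse helper produces: blocks along the diagonal, zeros elsewhere.
dense = torch.block_diag(block_a, block_b)
print(dense.shape)  # torch.Size([10, 5])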

botorch/sampling/pathwise/prior_samplers.py

Lines changed: 19 additions & 9 deletions
@@ -150,11 +150,20 @@ def _draw_kernel_feature_paths_MultiTaskGP(
     )

     # Extract kernels from the product kernel structure
-    # model.covar_module is a ProductKernel
+    # model.covar_module is a ProductKernel by definition for MTGPs
     # containing data_covar_module * task_covar_module
     from gpytorch.kernels import ProductKernel

-    if isinstance(model.covar_module, ProductKernel):
+    if not isinstance(model.covar_module, ProductKernel):
+        # Fallback for non-ProductKernel cases (legacy support)
+        import warnings
+        warnings.warn(
+            f"MultiTaskGP with non-ProductKernel detected ({type(model.covar_module)}). "
+            "Consider using ProductKernel(IndexKernel, SomeOtherKernel) for better compatibility.",
+            UserWarning,
+        )
+        combined_kernel = model.covar_module
+    else:
         # Get the individual kernels from the product kernel
         kernels = model.covar_module.kernels

@@ -169,7 +178,7 @@ def _draw_kernel_feature_paths_MultiTaskGP(
                 else:
                     data_kernel = deepcopy(kernel)
             else:
-                # If no active_dims, it's likely the data kernel
+                # If no active_dims on data kernel, add them so downstream helpers don't error
                 data_kernel = deepcopy(kernel)
                 data_kernel.active_dims = torch.LongTensor(
                     [
@@ -180,7 +189,7 @@ def _draw_kernel_feature_paths_MultiTaskGP(
                     device=data_kernel.device,
                 )

-        # If we couldn't find the task kernel, create it based on the structure
+        # If the task kernel can't be found, create it based on the structure
         if task_kernel is None:
             from gpytorch.kernels import IndexKernel


@@ -190,14 +199,15 @@ def _draw_kernel_feature_paths_MultiTaskGP(
190199
active_dims=[task_index],
191200
).to(device=model.covar_module.device, dtype=model.covar_module.dtype)
192201

193-
# Set task kernel active dims correctly
194-
task_kernel.active_dims = torch.tensor([task_index], device=task_kernel.device)
202+
# Ensure the data kernel was found
203+
if data_kernel is None:
204+
raise ValueError(
205+
f"Could not identify data kernel from ProductKernel. "
206+
"MTGPs should follow the standard ProductKernel(IndexKernel, SomeOtherKernel) pattern."
207+
)
195208

196209
# Use the existing product kernel structure
197210
combined_kernel = data_kernel * task_kernel
198-
else:
199-
# Fallback to using the original covar_module directly
200-
combined_kernel = model.covar_module
201211

202212
return _draw_kernel_feature_paths_fallback(
203213
mean_module=model.mean_module,
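The branch above assumes the standard MTGP covariance layout. A minimal sketch of that structure (the kernel choice and dimensions are illustrative, not taken from the commit):

import torch
from gpytorch.kernels import IndexKernel, MaternKernel, ProductKernel

task_index, num_inputs, num_tasks = 3, 4, 2  # 3 data columns plus one task column

data_kernel = MaternKernel(
    nu=2.5,
    active_dims=torch.tensor([i for i in range(num_inputs) if i != task_index]),
)
task_kernel = IndexKernel(num_tasks=num_tasks, active_dims=[task_index])
covar_module = ProductKernel(data_kernel, task_kernel)

# The helper walks covar_module.kernels, picks out the IndexKernel as the task
# kernel, treats the remaining factor as the data kernel, and recombines them.
print([type(k).__name__ for k in covar_module.kernels])  # ['MaternKernel', 'IndexKernel']
combined_kernel = data_kernel * task_kernel              # the structure the diff rebuilds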

botorch/sampling/pathwise/update_strategies.py

Lines changed: 19 additions & 10 deletions
@@ -174,11 +174,21 @@ def _draw_kernel_feature_paths_MultiTaskGP(
     )

     # Extract kernels from the product kernel structure
-    # model.covar_module is a ProductKernel
+    # model.covar_module is a ProductKernel by definition for MTGPs
     # containing data_covar_module * task_covar_module
     from gpytorch.kernels import ProductKernel

-    if isinstance(model.covar_module, ProductKernel):
+    if not isinstance(model.covar_module, ProductKernel):
+        # Fallback for non-ProductKernel cases (legacy support)
+        # This should be rare as MTGPs typically use ProductKernels by definition
+        import warnings
+        warnings.warn(
+            f"MultiTaskGP with non-ProductKernel detected ({type(model.covar_module)}). "
+            "Consider using ProductKernel(IndexKernel, SomeOtherKernel) for better compatibility.",
+            UserWarning,
+        )
+        combined_kernel = model.covar_module
+    else:
         # Get the individual kernels from the product kernel
         kernels = model.covar_module.kernels

@@ -193,7 +203,7 @@ def _draw_kernel_feature_paths_MultiTaskGP(
                 else:
                     data_kernel = deepcopy(kernel)
             else:
-                # If no active_dims, it's likely the data kernel
+                # If no active_dims on data kernel, add them so downstream helpers don't error
                 data_kernel = deepcopy(kernel)
                 data_kernel.active_dims = torch.LongTensor(
                     [index for index in range(num_inputs) if index != task_index],
@@ -210,16 +220,15 @@ def _draw_kernel_feature_paths_MultiTaskGP(
                 active_dims=[task_index],
             ).to(device=model.covar_module.device, dtype=model.covar_module.dtype)

-        # Set task kernel active dims correctly
-        task_kernel.active_dims = torch.LongTensor(
-            [task_index], device=task_kernel.device
-        )
+        # Ensure data kernel was found
+        if data_kernel is None:
+            raise ValueError(
+                f"Could not identify data kernel from ProductKernel. "
+                "MTGPs should follow the standard ProductKernel(IndexKernel, SomeOtherKernel) pattern."
+            )

         # Use the existing product kernel structure
         combined_kernel = data_kernel * task_kernel
-    else:
-        # Fallback to using the original covar_module directly
-        combined_kernel = model.covar_module

     # Return exact update using product kernel
     return _gaussian_update_exact(
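Since both modules now emit the same UserWarning on the non-ProductKernel fallback path, a small helper can capture it in a test. This is only a sketch of the capture pattern; the helper name and the commented-out usage are hypothetical and not part of the commit:

import warnings

def collect_user_warnings(fn, *args, **kwargs):
    # Run fn and return (result, list of UserWarning messages it raised).
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        result = fn(*args, **kwargs)
    messages = [str(w.message) for w in caught if issubclass(w.category, UserWarning)]
    return result, messages

# Usage idea (names are placeholders): call the relevant pathwise helper with a
# MultiTaskGP whose covar_module is not a ProductKernel, then assert on the message:
#   _, msgs = collect_user_warnings(pathwise_helper, model)
#   assert any("non-ProductKernel" in m for m in msgs)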
