
Commit c2ed4d6

Author: Sahran Ashoor (committed)

Updated optim + model files with respect to upstream

1 parent 04ae7c4 · commit c2ed4d6

File tree

4 files changed (+190 additions, -59 deletions)


botorch/models/fully_bayesian_multitask.py

Lines changed: 39 additions & 26 deletions

@@ -19,12 +19,14 @@
     reshape_and_detach,
     SaasPyroModel,
 )
+from botorch.models.gpytorch import BatchedMultiOutputGPyTorchModel
 from botorch.models.multitask import MultiTaskGP
 from botorch.models.transforms.input import InputTransform
 from botorch.models.transforms.outcome import OutcomeTransform
 from botorch.posteriors.fully_bayesian import GaussianMixturePosterior
 from gpytorch.distributions import MultivariateNormal
-from gpytorch.kernels import IndexKernel, MaternKernel
+from gpytorch.kernels import MaternKernel
+from gpytorch.kernels.index_kernel import IndexKernel
 from gpytorch.kernels.kernel import Kernel
 from gpytorch.likelihoods.likelihood import Likelihood
 from gpytorch.means.mean import Mean
@@ -137,7 +139,7 @@ def sample_task_lengthscale(

     def load_mcmc_samples(
         self, mcmc_samples: dict[str, Tensor]
-    ) -> tuple[Mean, Kernel, Likelihood, Kernel, Parameter]:
+    ) -> tuple[Mean, Kernel, Likelihood, Kernel]:
         r"""Load the MCMC samples into the mean_module, covar_module, and likelihood."""
         tkwargs = {"device": self.train_X.device, "dtype": self.train_X.dtype}
         num_mcmc_samples = len(mcmc_samples["mean"])
@@ -406,30 +408,7 @@ def posterior(

     def forward(self, X: Tensor) -> MultivariateNormal:
         self._check_if_fitted()
-        x_basic, task_idcs = self._split_inputs(X)
-
-        mean_x = self.mean_module(x_basic)
-        covar_x = self.covar_module(x_basic)
-
-        tsub_idcs = task_idcs.squeeze(-1)
-        if tsub_idcs.ndim > 1:
-            tsub_idcs = tsub_idcs.squeeze(-2)
-        latent_features = self.latent_features[:, tsub_idcs, :]
-
-        if X.ndim > 3:
-            # batch eval mode
-            # for X (batch_shape x num_samples x q x d), task_idcs[:,i,:,] are the same
-            # reshape X to (batch_shape x num_samples x q x d)
-            latent_features = latent_features.permute(
-                [-i for i in range(X.ndim - 1, 2, -1)]
-                + [0]
-                + [-i for i in range(2, 0, -1)]
-            )
-
-        # Combine the two in an ICM fashion
-        covar_i = self.task_covar_module(latent_features)
-        covar = covar_x.mul(covar_i)
-        return MultivariateNormal(mean_x, covar)
+        return super().forward(X)

     def load_state_dict(self, state_dict: Mapping[str, Any], strict: bool = True):
         r"""Custom logic for loading the state dict.
@@ -474,3 +453,37 @@ def load_state_dict(self, state_dict: Mapping[str, Any], strict: bool = True):
         ) = self.pyro_model.load_mcmc_samples(mcmc_samples=mcmc_samples)
         # Load the actual samples from the state dict
         super().load_state_dict(state_dict=state_dict, strict=strict)
+
+    def condition_on_observations(
+        self, X: Tensor, Y: Tensor, **kwargs: Any
+    ) -> BatchedMultiOutputGPyTorchModel:
+        """Conditions on additional observations for a Fully Bayesian model (either
+        identical across models or unique per-model).
+
+        Args:
+            X: A `batch_shape x num_samples x d`-dim Tensor, where `d` is
+                the dimension of the feature space and `batch_shape` is the number of
+                sampled models.
+            Y: A `batch_shape x num_samples x 1`-dim Tensor of training observations,
+                where `batch_shape` is the number of sampled models.
+
+        Returns:
+            BatchedMultiOutputGPyTorchModel: A fully bayesian model conditioned on
+                given observations. The returned model has `batch_shape` copies of the
+                training data in case of identical observations (and `batch_shape`
+                training datasets otherwise).
+        """
+        if X.ndim == 2 and Y.ndim == 2:
+            # To avoid an error in GPyTorch when inferring the batch dimension, we add
+            # the explicit batch shape here. The result is that the conditioned model
+            # will have 'batch_shape' copies of the training data.
+            X = X.repeat(self.batch_shape + (1, 1))
+            Y = Y.repeat(self.batch_shape + (1, 1))
+
+        elif X.ndim < Y.ndim:
+            # We need to duplicate the training data to enable correct batch
+            # size inference in gpytorch.
+            X = X.repeat(*(Y.shape[:-2] + (1, 1)))
+
+        return super().condition_on_observations(X, Y, **kwargs)
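
For context, the following is a minimal usage sketch of the new `condition_on_observations` override; it is not part of the commit. It assumes the existing `SaasFullyBayesianMultiTaskGP` constructor and the `fit_fully_bayesian_model_nuts` fitting utility, with arbitrary toy MCMC settings; the posterior call before conditioning mirrors the test added below.

import torch
from botorch.fit import fit_fully_bayesian_model_nuts
from botorch.models.fully_bayesian_multitask import SaasFullyBayesianMultiTaskGP

# Toy data: 10 points with 4 features plus a trailing task-index column.
train_X = torch.rand(10, 4, dtype=torch.double)
task_idx = torch.cat([torch.zeros(5, 1), torch.ones(5, 1)]).to(train_X)
train_X = torch.cat([train_X, task_idx], dim=-1)
train_Y = torch.sin(train_X[:, :1])

model = SaasFullyBayesianMultiTaskGP(train_X, train_Y, task_feature=4)
fit_fully_bayesian_model_nuts(
    model, warmup_steps=32, num_samples=16, thinning=4, disable_progbar=True
)

# A posterior (forward) pass is needed before conditioning, as in the test below.
model.posterior(train_X)

# Un-batched conditioning data (2 points on task 0): per the docstring above,
# these are replicated across the model's MCMC batch dimension.
cond_X = torch.cat(
    [torch.rand(2, 4, dtype=torch.double), torch.zeros(2, 1, dtype=torch.double)],
    dim=-1,
)
cond_Y = torch.sin(cond_X[:, :1])
cond_model = model.condition_on_observations(cond_X, cond_Y)

# Training inputs now carry the batch of sampled models:
# (num_mcmc_samples, 10 + 2, 5).
print(cond_model.train_inputs[0].shape)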

botorch/optim/optimize_mixed.py

Lines changed: 16 additions & 15 deletions

@@ -8,8 +8,7 @@
 import itertools
 import random
 import warnings
-from collections.abc import Sequence
-from typing import Any, Callable
+from typing import Any, Callable, Sequence

 import torch
 from botorch.acquisition import AcquisitionFunction
@@ -574,6 +573,7 @@ def generate_starting_points(
         X_baseline=X_baseline,
         cont_dims=cont_dims,
         discrete_dims=discrete_dims,
+        cat_dims=cat_dims,
         bounds=bounds,
         num_spray_points=num_spray_points,
         std_cont_perturbation=assert_is_instance(
@@ -598,6 +598,7 @@ def generate_starting_points(
         new_x_init = sample_feasible_points(
             opt_inputs=opt_inputs,
             discrete_dims=discrete_dims,
+            cat_dims=cat_dims,
             num_points=num_restarts - len(x_init_candts),
         )
         x_init_candts = torch.cat([x_init_candts, new_x_init], dim=0)
@@ -817,19 +818,19 @@ def optimize_acqf_mixed_alternating(
     inequality_constraints: list[tuple[Tensor, Tensor, float]] | None = None,
 ) -> tuple[Tensor, Tensor]:
     r"""
-    Optimizes acquisition function over mixed binary and continuous input spaces.
-    Multiple random restarting starting points are picked by evaluating a large set
-    of initial candidates. From each starting point, alternating discrete local search
-    and continuous optimization via (L-BFGS) is performed for a fixed number of
-    iterations.
+    Optimizes acquisition function over mixed integer, categorical, and continuous
+    input spaces. Multiple random restarting starting points are picked by evaluating
+    a large set of initial candidates. From each starting point, alternating
+    discrete/categorical local search and continuous optimization via (L-BFGS)
+    is performed for a fixed number of iterations.

     NOTE: This method assumes that all categorical variables are
     integer valued.
     The discrete dimensions that have more than
     `options.get("max_discrete_values", MAX_DISCRETE_VALUES)` values will
     be optimized using continuous relaxation.
-
-    # TODO: Support categorical variables.
+    The categorical dimensions that have more than `MAX_DISCRETE_VALUES` values
+    will be optimized by selecting random subsamples of the possible values.

     Args:
         acq_function: BoTorch Acquisition function.
@@ -982,14 +983,14 @@ def optimize_acqf_mixed_alternating(
             )
         )
     if not (
-        isinstance(discrete_dims, list)
-        and len(set(discrete_dims)) == len(discrete_dims)
-        and min(discrete_dims) >= 0
-        and max(discrete_dims) <= dim - 1
+        isinstance(non_cont_dims, list)
+        and len(set(non_cont_dims)) == len(non_cont_dims)
+        and min(non_cont_dims) >= 0
+        and max(non_cont_dims) <= dim - 1
     ):
         raise ValueError(
-            "`discrete_dims` must be a list with unique integers "
-            "between 0 and num_dims - 1."
+            "`discrete_dims` and `cat_dims` must be lists with unique, disjoint "
+            "integers between 0 and num_dims - 1."
         )
     discrete_dims_t = torch.tensor(
         list(discrete_dims.keys()), dtype=torch.long, device=tkwargs["device"]
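
As a usage illustration (not part of the commit), here is a minimal sketch of calling `optimize_acqf_mixed_alternating` with both `discrete_dims` and the `cat_dims` argument that this commit threads through `generate_starting_points` and `sample_feasible_points`. It assumes the list-of-dimension-indices form of those arguments and a toy, unnormalized `SingleTaskGP` surrogate; adapt to the installed BoTorch version as needed.

import torch
from botorch.acquisition import ExpectedImprovement
from botorch.fit import fit_gpytorch_mll
from botorch.models import SingleTaskGP
from botorch.optim.optimize_mixed import optimize_acqf_mixed_alternating
from gpytorch.mlls import ExactMarginalLogLikelihood

# Toy 4-d problem: dims 0-1 continuous, dim 2 integer-valued (0..3),
# dim 3 categorical and integer-encoded (0..2), per the NOTE in the docstring.
train_X = torch.rand(20, 4, dtype=torch.double)
train_X[:, 2] = (train_X[:, 2] * 3).round()
train_X[:, 3] = (train_X[:, 3] * 2).round()
train_Y = torch.sin(train_X.sum(dim=-1, keepdim=True))

model = SingleTaskGP(train_X, train_Y)
fit_gpytorch_mll(ExactMarginalLogLikelihood(model.likelihood, model))
acqf = ExpectedImprovement(model=model, best_f=train_Y.max())

bounds = torch.tensor(
    [[0.0, 0.0, 0.0, 0.0], [1.0, 1.0, 3.0, 2.0]], dtype=torch.double
)
candidate, value = optimize_acqf_mixed_alternating(
    acq_function=acqf,
    bounds=bounds,
    discrete_dims=[2],  # integer-valued dimension, optimized by local search
    cat_dims=[3],       # categorical dimension (integer-encoded levels)
    q=1,
    num_restarts=4,
    raw_samples=32,
)
print(candidate, value)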

test/models/test_fully_bayesian_multitask.py

Lines changed: 125 additions & 15 deletions

@@ -56,7 +56,6 @@
 from gpytorch.means import ConstantMean

 EXPECTED_KEYS = [
-    "latent_features",
     "mean_module.raw_constant",
     "covar_module.kernels.1.raw_var",
     "covar_module.kernels.1.active_dims",
@@ -112,7 +111,7 @@ def _get_data_and_model(
         )
         return train_X, train_Y, train_Yvar, model

-    def _get_unnormalized_data(self, **tkwargs):
+    def _get_unnormalized_data(self, infer_noise: bool = False, **tkwargs):
         with torch.random.fork_rng():
             torch.manual_seed(0)
             train_X = torch.rand(10, 4, **tkwargs)
@@ -122,9 +121,28 @@ def _get_unnormalized_data(self, **tkwargs):
             )
             train_X = torch.cat([5 + 5 * train_X, task_indices], dim=1)
             test_X = 5 + 5 * torch.rand(5, 4, **tkwargs)
-            train_Yvar = 0.1 * torch.arange(10, **tkwargs).unsqueeze(-1)
+            if infer_noise:
+                train_Yvar = None
+            else:
+                train_Yvar = 0.1 * torch.arange(10, **tkwargs).unsqueeze(-1)
         return train_X, train_Y, train_Yvar, test_X

+    def _get_unnormalized_condition_data(
+        self, num_models: int, num_cond: int, dim: int, infer_noise: bool, **tkwargs
+    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor | None]:
+        with torch.random.fork_rng():
+            torch.manual_seed(0)
+            cond_X = 5 + 5 * torch.rand(num_models, num_cond, dim, **tkwargs)
+            cond_Y = 10 + torch.sin(cond_X[..., :1])
+            cond_Yvar = (
+                None if infer_noise else 0.1 * torch.ones(cond_Y.shape, **tkwargs)
+            )
+            # adding the task dimension
+            cond_X = torch.cat(
+                [cond_X, torch.zeros(num_models, num_cond, 1, **tkwargs)], dim=-1
+            )
+        return cond_X, cond_Y, cond_Yvar
+
     def _get_mcmc_samples(self, num_samples: int, dim: int, task_rank: int, **tkwargs):
         mcmc_samples = {
             "lengthscale": torch.rand(num_samples, 1, dim, **tkwargs),
@@ -604,6 +622,110 @@ def test_acquisition_functions(self):
             )
             self.assertEqual(acqf(test_X).shape, torch.Size(batch_shape))

+    def test_condition_on_observation(self) -> None:
+        # The following conditioned data shapes should work (output describes):
+        # training data shape after cond(batch shape in output is req. in gpytorch)
+        # X: num_models x n x d, Y: num_models x n x d --> num_models x n x d
+        # X: n x d, Y: n x d --> num_models x n x d
+        # X: n x d, Y: num_models x n x d --> num_models x n x d
+        num_models = 3
+        num_cond = 2
+        task_rank = 2
+        for infer_noise, dtype in itertools.product(
+            (True, False), (torch.float, torch.double)
+        ):
+            tkwargs = {"device": self.device, "dtype": dtype}
+            train_X, _, _, model = self._get_data_and_model(
+                task_rank=task_rank,
+                infer_noise=infer_noise,
+                **tkwargs,
+            )
+            num_dims = train_X.shape[1] - 1
+            mcmc_samples = self._get_mcmc_samples(
+                num_samples=3,
+                dim=num_dims,
+                task_rank=task_rank,
+                **tkwargs,
+            )
+            model.load_mcmc_samples(mcmc_samples)
+
+            num_train = train_X.shape[0]
+            test_X = torch.rand(num_models, num_dims, **tkwargs)
+
+            cond_X, cond_Y, cond_Yvar = self._get_unnormalized_condition_data(
+                num_models=num_models,
+                num_cond=num_cond,
+                infer_noise=infer_noise,
+                dim=num_dims,
+                **tkwargs,
+            )
+
+            # need to forward pass before conditioning
+            model.posterior(train_X)
+            cond_model = model.condition_on_observations(
+                cond_X, cond_Y, noise=cond_Yvar
+            )
+            posterior = cond_model.posterior(test_X)
+            self.assertEqual(
+                posterior.mean.shape, torch.Size([num_models, len(test_X), 2])
+            )
+
+            # since the data is not equal for the conditioned points, a batch size
+            # is added to the training data
+            self.assertEqual(
+                cond_model.train_inputs[0].shape,
+                torch.Size([num_models, num_train + num_cond, num_dims + 1]),
+            )
+
+            # the batch shape of the condition model is added during conditioning
+            self.assertEqual(cond_model.batch_shape, torch.Size([num_models]))
+
+            # condition on identical sets of data (i.e. one set) for all models
+            # i.e, with no batch shape. This infers the batch shape.
+            cond_X_nobatch, cond_Y_nobatch = cond_X[0], cond_Y[0]
+
+            # conditioning without a batch size - the resulting conditioned model
+            # will still have a batch size
+            model.posterior(train_X)
+            cond_model = model.condition_on_observations(
+                cond_X_nobatch, cond_Y_nobatch, noise=cond_Yvar
+            )
+            self.assertEqual(
+                cond_model.train_inputs[0].shape,
+                torch.Size([num_models, num_train + num_cond, num_dims + 1]),
+            )
+
+            # With batch size only on Y.
+            cond_model = model.condition_on_observations(
+                cond_X_nobatch, cond_Y, noise=cond_Yvar
+            )
+            self.assertEqual(
+                cond_model.train_inputs[0].shape,
+                torch.Size([num_models, num_train + num_cond, num_dims + 1]),
+            )
+
+            # test repeated conditioning
+            repeat_cond_X = cond_X.clone()
+            repeat_cond_X[..., 0:-1] += 2
+            repeat_cond_model = cond_model.condition_on_observations(
+                repeat_cond_X, cond_Y, noise=cond_Yvar
+            )
+            self.assertEqual(
+                repeat_cond_model.train_inputs[0].shape,
+                torch.Size([num_models, num_train + 2 * num_cond, num_dims + 1]),
+            )
+
+            # test repeated conditioning without a batch size
+            repeat_cond_X_nobatch = cond_X_nobatch.clone()
+            repeat_cond_X_nobatch[..., 0:-1] += 2
+            repeat_cond_model2 = repeat_cond_model.condition_on_observations(
+                repeat_cond_X_nobatch, cond_Y_nobatch, noise=cond_Yvar
+            )
+            self.assertEqual(
+                repeat_cond_model2.train_inputs[0].shape,
+                torch.Size([num_models, num_train + 3 * num_cond, num_dims + 1]),
+            )
+
     def test_load_samples(self):
         for task_rank, dtype, use_outcome_transform in itertools.product(
             [1, 2], [torch.float, torch.double], (False, True)
@@ -671,18 +793,6 @@ def test_load_samples(self):
                     train_Yvar_tf.clamp(MIN_INFERRED_NOISE_LEVEL),
                 )
             )
-            self.assertTrue(
-                torch.allclose(
-                    model.task_covar_module.lengthscale,
-                    mcmc_samples["task_lengthscale"],
-                )
-            )
-            self.assertTrue(
-                torch.allclose(
-                    model.latent_features,
-                    mcmc_samples["latent_features"],
-                )
-            )

     def test_construct_inputs(self):
         for dtype, infer_noise in [(torch.float, False), (torch.double, True)]:
