diff --git a/.github/unittest/linux_examples/scripts/run_test.sh b/.github/unittest/linux_examples/scripts/run_test.sh index 075489b208d..a984b37faa9 100755 --- a/.github/unittest/linux_examples/scripts/run_test.sh +++ b/.github/unittest/linux_examples/scripts/run_test.sh @@ -51,10 +51,12 @@ python .github/unittest/helpers/coverage_run_parallel.py sota-implementations/iq python .github/unittest/helpers/coverage_run_parallel.py sota-implementations/cql/cql_offline.py \ optim.gradient_steps=55 \ logger.backend= - # ==================================================================================== # # ================================ Gymnasium ========================================= # +python .github/unittest/helpers/coverage_run_parallel.py sota-implementations/td3_bc/td3_bc.py \ + optim.gradient_steps=55 \ + logger.backend= python .github/unittest/helpers/coverage_run_parallel.py sota-implementations/impala/impala_single_node.py \ collector.total_frames=80 \ collector.frames_per_batch=20 \ diff --git a/README.md b/README.md index 5ac72ff052e..2b250dac540 100644 --- a/README.md +++ b/README.md @@ -501,6 +501,7 @@ A series of [examples](https://github.com/pytorch/rl/blob/main/examples/) are pr - [IQL](https://github.com/pytorch/rl/blob/main/sota-implementations/iql/iql_offline.py) - [CQL](https://github.com/pytorch/rl/blob/main/sota-implementations/cql/cql_offline.py) - [TD3](https://github.com/pytorch/rl/blob/main/sota-implementations/td3/td3.py) +- [TD3+BC](https://github.com/pytorch/rl/blob/main/sota-implementations/td3_bc/td3_bc.py) - [A2C](https://github.com/pytorch/rl/blob/main/examples/a2c_old/a2c.py) - [PPO](https://github.com/pytorch/rl/blob/main/sota-implementations/ppo/ppo.py) - [SAC](https://github.com/pytorch/rl/blob/main/sota-implementations/sac/sac.py) diff --git a/docs/source/reference/objectives.rst b/docs/source/reference/objectives.rst index c2f43d8e9b6..ef9bc1ee907 100644 --- a/docs/source/reference/objectives.rst +++ b/docs/source/reference/objectives.rst @@ -160,6 +160,15 @@ TD3 TD3Loss +TD3+BC +------ + +.. autosummary:: + :toctree: generated/ + :template: rl_template_noinherit.rst + + TD3BCLoss + PPO --- diff --git a/sota-check/run_td3bc.sh b/sota-check/run_td3bc.sh new file mode 100644 index 00000000000..0fefb3ecd6f --- /dev/null +++ b/sota-check/run_td3bc.sh @@ -0,0 +1,26 @@ +#!/bin/bash + +#SBATCH --job-name=td3bc_offline +#SBATCH --ntasks=32 +#SBATCH --cpus-per-task=1 +#SBATCH --gres=gpu:1 +#SBATCH --output=slurm_logs/td3bc_offline_%j.txt +#SBATCH --error=slurm_errors/td3bc_offline_%j.txt + +current_commit=$(git rev-parse --short HEAD) +project_name="torchrl-example-check-$current_commit" +group_name="td3bc_offline" +export PYTHONPATH=$(dirname $(dirname $PWD)) +python $PYTHONPATH/sota-implementations/td3_bc/td3_bc.py \ + logger.backend=wandb \ + logger.project_name="$project_name" \ + logger.group_name="$group_name" + +# Capture the exit status of the Python command +exit_status=$?
+# Write the exit status to a file +if [ $exit_status -eq 0 ]; then + echo "${group_name}_${SLURM_JOB_ID}=success" >> report.log +else + echo "${group_name}_${SLURM_JOB_ID}=error" >> report.log +fi diff --git a/sota-check/submitit-release-check.sh b/sota-check/submitit-release-check.sh index cad2783c653..515ac06a50b 100755 --- a/sota-check/submitit-release-check.sh +++ b/sota-check/submitit-release-check.sh @@ -65,6 +65,7 @@ scripts=( run_ppo_mujoco.sh run_sac.sh run_td3.sh + run_td3bc.sh run_dt.sh run_dt_online.sh ) diff --git a/sota-implementations/td3_bc/config.yaml b/sota-implementations/td3_bc/config.yaml new file mode 100644 index 00000000000..54275a94bc2 --- /dev/null +++ b/sota-implementations/td3_bc/config.yaml @@ -0,0 +1,45 @@ +# task and env +env: + name: HalfCheetah-v4 # Use v4 to get rid of mujoco-py dependency + task: "" + library: gymnasium + seed: 42 + max_episode_steps: 1000 + +# replay buffer +replay_buffer: + dataset: halfcheetah-medium-v2 + batch_size: 256 + +# optim +optim: + gradient_steps: 100000 + gamma: 0.99 + loss_function: l2 + lr: 3.0e-4 + weight_decay: 0.0 + adam_eps: 1e-4 + batch_size: 256 + target_update_polyak: 0.995 + policy_update_delay: 2 + policy_noise: 0.2 + noise_clip: 0.5 + alpha: 2.5 + +# network +network: + hidden_sizes: [256, 256] + activation: relu + device: null + +# logging +logger: + backend: wandb + project_name: td3+bc_${replay_buffer.dataset} + group_name: null + exp_name: TD3+BC_${replay_buffer.dataset} + mode: online + eval_iter: 5000 + eval_steps: 1000 + eval_envs: 1 + video: False diff --git a/sota-implementations/td3_bc/td3_bc.py b/sota-implementations/td3_bc/td3_bc.py new file mode 100644 index 00000000000..7c43fdc1a12 --- /dev/null +++ b/sota-implementations/td3_bc/td3_bc.py @@ -0,0 +1,146 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +"""TD3+BC Example. + +This is a self-contained example of an offline RL TD3+BC training script. + +The helper functions are coded in the utils.py associated with this script.
+ +""" +import time + +import hydra +import numpy as np +import torch +import tqdm +from torchrl._utils import logger as torchrl_logger + +from torchrl.envs import set_gym_backend +from torchrl.envs.utils import ExplorationType, set_exploration_type +from torchrl.record.loggers import generate_exp_name, get_logger + +from utils import ( + dump_video, + log_metrics, + make_environment, + make_loss_module, + make_offline_replay_buffer, + make_optimizer, + make_td3_agent, +) + + +@hydra.main(config_path="", config_name="config") +def main(cfg: "DictConfig"): # noqa: F821 + set_gym_backend(cfg.env.library).set() + + # Create logger + exp_name = generate_exp_name("TD3BC-offline", cfg.logger.exp_name) + logger = None + if cfg.logger.backend: + logger = get_logger( + logger_type=cfg.logger.backend, + logger_name="td3bc_logging", + experiment_name=exp_name, + wandb_kwargs={ + "mode": cfg.logger.mode, + "config": dict(cfg), + "project": cfg.logger.project_name, + "group": cfg.logger.group_name, + }, + ) + + # Set seeds + torch.manual_seed(cfg.env.seed) + np.random.seed(cfg.env.seed) + device = cfg.network.device + if device in ("", None): + if torch.cuda.is_available(): + device = "cuda:0" + else: + device = "cpu" + device = torch.device(device) + + # Creante env + eval_env = make_environment( + cfg, + logger=logger, + ) + + # Create replay buffer + replay_buffer = make_offline_replay_buffer(cfg.replay_buffer) + + # Create agent + model, _ = make_td3_agent(cfg, eval_env, device) + + # Create loss + loss_module, target_net_updater = make_loss_module(cfg.optim, model) + + # Create optimizer + optimizer_actor, optimizer_critic = make_optimizer(cfg.optim, loss_module) + + gradient_steps = cfg.optim.gradient_steps + evaluation_interval = cfg.logger.eval_iter + eval_steps = cfg.logger.eval_steps + delayed_updates = cfg.optim.policy_update_delay + update_counter = 0 + pbar = tqdm.tqdm(range(gradient_steps)) + # Training loop + start_time = time.time() + for i in pbar: + pbar.update(1) + # Update actor every delayed_updates + update_counter += 1 + update_actor = update_counter % delayed_updates == 0 + + # Sample from replay buffer + sampled_tensordict = replay_buffer.sample() + if sampled_tensordict.device != device: + sampled_tensordict = sampled_tensordict.to(device) + else: + sampled_tensordict = sampled_tensordict.clone() + + # Compute loss + q_loss, *_ = loss_module.qvalue_loss(sampled_tensordict) + + # Update critic + optimizer_critic.zero_grad() + q_loss.backward() + optimizer_critic.step() + q_loss.item() + + to_log = {"q_loss": q_loss.item()} + + # Update actor + if update_actor: + actor_loss, actorloss_metadata = loss_module.actor_loss(sampled_tensordict) + optimizer_actor.zero_grad() + actor_loss.backward() + optimizer_actor.step() + + # Update target params + target_net_updater.step() + + to_log["actor_loss"] = actor_loss.item() + to_log.update(actorloss_metadata) + + # evaluation + if i % evaluation_interval == 0: + with set_exploration_type(ExplorationType.MODE), torch.no_grad(): + eval_td = eval_env.rollout( + max_steps=eval_steps, policy=model[0], auto_cast_to_device=True + ) + eval_env.apply(dump_video) + eval_reward = eval_td["next", "reward"].sum(1).mean().item() + to_log["evaluation_reward"] = eval_reward + if logger is not None: + log_metrics(logger, to_log, i) + + pbar.close() + torchrl_logger.info(f"Training time: {time.time() - start_time}") + + +if __name__ == "__main__": + main() diff --git a/sota-implementations/td3_bc/utils.py b/sota-implementations/td3_bc/utils.py new file mode 
100644 index 00000000000..3772eefccde --- /dev/null +++ b/sota-implementations/td3_bc/utils.py @@ -0,0 +1,257 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +import functools + +import torch + +from torch import nn, optim +from torchrl.data.datasets.d4rl import D4RLExperienceReplay +from torchrl.data.replay_buffers import SamplerWithoutReplacement +from torchrl.envs import ( + CatTensors, + Compose, + DMControlEnv, + DoubleToFloat, + EnvCreator, + InitTracker, + ParallelEnv, + RewardSum, + StepCounter, + TransformedEnv, +) +from torchrl.envs.libs.gym import GymEnv, set_gym_backend +from torchrl.envs.utils import ExplorationType, set_exploration_type +from torchrl.modules import ( + AdditiveGaussianWrapper, + MLP, + SafeModule, + SafeSequential, + TanhModule, + ValueOperator, +) + +from torchrl.objectives import SoftUpdate +from torchrl.objectives.td3_bc import TD3BCLoss +from torchrl.record import VideoRecorder + + +# ==================================================================== +# Environment utils +# ----------------- + + +def env_maker(cfg, device="cpu", from_pixels=False): + lib = cfg.env.library + if lib in ("gym", "gymnasium"): + with set_gym_backend(lib): + return GymEnv( + cfg.env.name, + device=device, + from_pixels=from_pixels, + pixels_only=False, + ) + elif lib == "dm_control": + env = DMControlEnv( + cfg.env.name, cfg.env.task, from_pixels=from_pixels, pixels_only=False + ) + return TransformedEnv( + env, CatTensors(in_keys=env.observation_spec.keys(), out_key="observation") + ) + else: + raise NotImplementedError(f"Unknown lib {lib}.") + + +def apply_env_transforms(env, max_episode_steps): + transformed_env = TransformedEnv( + env, + Compose( + StepCounter(max_steps=max_episode_steps), + InitTracker(), + DoubleToFloat(), + RewardSum(), + ), + ) + return transformed_env + + +def make_environment(cfg, logger=None): + """Make environments for training and evaluation.""" + partial = functools.partial(env_maker, cfg=cfg) + parallel_env = ParallelEnv( + cfg.logger.eval_envs, + EnvCreator(partial), + serial_for_single=True, + ) + parallel_env.set_seed(cfg.env.seed) + + train_env = apply_env_transforms(parallel_env, cfg.env.max_episode_steps) + return train_env + + +# ==================================================================== +# Replay buffer +# --------------------------- + + +def make_offline_replay_buffer(rb_cfg): + data = D4RLExperienceReplay( + dataset_id=rb_cfg.dataset, + split_trajs=False, + batch_size=rb_cfg.batch_size, + sampler=SamplerWithoutReplacement(drop_last=False), + prefetch=4, + direct_download=True, + ) + + data.append_transform(DoubleToFloat()) + + return data + + +# ==================================================================== +# Model +# ----- + + +def make_td3_agent(cfg, train_env, device): + """Make TD3 agent.""" + # Define Actor Network + in_keys = ["observation"] + action_spec = train_env.action_spec + if train_env.batch_size: + action_spec = action_spec[(0,) * len(train_env.batch_size)] + actor_net_kwargs = { + "num_cells": cfg.network.hidden_sizes, + "out_features": action_spec.shape[-1], + "activation_class": get_activation(cfg), + } + + actor_net = MLP(**actor_net_kwargs) + + in_keys_actor = in_keys + actor_module = SafeModule( + actor_net, + in_keys=in_keys_actor, + out_keys=[ + "param", + ], + ) + actor = SafeSequential( + actor_module, + TanhModule( + in_keys=["param"], + 
out_keys=["action"], + spec=action_spec, + ), + ) + + # Define Critic Network + qvalue_net_kwargs = { + "num_cells": cfg.network.hidden_sizes, + "out_features": 1, + "activation_class": get_activation(cfg), + } + + qvalue_net = MLP( + **qvalue_net_kwargs, + ) + + qvalue = ValueOperator( + in_keys=["action"] + in_keys, + module=qvalue_net, + ) + + model = nn.ModuleList([actor, qvalue]).to(device) + + # init nets + with torch.no_grad(), set_exploration_type(ExplorationType.RANDOM): + td = train_env.fake_tensordict() + td = td.to(device) + for net in model: + net(td) + del td + + # Exploration wrappers: + actor_model_explore = AdditiveGaussianWrapper( + model[0], + sigma_init=1, + sigma_end=1, + mean=0, + std=0.1, + spec=action_spec, + ).to(device) + return model, actor_model_explore + + +# ==================================================================== +# TD3 Loss +# --------- + + +def make_loss_module(cfg, model): + """Make loss module and target network updater.""" + # Create TD3 loss + loss_module = TD3BCLoss( + actor_network=model[0], + qvalue_network=model[1], + num_qvalue_nets=2, + loss_function=cfg.loss_function, + delay_actor=True, + delay_qvalue=True, + action_spec=model[0][1].spec, + policy_noise=cfg.policy_noise, + noise_clip=cfg.noise_clip, + alpha=cfg.alpha, + ) + loss_module.make_value_estimator(gamma=cfg.gamma) + + # Define Target Network Updater + target_net_updater = SoftUpdate(loss_module, eps=cfg.target_update_polyak) + return loss_module, target_net_updater + + +def make_optimizer(cfg, loss_module): + critic_params = list(loss_module.qvalue_network_params.values(True, True)) + actor_params = list(loss_module.actor_network_params.values(True, True)) + + optimizer_actor = optim.Adam( + actor_params, + lr=cfg.lr, + weight_decay=cfg.weight_decay, + eps=cfg.adam_eps, + ) + optimizer_critic = optim.Adam( + critic_params, + lr=cfg.lr, + weight_decay=cfg.weight_decay, + eps=cfg.adam_eps, + ) + return optimizer_actor, optimizer_critic + + +# ==================================================================== +# General utils +# --------- + + +def log_metrics(logger, metrics, step): + for metric_name, metric_value in metrics.items(): + logger.log_scalar(metric_name, metric_value, step) + + +def get_activation(cfg): + if cfg.network.activation == "relu": + return nn.ReLU + elif cfg.network.activation == "tanh": + return nn.Tanh + elif cfg.network.activation == "leaky_relu": + return nn.LeakyReLU + else: + raise NotImplementedError + + +def dump_video(module): + if isinstance(module, VideoRecorder): + module.dump() diff --git a/test/test_cost.py b/test/test_cost.py index 76fc4e651f4..2f187c8e3ba 100644 --- a/test/test_cost.py +++ b/test/test_cost.py @@ -114,6 +114,7 @@ PPOLoss, QMixerLoss, SACLoss, + TD3BCLoss, TD3Loss, ) from torchrl.objectives.common import LossModule @@ -261,9 +262,9 @@ def __init__(self): self.vmap_model = _vmap_func( self.model, (None, 0), - randomness="error" - if vmap_randomness == "error" - else self.vmap_randomness, + randomness=( + "error" if vmap_randomness == "error" else self.vmap_randomness + ), ) def forward(self, td): @@ -319,9 +320,9 @@ def _create_mock_actor( spec=CompositeSpec( { "action": action_spec, - "action_value" - if action_value_key is None - else action_value_key: None, + ( + "action_value" if action_value_key is None else action_value_key + ): None, "chosen_action_value": None, }, shape=[], @@ -2714,6 +2715,729 @@ def test_td3_reduction(self, reduction): assert loss[key].shape == torch.Size([]) +class TestTD3BC(LossModuleTestBase): 
+ seed = 0 + + def _create_mock_actor( + self, + batch=2, + obs_dim=3, + action_dim=4, + device="cpu", + in_keys=None, + out_keys=None, + dropout=0.0, + ): + # Actor + action_spec = BoundedTensorSpec( + -torch.ones(action_dim), torch.ones(action_dim), (action_dim,) + ) + module = nn.Sequential( + nn.Linear(obs_dim, obs_dim), + nn.Dropout(dropout), + nn.Linear(obs_dim, action_dim), + ) + actor = Actor( + spec=action_spec, module=module, in_keys=in_keys, out_keys=out_keys + ) + return actor.to(device) + + def _create_mock_value( + self, + batch=2, + obs_dim=3, + action_dim=4, + device="cpu", + out_keys=None, + action_key="action", + observation_key="observation", + ): + # Actor + class ValueClass(nn.Module): + def __init__(self): + super().__init__() + self.linear = nn.Linear(obs_dim + action_dim, 1) + + def forward(self, obs, act): + return self.linear(torch.cat([obs, act], -1)) + + module = ValueClass() + value = ValueOperator( + module=module, + in_keys=[observation_key, action_key], + out_keys=out_keys, + ) + return value.to(device) + + def _create_mock_distributional_actor( + self, batch=2, obs_dim=3, action_dim=4, atoms=5, vmin=1, vmax=5 + ): + raise NotImplementedError + + def _create_mock_common_layer_setup( + self, n_obs=3, n_act=4, ncells=4, batch=2, n_hidden=2 + ): + common = MLP( + num_cells=ncells, + in_features=n_obs, + depth=3, + out_features=n_hidden, + ) + actor_net = MLP( + num_cells=ncells, + in_features=n_hidden, + depth=1, + out_features=2 * n_act, + ) + value = MLP( + in_features=n_hidden + n_act, + num_cells=ncells, + depth=1, + out_features=1, + ) + batch = [batch] + td = TensorDict( + { + "obs": torch.randn(*batch, n_obs), + "action": torch.randn(*batch, n_act), + "done": torch.zeros(*batch, 1, dtype=torch.bool), + "terminated": torch.zeros(*batch, 1, dtype=torch.bool), + "next": { + "obs": torch.randn(*batch, n_obs), + "reward": torch.randn(*batch, 1), + "done": torch.zeros(*batch, 1, dtype=torch.bool), + "terminated": torch.zeros(*batch, 1, dtype=torch.bool), + }, + }, + batch, + ) + common = Mod(common, in_keys=["obs"], out_keys=["hidden"]) + actor = ProbSeq( + common, + Mod(actor_net, in_keys=["hidden"], out_keys=["param"]), + Mod(NormalParamExtractor(), in_keys=["param"], out_keys=["loc", "scale"]), + ProbMod( + in_keys=["loc", "scale"], + out_keys=["action"], + distribution_class=TanhNormal, + return_log_prob=True, + ), + ) + value_head = Mod( + value, in_keys=["hidden", "action"], out_keys=["state_action_value"] + ) + value = Seq(common, value_head) + return actor, value, common, td + + def _create_mock_data_td3bc( + self, + batch=8, + obs_dim=3, + action_dim=4, + atoms=None, + device="cpu", + action_key="action", + observation_key="observation", + reward_key="reward", + done_key="done", + terminated_key="terminated", + ): + # create a tensordict + obs = torch.randn(batch, obs_dim, device=device) + next_obs = torch.randn(batch, obs_dim, device=device) + if atoms: + raise NotImplementedError + else: + action = torch.randn(batch, action_dim, device=device).clamp(-1, 1) + reward = torch.randn(batch, 1, device=device) + done = torch.zeros(batch, 1, dtype=torch.bool, device=device) + terminated = torch.zeros(batch, 1, dtype=torch.bool, device=device) + td = TensorDict( + batch_size=(batch,), + source={ + observation_key: obs, + "next": { + observation_key: next_obs, + done_key: done, + terminated_key: terminated, + reward_key: reward, + }, + action_key: action, + }, + device=device, + ) + return td + + def _create_seq_mock_data_td3bc( + self, batch=8, T=4, 
obs_dim=3, action_dim=4, atoms=None, device="cpu" + ): + # create a tensordict + total_obs = torch.randn(batch, T + 1, obs_dim, device=device) + obs = total_obs[:, :T] + next_obs = total_obs[:, 1:] + if atoms: + action = torch.randn(batch, T, atoms, action_dim, device=device).clamp( + -1, 1 + ) + else: + action = torch.randn(batch, T, action_dim, device=device).clamp(-1, 1) + reward = torch.randn(batch, T, 1, device=device) + done = torch.zeros(batch, T, 1, dtype=torch.bool, device=device) + terminated = torch.zeros(batch, T, 1, dtype=torch.bool, device=device) + mask = ~torch.zeros(batch, T, 1, dtype=torch.bool, device=device) + td = TensorDict( + batch_size=(batch, T), + source={ + "observation": obs * mask.to(obs.dtype), + "next": { + "observation": next_obs * mask.to(obs.dtype), + "reward": reward * mask.to(obs.dtype), + "done": done, + "terminated": terminated, + }, + "collector": {"mask": mask}, + "action": action * mask.to(obs.dtype), + }, + names=[None, "time"], + device=device, + ) + return td + + @pytest.mark.skipif(not _has_functorch, reason="functorch not installed") + @pytest.mark.parametrize("device", get_default_devices()) + @pytest.mark.parametrize( + "delay_actor, delay_qvalue", [(False, False), (True, True)] + ) + @pytest.mark.parametrize("policy_noise", [0.1, 1.0]) + @pytest.mark.parametrize("noise_clip", [0.1, 1.0]) + @pytest.mark.parametrize("alpha", [0.1, 6.0]) + @pytest.mark.parametrize("td_est", list(ValueEstimators) + [None]) + @pytest.mark.parametrize("use_action_spec", [True, False]) + @pytest.mark.parametrize("dropout", [0.0, 0.1]) + def test_td3bc( + self, + delay_actor, + delay_qvalue, + device, + policy_noise, + noise_clip, + alpha, + td_est, + use_action_spec, + dropout, + ): + torch.manual_seed(self.seed) + actor = self._create_mock_actor(device=device, dropout=dropout) + value = self._create_mock_value(device=device) + td = self._create_mock_data_td3bc(device=device) + if use_action_spec: + action_spec = actor.spec + bounds = None + else: + bounds = (-1, 1) + action_spec = None + loss_fn = TD3BCLoss( + actor, + value, + action_spec=action_spec, + bounds=bounds, + loss_function="l2", + policy_noise=policy_noise, + noise_clip=noise_clip, + alpha=alpha, + delay_actor=delay_actor, + delay_qvalue=delay_qvalue, + ) + if td_est in (ValueEstimators.GAE, ValueEstimators.VTrace): + with pytest.raises(NotImplementedError): + loss_fn.make_value_estimator(td_est) + return + if td_est is not None: + loss_fn.make_value_estimator(td_est) + with ( + pytest.warns( + UserWarning, + match="No target network updater has been associated with this loss module", + ) + if (delay_actor or delay_qvalue) + else contextlib.nullcontext() + ): + with _check_td_steady(td): + loss = loss_fn(td) + + assert all( + (p.grad is None) or (p.grad == 0).all() + for p in loss_fn.qvalue_network_params.values(True, True) + ) + assert all( + (p.grad is None) or (p.grad == 0).all() + for p in loss_fn.actor_network_params.values(True, True) + ) + # check that losses are independent + for k in loss.keys(): + if not k.startswith("loss"): + continue + loss[k].sum().backward(retain_graph=True) + if k == "loss_actor": + assert all( + (p.grad is None) or (p.grad == 0).all() + for p in loss_fn.qvalue_network_params.values(True, True) + ) + assert not any( + (p.grad is None) or (p.grad == 0).all() + for p in loss_fn.actor_network_params.values(True, True) + ) + elif k == "loss_qvalue": + assert all( + (p.grad is None) or (p.grad == 0).all() + for p in loss_fn.actor_network_params.values(True, True) + ) + 
assert not any( + (p.grad is None) or (p.grad == 0).all() + for p in loss_fn.qvalue_network_params.values(True, True) + ) + else: + raise NotImplementedError(k) + loss_fn.zero_grad() + + sum( + [item for name, item in loss.items() if name.startswith("loss_")] + ).backward() + named_parameters = list(loss_fn.named_parameters()) + named_buffers = list(loss_fn.named_buffers()) + + assert len({p for n, p in named_parameters}) == len(list(named_parameters)) + assert len({p for n, p in named_buffers}) == len(list(named_buffers)) + + for name, p in named_parameters: + if not name.startswith("target_"): + assert ( + p.grad is not None and p.grad.norm() > 0.0 + ), f"parameter {name} (shape: {p.shape}) has a null gradient" + else: + assert ( + p.grad is None or p.grad.norm() == 0.0 + ), f"target parameter {name} (shape: {p.shape}) has a non-null gradient" + + @pytest.mark.skipif(not _has_functorch, reason="functorch not installed") + @pytest.mark.parametrize("device", get_default_devices()) + @pytest.mark.parametrize( + "delay_actor, delay_qvalue", [(False, False), (True, True)] + ) + @pytest.mark.parametrize("policy_noise", [0.1]) + @pytest.mark.parametrize("noise_clip", [0.1]) + @pytest.mark.parametrize("alpha", [0.1]) + @pytest.mark.parametrize("use_action_spec", [True, False]) + def test_td3bc_state_dict( + self, + delay_actor, + delay_qvalue, + device, + policy_noise, + noise_clip, + alpha, + use_action_spec, + ): + torch.manual_seed(self.seed) + actor = self._create_mock_actor(device=device) + value = self._create_mock_value(device=device) + if use_action_spec: + action_spec = actor.spec + bounds = None + else: + bounds = (-1, 1) + action_spec = None + loss_fn = TD3BCLoss( + actor, + value, + action_spec=action_spec, + bounds=bounds, + loss_function="l2", + policy_noise=policy_noise, + noise_clip=noise_clip, + alpha=alpha, + delay_actor=delay_actor, + delay_qvalue=delay_qvalue, + ) + sd = loss_fn.state_dict() + loss_fn2 = TD3BCLoss( + actor, + value, + action_spec=action_spec, + bounds=bounds, + loss_function="l2", + policy_noise=policy_noise, + noise_clip=noise_clip, + alpha=alpha, + delay_actor=delay_actor, + delay_qvalue=delay_qvalue, + ) + loss_fn2.load_state_dict(sd) + + @pytest.mark.skipif(not _has_functorch, reason="functorch not installed") + @pytest.mark.parametrize("device", get_default_devices()) + @pytest.mark.parametrize("separate_losses", [False, True]) + def test_td3bc_separate_losses( + self, + device, + separate_losses, + n_act=4, + ): + torch.manual_seed(self.seed) + actor, value, common, td = self._create_mock_common_layer_setup(n_act=n_act) + loss_fn = TD3BCLoss( + actor, + value, + action_spec=BoundedTensorSpec(shape=(n_act,), low=-1, high=1), + loss_function="l2", + separate_losses=separate_losses, + ) + with pytest.warns(UserWarning, match="No target network updater has been"): + loss = loss_fn(td) + + assert all( + (p.grad is None) or (p.grad == 0).all() + for p in loss_fn.qvalue_network_params.values(True, True) + ) + assert all( + (p.grad is None) or (p.grad == 0).all() + for p in loss_fn.actor_network_params.values(True, True) + ) + # check that losses are independent + for k in loss.keys(): + if not k.startswith("loss"): + continue + loss[k].sum().backward(retain_graph=True) + if k == "loss_actor": + assert all( + (p.grad is None) or (p.grad == 0).all() + for p in loss_fn.qvalue_network_params.values(True, True) + ) + assert not any( + (p.grad is None) or (p.grad == 0).all() + for p in loss_fn.actor_network_params.values(True, True) + ) + elif k == "loss_qvalue": + 
assert all( + (p.grad is None) or (p.grad == 0).all() + for p in loss_fn.actor_network_params.values(True, True) + ) + if separate_losses: + common_layers_no = len(list(common.parameters())) + common_layers = itertools.islice( + loss_fn.qvalue_network_params.values(True, True), + common_layers_no, + ) + assert all( + (p.grad is None) or (p.grad == 0).all() + for p in common_layers + ) + qvalue_layers = itertools.islice( + loss_fn.qvalue_network_params.values(True, True), + common_layers_no, + None, + ) + assert not any( + (p.grad is None) or (p.grad == 0).all() + for p in qvalue_layers + ) + else: + assert not any( + (p.grad is None) or (p.grad == 0).all() + for p in loss_fn.qvalue_network_params.values(True, True) + ) + + else: + raise NotImplementedError(k) + loss_fn.zero_grad() + + @pytest.mark.skipif(not _has_functorch, reason="functorch not installed") + @pytest.mark.parametrize("n", range(1, 4)) + @pytest.mark.parametrize("device", get_default_devices()) + @pytest.mark.parametrize("delay_actor,delay_qvalue", [(False, False), (True, True)]) + @pytest.mark.parametrize("policy_noise", [0.1, 1.0]) + @pytest.mark.parametrize("noise_clip", [0.1, 1.0]) + @pytest.mark.parametrize("alpha", [0.1, 6.0]) + def test_td3bc_batcher( + self, + n, + delay_actor, + delay_qvalue, + device, + policy_noise, + noise_clip, + alpha, + gamma=0.9, + ): + torch.manual_seed(self.seed) + actor = self._create_mock_actor(device=device) + value = self._create_mock_value(device=device) + td = self._create_seq_mock_data_td3bc(device=device) + loss_fn = TD3BCLoss( + actor, + value, + action_spec=actor.spec, + policy_noise=policy_noise, + noise_clip=noise_clip, + alpha=alpha, + delay_qvalue=delay_qvalue, + delay_actor=delay_actor, + ) + + ms = MultiStep(gamma=gamma, n_steps=n).to(device) + + td_clone = td.clone() + ms_td = ms(td_clone) + + torch.manual_seed(0) + np.random.seed(0) + + with ( + pytest.warns(UserWarning, match="No target network updater has been") + if (delay_qvalue or delay_actor) + else contextlib.nullcontext() + ), _check_td_steady(ms_td): + loss_ms = loss_fn(ms_td) + assert loss_fn.tensor_keys.priority in ms_td.keys() + + if delay_qvalue or delay_actor: + SoftUpdate(loss_fn, eps=0.5) + + with torch.no_grad(): + torch.manual_seed(0) # log-prob is computed with a random action + np.random.seed(0) + loss = loss_fn(td) + + if n == 1: + assert_allclose_td(td, ms_td.select(*list(td.keys(True, True)))) + _loss = sum( + [item for name, item in loss.items() if name.startswith("loss_")] + ) + _loss_ms = sum( + [item for name, item in loss_ms.items() if name.startswith("loss_")] + ) + assert ( + abs(_loss - _loss_ms) < 1e-3 + ), f"found abs(loss-loss_ms) = {abs(loss - loss_ms):4.5f} for n=0" + else: + with pytest.raises(AssertionError): + assert_allclose_td(loss, loss_ms) + + sum( + [item for name, item in loss_ms.items() if name.startswith("loss_")] + ).backward() + named_parameters = loss_fn.named_parameters() + + for name, p in named_parameters: + if not name.startswith("target_"): + assert ( + p.grad is not None and p.grad.norm() > 0.0 + ), f"parameter {name} (shape: {p.shape}) has a null gradient" + else: + assert ( + p.grad is None or p.grad.norm() == 0.0 + ), f"target parameter {name} (shape: {p.shape}) has a non-null gradient" + + # Check param update effect on targets + target_actor = loss_fn.target_actor_network_params.clone().values( + include_nested=True, leaves_only=True + ) + target_qvalue = loss_fn.target_qvalue_network_params.clone().values( + include_nested=True, leaves_only=True + ) + for p in 
loss_fn.parameters(): + if p.requires_grad: + p.data += torch.randn_like(p) + target_actor2 = loss_fn.target_actor_network_params.clone().values( + include_nested=True, leaves_only=True + ) + target_qvalue2 = loss_fn.target_qvalue_network_params.clone().values( + include_nested=True, leaves_only=True + ) + if loss_fn.delay_actor: + assert all((p1 == p2).all() for p1, p2 in zip(target_actor, target_actor2)) + else: + assert not any( + (p1 == p2).any() for p1, p2 in zip(target_actor, target_actor2) + ) + if loss_fn.delay_qvalue: + assert all( + (p1 == p2).all() for p1, p2 in zip(target_qvalue, target_qvalue2) + ) + else: + assert not any( + (p1 == p2).any() for p1, p2 in zip(target_qvalue, target_qvalue2) + ) + + # check that policy is updated after parameter update + actorp_set = set(actor.parameters()) + loss_fnp_set = set(loss_fn.parameters()) + assert len(actorp_set.intersection(loss_fnp_set)) == len(actorp_set) + parameters = [p.clone() for p in actor.parameters()] + for p in loss_fn.parameters(): + if p.requires_grad: + p.data += torch.randn_like(p) + assert all((p1 != p2).all() for p1, p2 in zip(parameters, actor.parameters())) + + @pytest.mark.parametrize( + "td_est", [ValueEstimators.TD1, ValueEstimators.TD0, ValueEstimators.TDLambda] + ) + def test_td3bc_tensordict_keys(self, td_est): + actor = self._create_mock_actor() + value = self._create_mock_value() + loss_fn = TD3BCLoss( + actor, + value, + action_spec=actor.spec, + ) + + default_keys = { + "priority": "td_error", + "state_action_value": "state_action_value", + "action": "action", + "reward": "reward", + "done": "done", + "terminated": "terminated", + } + + self.tensordict_keys_test( + loss_fn, + default_keys=default_keys, + td_est=td_est, + ) + + value = self._create_mock_value(out_keys=["state_action_value_test"]) + loss_fn = TD3BCLoss( + actor, + value, + action_spec=actor.spec, + ) + key_mapping = { + "state_action_value": ("value", "state_action_value_test"), + "reward": ("reward", "reward_test"), + "done": ("done", ("done", "test")), + "terminated": ("terminated", ("terminated", "test")), + } + self.set_advantage_keys_through_loss_test(loss_fn, td_est, key_mapping) + + @pytest.mark.parametrize("spec", [True, False]) + @pytest.mark.parametrize("bounds", [True, False]) + def test_constructor(self, spec, bounds): + actor = self._create_mock_actor() + value = self._create_mock_value() + action_spec = actor.spec if spec else None + bounds = (-1, 1) if bounds else None + if (bounds is not None and action_spec is not None) or ( + bounds is None and action_spec is None + ): + with pytest.raises(ValueError, match="but not both"): + TD3BCLoss( + actor, + value, + action_spec=action_spec, + bounds=bounds, + ) + return + TD3BCLoss( + actor, + value, + action_spec=action_spec, + bounds=bounds, + ) + + # TODO: test for action_key, atm the action key of the TD3+BC loss is not configurable, + # since it is used in it's constructor + @pytest.mark.parametrize("observation_key", ["observation", "observation2"]) + @pytest.mark.parametrize("reward_key", ["reward", "reward2"]) + @pytest.mark.parametrize("done_key", ["done", "done2"]) + @pytest.mark.parametrize("terminated_key", ["terminated", "terminated2"]) + def test_td3bc_notensordict( + self, observation_key, reward_key, done_key, terminated_key + ): + torch.manual_seed(self.seed) + actor = self._create_mock_actor(in_keys=[observation_key]) + qvalue = self._create_mock_value( + observation_key=observation_key, out_keys=["state_action_value"] + ) + td = self._create_mock_data_td3bc( + 
observation_key=observation_key, + reward_key=reward_key, + done_key=done_key, + terminated_key=terminated_key, + ) + loss = TD3BCLoss(actor, qvalue, action_spec=actor.spec) + loss.set_keys(reward=reward_key, done=done_key, terminated=terminated_key) + + kwargs = { + observation_key: td.get(observation_key), + f"next_{reward_key}": td.get(("next", reward_key)), + f"next_{done_key}": td.get(("next", done_key)), + f"next_{terminated_key}": td.get(("next", terminated_key)), + f"next_{observation_key}": td.get(("next", observation_key)), + "action": td.get("action"), + } + td = TensorDict(kwargs, td.batch_size).unflatten_keys("_") + + with pytest.warns(UserWarning, match="No target network updater has been"): + torch.manual_seed(0) + loss_val_td = loss(td) + torch.manual_seed(0) + loss_val = loss(**kwargs) + loss_val_reconstruct = TensorDict(dict(zip(loss.out_keys, loss_val)), []) + assert_allclose_td(loss_val_reconstruct, loss_val_td) + + # test select + loss.select_out_keys("loss_actor", "loss_qvalue") + torch.manual_seed(0) + if torch.__version__ >= "2.0.0": + loss_actor, loss_qvalue = loss(**kwargs) + else: + with pytest.raises( + RuntimeError, + match="You are likely using tensordict.nn.dispatch with keyword arguments", + ): + loss_actor, loss_qvalue = loss(**kwargs) + return + + assert loss_actor == loss_val_td["loss_actor"] + assert loss_qvalue == loss_val_td["loss_qvalue"] + + @pytest.mark.parametrize("reduction", [None, "none", "mean", "sum"]) + def test_td3bc_reduction(self, reduction): + torch.manual_seed(self.seed) + device = ( + torch.device("cpu") + if torch.cuda.device_count() == 0 + else torch.device("cuda") + ) + actor = self._create_mock_actor(device=device) + value = self._create_mock_value(device=device) + td = self._create_mock_data_td3bc(device=device) + action_spec = actor.spec + bounds = None + loss_fn = TD3BCLoss( + actor, + value, + action_spec=action_spec, + bounds=bounds, + loss_function="l2", + delay_qvalue=False, + delay_actor=False, + reduction=reduction, + ) + loss_fn.make_value_estimator() + loss = loss_fn(td) + if reduction == "none": + for key in loss.keys(): + if key.startswith("loss"): + assert loss[key].shape == td.shape + else: + for key in loss.keys(): + if not key.startswith("loss"): + continue + assert loss[key].shape == torch.Size([]) + + @pytest.mark.skipif( not _has_functorch, reason=f"functorch not installed: {FUNCTORCH_ERR}" ) @@ -5686,9 +6410,9 @@ def _create_mock_actor( spec=CompositeSpec( { "action": action_spec, - "action_value" - if action_value_key is None - else action_value_key: None, + ( + "action_value" if action_value_key is None else action_value_key + ): None, "chosen_action_value": None, }, shape=[], diff --git a/torchrl/objectives/__init__.py b/torchrl/objectives/__init__.py index f8d2bd1d977..674c06123ad 100644 --- a/torchrl/objectives/__init__.py +++ b/torchrl/objectives/__init__.py @@ -17,6 +17,7 @@ from .reinforce import ReinforceLoss from .sac import DiscreteSACLoss, SACLoss from .td3 import TD3Loss +from .td3_bc import TD3BCLoss from .utils import ( default_value_kwargs, distance_loss, diff --git a/torchrl/objectives/td3_bc.py b/torchrl/objectives/td3_bc.py new file mode 100644 index 00000000000..93845bb00bd --- /dev/null +++ b/torchrl/objectives/td3_bc.py @@ -0,0 +1,571 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+from __future__ import annotations + +from dataclasses import dataclass +from typing import Optional, Tuple + +import torch + +from tensordict import TensorDict, TensorDictBase, TensorDictParams +from tensordict.nn import dispatch, TensorDictModule +from tensordict.utils import NestedKey +from torchrl.data.tensor_specs import BoundedTensorSpec, CompositeSpec, TensorSpec + +from torchrl.envs.utils import step_mdp +from torchrl.objectives.common import LossModule + +from torchrl.objectives.utils import ( + _cache_values, + _reduce, + _vmap_func, + default_value_kwargs, + distance_loss, + ValueEstimators, +) +from torchrl.objectives.value import TD0Estimator, TD1Estimator, TDLambdaEstimator + + +class TD3BCLoss(LossModule): + r"""TD3+BC Loss Module. + + Implementation of the TD3+BC loss presented in the paper `"A Minimalist Approach to + Offline Reinforcement Learning" <https://arxiv.org/abs/2106.06860>`_. + + This class incorporates two loss functions, executed sequentially within the `forward` method: + + 1. :meth:`~.qvalue_loss` + 2. :meth:`~.actor_loss` + + Users also have the option to call these functions directly in the same order if preferred. + + Args: + actor_network (TensorDictModule): the actor to be trained + qvalue_network (TensorDictModule): a single Q-value network that will + be duplicated as many times as needed. + + Keyword Args: + bounds (tuple of float, optional): the bounds of the action space. + Exclusive with ``action_spec``. Either this or ``action_spec`` must + be provided. + action_spec (TensorSpec, optional): the action spec. + Exclusive with ``bounds``. Either this or ``bounds`` must be provided. + num_qvalue_nets (int, optional): Number of Q-value networks to be + trained. Default is ``2``. + policy_noise (float, optional): Standard deviation for the target + policy action noise. Default is ``0.2``. + noise_clip (float, optional): Clipping range value for the sampled + target policy action noise. Default is ``0.5``. + alpha (float, optional): Weight for the behavioral cloning loss. + Defaults to ``2.5``. + priority_key (str, optional): Key where to write the priority value + for prioritized replay buffers. Default is + `"td_error"`. + loss_function (str, optional): loss function to be used for the Q-value. + Can be one of ``"smooth_l1"``, ``"l2"``, + ``"l1"``. Default is ``"smooth_l1"``. + delay_actor (bool, optional): whether to separate the target actor + networks from the actor networks used for + data collection. Default is ``True``. + delay_qvalue (bool, optional): Whether to separate the target Q value + networks from the Q value networks used + for data collection. Default is ``True``. + spec (TensorSpec, optional): the action tensor spec. If not provided + and the target entropy is ``"auto"``, it will be retrieved from + the actor. + separate_losses (bool, optional): if ``True``, shared parameters between + policy and critic will only be trained on the policy loss. + Defaults to ``False``, i.e. gradients are propagated to shared + parameters for both policy and critic losses. + reduction (str, optional): Specifies the reduction to apply to the output: + ``"none"`` | ``"mean"`` | ``"sum"``. ``"none"``: no reduction will be applied, + ``"mean"``: the sum of the output will be divided by the number of + elements in the output, ``"sum"``: the output will be summed. Default: ``"mean"``.
+ + Examples: + >>> import torch + >>> from torch import nn + >>> from torchrl.data import BoundedTensorSpec + >>> from torchrl.modules.distributions.continuous import NormalParamWrapper, TanhNormal + >>> from torchrl.modules.tensordict_module.actors import Actor, ProbabilisticActor, ValueOperator + >>> from torchrl.modules.tensordict_module.common import SafeModule + >>> from torchrl.objectives.td3_bc import TD3BCLoss + >>> from tensordict import TensorDict + >>> n_act, n_obs = 4, 3 + >>> spec = BoundedTensorSpec(-torch.ones(n_act), torch.ones(n_act), (n_act,)) + >>> module = nn.Linear(n_obs, n_act) + >>> actor = Actor( + ... module=module, + ... spec=spec) + >>> class ValueClass(nn.Module): + ... def __init__(self): + ... super().__init__() + ... self.linear = nn.Linear(n_obs + n_act, 1) + ... def forward(self, obs, act): + ... return self.linear(torch.cat([obs, act], -1)) + >>> module = ValueClass() + >>> qvalue = ValueOperator( + ... module=module, + ... in_keys=['observation', 'action']) + >>> loss = TD3BCLoss(actor, qvalue, action_spec=actor.spec) + >>> batch = [2, ] + >>> action = spec.rand(batch) + >>> data = TensorDict({ + ... "observation": torch.randn(*batch, n_obs), + ... "action": action, + ... ("next", "done"): torch.zeros(*batch, 1, dtype=torch.bool), + ... ("next", "terminated"): torch.zeros(*batch, 1, dtype=torch.bool), + ... ("next", "reward"): torch.randn(*batch, 1), + ... ("next", "observation"): torch.randn(*batch, n_obs), + ... }, batch) + >>> loss(data) + TensorDict( + fields={ + bc_loss: Tensor(shape=torch.Size([]), device=cpu, dtype=torch.float32, is_shared=False), + lmbd: Tensor(shape=torch.Size([]), device=cpu, dtype=torch.float32, is_shared=False), + loss_actor: Tensor(shape=torch.Size([]), device=cpu, dtype=torch.float32, is_shared=False), + loss_qvalue: Tensor(shape=torch.Size([]), device=cpu, dtype=torch.float32, is_shared=False), + next_state_value: Tensor(shape=torch.Size([2]), device=cpu, dtype=torch.float32, is_shared=False), + pred_value: Tensor(shape=torch.Size([2, 2]), device=cpu, dtype=torch.float32, is_shared=False), + state_action_value_actor: Tensor(shape=torch.Size([2, 2]), device=cpu, dtype=torch.float32, is_shared=False), + target_value: Tensor(shape=torch.Size([2]), device=cpu, dtype=torch.float32, is_shared=False)}, + batch_size=torch.Size([]), + device=None, + is_shared=False) + + This class is compatible with non-tensordict based modules too and can be + used without resorting to any tensordict-related primitive. In this case, + the expected keyword arguments are: + ``["action", "next_reward", "next_done", "next_terminated"]`` + in_keys of the actor and qvalue network + The return value is a tuple of tensors in the following order: + ``["loss_actor", "loss_qvalue", "bc_loss", "lmbd", "pred_value", "state_action_value_actor", "next_state_value", "target_value",]``. + + Examples: + >>> import torch + >>> from torch import nn + >>> from torchrl.data import BoundedTensorSpec + >>> from torchrl.modules.tensordict_module.actors import Actor, ValueOperator + >>> from torchrl.objectives.td3_bc import TD3BCLoss + >>> n_act, n_obs = 4, 3 + >>> spec = BoundedTensorSpec(-torch.ones(n_act), torch.ones(n_act), (n_act,)) + >>> module = nn.Linear(n_obs, n_act) + >>> actor = Actor( + ... module=module, + ... spec=spec) + >>> class ValueClass(nn.Module): + ... def __init__(self): + ... super().__init__() + ... self.linear = nn.Linear(n_obs + n_act, 1) + ... def forward(self, obs, act): + ...
return self.linear(torch.cat([obs, act], -1)) + >>> module = ValueClass() + >>> qvalue = ValueOperator( + ... module=module, + ... in_keys=['observation', 'action']) + >>> loss = TD3BCLoss(actor, qvalue, action_spec=actor.spec) + >>> _ = loss.select_out_keys("loss_actor", "loss_qvalue") + >>> batch = [2, ] + >>> action = spec.rand(batch) + >>> loss_actor, loss_qvalue = loss( + ... observation=torch.randn(*batch, n_obs), + ... action=action, + ... next_done=torch.zeros(*batch, 1, dtype=torch.bool), + ... next_terminated=torch.zeros(*batch, 1, dtype=torch.bool), + ... next_reward=torch.randn(*batch, 1), + ... next_observation=torch.randn(*batch, n_obs)) + >>> loss_actor.backward() + + """ + + @dataclass + class _AcceptedKeys: + """Maintains default values for all configurable tensordict keys. + + This class defines which tensordict keys can be set using '.set_keys(key_name=key_value)' and their + default values. + + Attributes: + action (NestedKey): The input tensordict key where the action is expected. + Defaults to ``"action"``. + state_action_value (NestedKey): The input tensordict key where the state action value is expected. + Will be used for the underlying value estimator. Defaults to ``"state_action_value"``. + priority (NestedKey): The input tensordict key where the target priority is written to. + Defaults to ``"td_error"``. + reward (NestedKey): The input tensordict key where the reward is expected. + Will be used for the underlying value estimator. Defaults to ``"reward"``. + done (NestedKey): The key in the input TensorDict that indicates + whether a trajectory is done. Will be used for the underlying value estimator. + Defaults to ``"done"``. + terminated (NestedKey): The key in the input TensorDict that indicates + whether a trajectory is terminated. Will be used for the underlying value estimator. + Defaults to ``"terminated"``. 
+ """ + + action: NestedKey = "action" + state_action_value: NestedKey = "state_action_value" + priority: NestedKey = "td_error" + reward: NestedKey = "reward" + done: NestedKey = "done" + terminated: NestedKey = "terminated" + + default_keys = _AcceptedKeys() + default_value_estimator = ValueEstimators.TD0 + out_keys = [ + "loss_actor", + "loss_qvalue", + "bc_loss", + "lmbd", + "pred_value", + "state_action_value_actor", + "next_state_value", + "target_value", + ] + + actor_network: TensorDictModule + qvalue_network: TensorDictModule + actor_network_params: TensorDictParams + qvalue_network_params: TensorDictParams + target_actor_network_params: TensorDictParams + target_qvalue_network_params: TensorDictParams + + def __init__( + self, + actor_network: TensorDictModule, + qvalue_network: TensorDictModule, + *, + action_spec: TensorSpec = None, + bounds: Optional[Tuple[float]] = None, + num_qvalue_nets: int = 2, + policy_noise: float = 0.2, + noise_clip: float = 0.5, + alpha: float = 2.5, + loss_function: str = "smooth_l1", + delay_actor: bool = True, + delay_qvalue: bool = True, + priority_key: str = None, + separate_losses: bool = False, + reduction: str = None, + ) -> None: + if reduction is None: + reduction = "mean" + super().__init__() + self._in_keys = None + self._set_deprecated_ctor_keys(priority=priority_key) + + self.delay_actor = delay_actor + self.delay_qvalue = delay_qvalue + + self.convert_to_functional( + actor_network, + "actor_network", + create_target_params=self.delay_actor, + ) + if separate_losses: + # we want to make sure there are no duplicates in the params: the + # params of critic must be refs to actor if they're shared + policy_params = list(actor_network.parameters()) + else: + policy_params = None + self.convert_to_functional( + qvalue_network, + "qvalue_network", + num_qvalue_nets, + create_target_params=self.delay_qvalue, + compare_against=policy_params, + ) + + for p in self.parameters(): + device = p.device + break + else: + device = None + self.num_qvalue_nets = num_qvalue_nets + self.loss_function = loss_function + self.policy_noise = policy_noise + self.noise_clip = noise_clip + self.alpha = alpha + if not ((action_spec is not None) ^ (bounds is not None)): + raise ValueError( + "One of 'bounds' and 'action_spec' must be provided, " + f"but not both or none. Got bounds={bounds} and action_spec={action_spec}." + ) + elif action_spec is not None: + if isinstance(action_spec, CompositeSpec): + if ( + isinstance(self.tensor_keys.action, tuple) + and len(self.tensor_keys.action) > 1 + ): + action_container_shape = action_spec[ + self.tensor_keys.action[:-1] + ].shape + else: + action_container_shape = action_spec.shape + action_spec = action_spec[self.tensor_keys.action][ + (0,) * len(action_container_shape) + ] + if not isinstance(action_spec, BoundedTensorSpec): + raise ValueError( + f"action_spec is not of type BoundedTensorSpec but {type(action_spec)}." 
+ ) + low = action_spec.space.low + high = action_spec.space.high + else: + low, high = bounds + if not isinstance(low, torch.Tensor): + low = torch.tensor(low) + if not isinstance(high, torch.Tensor): + high = torch.tensor(high, device=low.device, dtype=low.dtype) + if (low > high).any(): + raise ValueError("Got a low bound higher than a high bound.") + if device is not None: + low = low.to(device) + high = high.to(device) + self.register_buffer("max_action", high) + self.register_buffer("min_action", low) + self._vmap_qvalue_network00 = _vmap_func( + self.qvalue_network, randomness=self.vmap_randomness + ) + self._vmap_actor_network00 = _vmap_func( + self.actor_network, randomness=self.vmap_randomness + ) + self.reduction = reduction + + def _forward_value_estimator_keys(self, **kwargs) -> None: + if self._value_estimator is not None: + self._value_estimator.set_keys( + value=self._tensor_keys.state_action_value, + reward=self.tensor_keys.reward, + done=self.tensor_keys.done, + terminated=self.tensor_keys.terminated, + ) + self._set_in_keys() + + def _set_in_keys(self): + keys = [ + self.tensor_keys.action, + ("next", self.tensor_keys.reward), + ("next", self.tensor_keys.done), + ("next", self.tensor_keys.terminated), + *self.actor_network.in_keys, + *[("next", key) for key in self.actor_network.in_keys], + *self.qvalue_network.in_keys, + ] + self._in_keys = list(set(keys)) + + @property + def in_keys(self): + if self._in_keys is None: + self._set_in_keys() + return self._in_keys + + @in_keys.setter + def in_keys(self, values): + self._in_keys = values + + @property + @_cache_values + def _cached_detach_qvalue_network_params(self): + return self.qvalue_network_params.detach() + + @property + @_cache_values + def _cached_stack_actor_params(self): + return torch.stack( + [self.actor_network_params, self.target_actor_network_params], 0 + ) + + def actor_loss(self, tensordict): + """Compute the actor loss. + + The actor loss should be computed after the :meth:`~.qvalue_loss` and is usually delayed 1-3 critic updates. + + Args: + tensordict (TensorDictBase): the input data for the loss. Check the class's `in_keys` to see what fields + are required for this to be computed. + Returns: a differentiable tensor with the actor loss along with a metadata dictionary containing the detached `"bc_loss"` + used in the combined actor loss as well as the detached `"state_action_value_actor"` used to calculate the lambda + value, and the lambda value `"lmbd"` itself. 
+ """ + tensordict_actor_grad = tensordict.select( + *self.actor_network.in_keys, strict=False + ) + with self.actor_network_params.to_module(self.actor_network): + tensordict_actor_grad = self.actor_network(tensordict_actor_grad) + actor_loss_td = tensordict_actor_grad.select( + *self.qvalue_network.in_keys, strict=False + ).expand( + self.num_qvalue_nets, *tensordict_actor_grad.batch_size + ) # for actor loss + state_action_value_actor = ( + self._vmap_qvalue_network00( + actor_loss_td, + self._cached_detach_qvalue_network_params, + ) + .get(self.tensor_keys.state_action_value) + .squeeze(-1) + ) + + bc_loss = torch.nn.functional.mse_loss( + tensordict_actor_grad.get(self.tensor_keys.action), + tensordict.get(self.tensor_keys.action), + ) + lmbd = self.alpha / state_action_value_actor[0].abs().mean().detach() + + loss_actor = -lmbd * state_action_value_actor[0] + bc_loss + + metadata = { + "state_action_value_actor": state_action_value_actor[0].detach(), + "bc_loss": bc_loss.detach(), + "lmbd": lmbd, + } + loss_actor = _reduce(loss_actor, reduction=self.reduction) + return loss_actor, metadata + + def qvalue_loss(self, tensordict): + """Compute the q-value loss. + + The q-value loss should be computed before the :meth:`~.actor_loss`. + + Args: + tensordict (TensorDictBase): the input data for the loss. Check the class's `in_keys` to see what fields + are required for this to be computed. + Returns: a differentiable tensor with the qvalue loss along with a metadata dictionary containing + the detached `"td_error"` to be used for prioritized sampling, the detached `"next_state_value"`, the detached `"pred_value"`, and the detached `"target_value"`. + """ + tensordict = tensordict.clone(False) + + act = tensordict.get(self.tensor_keys.action) + + # computing early for reprod + noise = (torch.randn_like(act) * self.policy_noise).clamp( + -self.noise_clip, self.noise_clip + ) + + with torch.no_grad(): + next_td_actor = step_mdp(tensordict).select( + *self.actor_network.in_keys, strict=False + ) # next_observation -> + with self.target_actor_network_params.to_module(self.actor_network): + next_td_actor = self.actor_network(next_td_actor) + next_action = (next_td_actor.get(self.tensor_keys.action) + noise).clamp( + self.min_action, self.max_action + ) + next_td_actor.set( + self.tensor_keys.action, + next_action, + ) + next_val_td = next_td_actor.select( + *self.qvalue_network.in_keys, strict=False + ).expand( + self.num_qvalue_nets, *next_td_actor.batch_size + ) # for next value estimation + next_target_q1q2 = ( + self._vmap_qvalue_network00( + next_val_td, + self.target_qvalue_network_params, + ) + .get(self.tensor_keys.state_action_value) + .squeeze(-1) + ) + # min over the next target qvalues + next_target_qvalue = next_target_q1q2.min(0)[0] + + # set next target qvalues + tensordict.set( + ("next", self.tensor_keys.state_action_value), + next_target_qvalue.unsqueeze(-1), + ) + + qval_td = tensordict.select(*self.qvalue_network.in_keys, strict=False).expand( + self.num_qvalue_nets, + *tensordict.batch_size, + ) + # preditcted current qvalues + current_qvalue = ( + self._vmap_qvalue_network00( + qval_td, + self.qvalue_network_params, + ) + .get(self.tensor_keys.state_action_value) + .squeeze(-1) + ) + + # compute target values for the qvalue loss (reward + gamma * next_target_qvalue * (1 - done)) + target_value = self.value_estimator.value_estimate(tensordict).squeeze(-1) + + td_error = (current_qvalue - target_value).pow(2) + loss_qval = distance_loss( + current_qvalue, + 
target_value.expand_as(current_qvalue), + loss_function=self.loss_function, + ).sum(0) + metadata = { + "td_error": td_error, + "next_state_value": next_target_qvalue.detach(), + "pred_value": current_qvalue.detach(), + "target_value": target_value.detach(), + } + loss_qval = _reduce(loss_qval, reduction=self.reduction) + return loss_qval, metadata + + @dispatch + def forward(self, tensordict: TensorDictBase) -> TensorDictBase: + """The forward method. + + Computes successively the :meth:`~.actor_loss`, :meth:`~.qvalue_loss`, and returns + a tensordict with these values. + To see what keys are expected in the input tensordict and what keys are expected as output, check the + class's `"in_keys"` and `"out_keys"` attributes. + """ + tensordict_save = tensordict + loss_actor, metadata_actor = self.actor_loss(tensordict) + loss_qval, metadata_value = self.qvalue_loss(tensordict_save) + tensordict_save.set( + self.tensor_keys.priority, metadata_value.pop("td_error").detach().max(0)[0] + ) + if not loss_qval.shape == loss_actor.shape: + raise RuntimeError( + f"QVal and actor loss have different shape: {loss_qval.shape} and {loss_actor.shape}" + ) + td_out = TensorDict( + source={ + "loss_actor": loss_actor, + "loss_qvalue": loss_qval, + **metadata_actor, + **metadata_value, + }, + batch_size=[], + ) + return td_out + + def make_value_estimator(self, value_type: ValueEstimators = None, **hyperparams): + if value_type is None: + value_type = self.default_value_estimator + self.value_type = value_type + hp = dict(default_value_kwargs(value_type)) + if hasattr(self, "gamma"): + hp["gamma"] = self.gamma + hp.update(hyperparams) + # we do not need a value network because the next state value is already passed + if value_type == ValueEstimators.TD1: + self._value_estimator = TD1Estimator(value_network=None, **hp) + elif value_type == ValueEstimators.TD0: + self._value_estimator = TD0Estimator(value_network=None, **hp) + elif value_type == ValueEstimators.GAE: + raise NotImplementedError( + f"Value type {value_type} is not implemented for loss {type(self)}." + ) + elif value_type == ValueEstimators.TDLambda: + self._value_estimator = TDLambdaEstimator(value_network=None, **hp) + else: + raise NotImplementedError(f"Unknown value type {value_type}") + + tensor_keys = { + "value": self.tensor_keys.state_action_value, + "reward": self.tensor_keys.reward, + "done": self.tensor_keys.done, + "terminated": self.tensor_keys.terminated, + } + self._value_estimator.set_keys(**tensor_keys)
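
Note on the added objective: the actor update in TD3BCLoss.actor_loss above reduces to lmbd = alpha / mean(|Q(s, pi(s))|) and loss_actor = -lmbd * Q(s, pi(s)) + MSE(pi(s), a_dataset), i.e. a deterministic policy-gradient term normalized by the average absolute Q-value plus a behavioral-cloning term. The snippet below is a minimal standalone sketch of that arithmetic in plain PyTorch; the toy linear actor/critic, shapes and seed are illustrative assumptions and are not part of this PR.

import torch
from torch import nn

torch.manual_seed(0)
n_obs, n_act, batch, alpha = 3, 4, 8, 2.5

# Hypothetical stand-ins for the trained actor and one critic head.
actor = nn.Sequential(nn.Linear(n_obs, n_act), nn.Tanh())
critic = nn.Linear(n_obs + n_act, 1)
for p in critic.parameters():
    p.requires_grad_(False)  # mimic the detached critic params used in actor_loss

obs = torch.randn(batch, n_obs)
dataset_action = torch.randn(batch, n_act).clamp(-1, 1)  # behavior action from the offline dataset

policy_action = actor(obs)                                      # pi(s)
q_sa = critic(torch.cat([obs, policy_action], -1)).squeeze(-1)  # Q(s, pi(s))

lmbd = alpha / q_sa.abs().mean().detach()         # scale-invariant weight (Fujimoto & Gu, 2021)
bc_loss = nn.functional.mse_loss(policy_action, dataset_action)
actor_loss = (-lmbd * q_sa).mean() + bc_loss      # DDPG-style term + behavioral-cloning term
actor_loss.backward()                             # gradients reach the actor only; the critic is frozen above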