diff --git a/docs/source/examples/basic_usage.rst b/docs/source/examples/basic_usage.rst
index 6abfb059..4a883ee1 100644
--- a/docs/source/examples/basic_usage.rst
+++ b/docs/source/examples/basic_usage.rst
@@ -12,21 +12,27 @@ the parameters are updated using the resulting aggregation.
 
 Import several classes from ``torch`` and ``torchjd``:
 
->>> import torch
->>> from torch.nn import MSELoss, Sequential, Linear, ReLU
->>> from torch.optim import SGD
->>>
->>> import torchjd
->>> from torchjd.aggregation import UPGrad
+.. code-block:: python
+
+    import torch
+    from torch.nn import MSELoss, Sequential, Linear, ReLU
+    from torch.optim import SGD
+
+    import torchjd
+    from torchjd.aggregation import UPGrad
 
 Define the model and the optimizer, as usual:
 
->>> model = Sequential(Linear(10, 5), ReLU(), Linear(5, 2))
->>> optimizer = SGD(model.parameters(), lr=0.1)
+.. code-block:: python
+
+    model = Sequential(Linear(10, 5), ReLU(), Linear(5, 2))
+    optimizer = SGD(model.parameters(), lr=0.1)
 
 Define the aggregator that will be used to combine the Jacobian matrix:
 
->>> A = UPGrad()
+.. code-block:: python
+
+    A = UPGrad()
 
 In essence, :doc:`UPGrad <../docs/aggregation/upgrad>` projects each gradient onto the dual cone of
 the rows of the Jacobian and averages the results. This ensures that locally, no loss will be
@@ -34,34 +40,44 @@ negatively affected by the update.
 
 Now that everything is defined, we can train the model. Define the input and the associated target:
 
->>> input = torch.randn(16, 10)  # Batch of 16 random input vectors of length 10
->>> target1 = torch.randn(16)  # First batch of 16 targets
->>> target2 = torch.randn(16)  # Second batch of 16 targets
+.. code-block:: python
+
+    input = torch.randn(16, 10)  # Batch of 16 random input vectors of length 10
+    target1 = torch.randn(16)  # First batch of 16 targets
+    target2 = torch.randn(16)  # Second batch of 16 targets
 
 Here, we generate fake inputs and labels for the sake of the example. We can now compute the losses
 associated to each element of the batch.
 
->>> loss_fn = MSELoss()
->>> output = model(input)
->>> loss1 = loss_fn(output[:, 0], target1)
->>> loss2 = loss_fn(output[:, 1], target2)
+.. code-block:: python
+
+    loss_fn = MSELoss()
+    output = model(input)
+    loss1 = loss_fn(output[:, 0], target1)
+    loss2 = loss_fn(output[:, 1], target2)
 
 The last steps are similar to gradient descent-based optimization, but using the two losses.
 
 Reset the ``.grad`` field of each model parameter:
 
->>> optimizer.zero_grad()
+.. code-block:: python
+
+    optimizer.zero_grad()
 
 Perform the Jacobian descent backward pass:
 
->>> torchjd.backward([loss1, loss2], model.parameters(), A)
+.. code-block:: python
+
+    torchjd.backward([loss1, loss2], model.parameters(), A)
 
 This will populate the ``.grad`` field of each model parameter with the corresponding aggregated
 Jacobian matrix.
 
 Update each parameter based on its ``.grad`` field, using the ``optimizer``:
 
->>> optimizer.step()
+.. code-block:: python
+
+    optimizer.step()
 
 The model's parameters have been updated!
 
diff --git a/docs/source/examples/iwrm.rst b/docs/source/examples/iwrm.rst
index a7c31dec..c624891c 100644
--- a/docs/source/examples/iwrm.rst
+++ b/docs/source/examples/iwrm.rst
@@ -66,7 +66,7 @@ each Jacobian matrix consists of one gradient per loss. In this example, we use
 IWRM with SSJD
 ^^^^^^^^^^^^^^
 .. code-block:: python
-    :emphasize-lines: 10, 11, 21, 25, 29, 31
+    :emphasize-lines: 10-11, 21, 25, 29, 31
 
     import torch
     from torch.nn import (
diff --git a/docs/source/examples/mtl.rst b/docs/source/examples/mtl.rst
index b194a381..86a9d83a 100644
--- a/docs/source/examples/mtl.rst
+++ b/docs/source/examples/mtl.rst
@@ -17,46 +17,50 @@ example shows how to use TorchJD to train a very simple multi-task model with tw
 For the sake of the example, we generate a fake dataset consisting of 8 batches of 16 random input
 vectors of dimension 10, and their corresponding scalar labels for both tasks.
 
->>> import torch
->>> from torch.nn import Linear, MSELoss, ReLU, Sequential
->>> from torch.optim import SGD
->>>
->>> from torchjd import mtl_backward
->>> from torchjd.aggregation import UPGrad
->>>
->>> shared_module = Sequential(Linear(10, 5), ReLU(), Linear(5, 3), ReLU())
->>> task1_module = Linear(3, 1)
->>> task2_module = Linear(3, 1)
->>> params = [
->>>     *shared_module.parameters(),
->>>     *task1_module.parameters(),
->>>     *task2_module.parameters(),
->>> ]
->>>
->>> loss_fn = MSELoss()
->>> optimizer = SGD(params, lr=0.1)
->>> A = UPGrad()
->>>
->>> inputs = torch.randn(8, 16, 10)  # 8 batches of 16 random input vectors of length 10
->>> task1_targets = torch.randn(8, 16, 1)  # 8 batches of 16 targets for the first task
->>> task2_targets = torch.randn(8, 16, 1)  # 8 batches of 16 targets for the second task
->>>
->>> for input, target1, target2 in zip(inputs, task1_targets, task2_targets):
->>>     features = shared_module(input)
->>>     output1 = task1_module(features)
->>>     output2 = task2_module(features)
->>>     loss1 = loss_fn(output1, target1)
->>>     loss2 = loss_fn(output2, target2)
->>>
->>>     optimizer.zero_grad()
->>>     mtl_backward(
-...         losses=[loss1, loss2],
-...         features=features,
-...         tasks_params=[task1_module.parameters(), task2_module.parameters()],
-...         shared_params=shared_module.parameters(),
-...         A=A,
-...     )
->>>     optimizer.step()
+
+.. code-block:: python
+    :emphasize-lines: 5-6, 19, 33-39
+
+    import torch
+    from torch.nn import Linear, MSELoss, ReLU, Sequential
+    from torch.optim import SGD
+
+    from torchjd import mtl_backward
+    from torchjd.aggregation import UPGrad
+
+    shared_module = Sequential(Linear(10, 5), ReLU(), Linear(5, 3), ReLU())
+    task1_module = Linear(3, 1)
+    task2_module = Linear(3, 1)
+    params = [
+        *shared_module.parameters(),
+        *task1_module.parameters(),
+        *task2_module.parameters(),
+    ]
+
+    loss_fn = MSELoss()
+    optimizer = SGD(params, lr=0.1)
+    A = UPGrad()
+
+    inputs = torch.randn(8, 16, 10)  # 8 batches of 16 random input vectors of length 10
+    task1_targets = torch.randn(8, 16, 1)  # 8 batches of 16 targets for the first task
+    task2_targets = torch.randn(8, 16, 1)  # 8 batches of 16 targets for the second task
+
+    for input, target1, target2 in zip(inputs, task1_targets, task2_targets):
+        features = shared_module(input)
+        output1 = task1_module(features)
+        output2 = task2_module(features)
+        loss1 = loss_fn(output1, target1)
+        loss2 = loss_fn(output2, target2)
+
+        optimizer.zero_grad()
+        mtl_backward(
+            losses=[loss1, loss2],
+            features=features,
+            tasks_params=[task1_module.parameters(), task2_module.parameters()],
+            shared_params=shared_module.parameters(),
+            A=A,
+        )
+        optimizer.step()
 
 .. note::
     In this example, the Jacobian is only with respect to the shared parameters. The task-specific