From f73b7ae4f37cef98a57a898091898dbe945db584 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alexandre=20D=C3=A9fossez?=
Date: Thu, 26 Oct 2023 17:58:04 +0200
Subject: [PATCH 1/3] Many bug fixes for v1.1.0 (#335)

* wip
* fix bug in sample storage
* fix bug in fsdp with pytorch 2.0.1
* fix kld
* revert back passt change
* fix n_quantizers
* changes
* adding warning
* extra warnings and tests
* missing changelog
* missing changes between audiogen and musicgen
---
 CHANGELOG.md                             | 18 +++++++++++++++++-
 audiocraft/__init__.py                   |  2 +-
 audiocraft/models/audiogen.py            |  4 ++++
 audiocraft/models/builders.py            |  5 ++---
 audiocraft/models/encodec.py             |  2 +-
 audiocraft/models/lm.py                  |  8 +++++---
 audiocraft/models/musicgen.py            |  4 ++++
 audiocraft/modules/codebooks_patterns.py | 11 ++++++++---
 audiocraft/modules/conditioners.py       |  2 ++
 audiocraft/optim/fsdp.py                 |  4 ++--
 audiocraft/solvers/musicgen.py           | 10 ++++++++--
 audiocraft/utils/cache.py                |  2 +-
 config/conditioner/clapemb2music.yaml    |  2 +-
 config/model/lm/audiogen_lm.yaml         |  2 +-
 config/model/lm/musicgen_lm.yaml         |  2 +-
 docs/MUSICGEN.md                         |  4 ++++
 tests/models/test_musicgen.py            |  7 +++++++
 17 files changed, 69 insertions(+), 20 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 01026a6f..b8de9f7e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,10 +5,26 @@ All notable changes to this project will be documented in this file.
 
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 
-## [1.0.1] - TBD
+## [1.1.0a] - TBD
 
 Not using torchaudio anymore when writing audio files, relying instead directly on the commandline ffmpeg. Also not using it anymore for reading audio files, for similar reasons.
 
+Fixed DAC support with a non-default number of codebooks.
+
+Fixed a bug where `two_step_cfg` was overridden when calling `generate()`.
+
+Fixed samples always being prompted with audio, rather than generating both prompted and unprompted samples.
+
+**Backward incompatible change:** A `torch.no_grad` around the computation of the conditioning made its way into the public release.
+  The released models were trained without this. This impacts the linear layers applied to the output of the T5 or melody conditioners.
+  We removed it, so you might need to retrain models.
+
+**Backward incompatible change:** Fixed wrong sample rate in CLAP (WARNING if you trained a model with CLAP before).
+
+**Backward incompatible change:** Renamed VALLEPattern to CoarseFirstPattern, as it was wrongly named. Probably no one
+  retrained a model with this pattern, so hopefully this won't impact you!
+
+
 ## [1.0.0] - 2023-09-07
 
 Major revision, added training code for EnCodec, AudioGen, MusicGen, and MultiBandDiffusion.
diff --git a/audiocraft/__init__.py b/audiocraft/__init__.py
index 6ab34607..130190f1 100644
--- a/audiocraft/__init__.py
+++ b/audiocraft/__init__.py
@@ -23,4 +23,4 @@
 # flake8: noqa
 from . import data, modules, models
 
-__version__ = '1.0.0'
+__version__ = '1.1.0a1'
diff --git a/audiocraft/models/audiogen.py b/audiocraft/models/audiogen.py
index 5cb88998..b4df536e 100644
--- a/audiocraft/models/audiogen.py
+++ b/audiocraft/models/audiogen.py
@@ -38,6 +38,10 @@ def __init__(self, name: str, compression_model: CompressionModel, lm: LMModel,
         self.name = name
         self.compression_model = compression_model
         self.lm = lm
+        # Just to be safe, let's put everything in eval mode.
+ self.compression_model.eval() + self.lm.eval() + if max_duration is None: if hasattr(lm, 'cfg'): max_duration = lm.cfg.dataset.segment_duration # type: ignore diff --git a/audiocraft/models/builders.py b/audiocraft/models/builders.py index 038bf99c..272ce1b3 100644 --- a/audiocraft/models/builders.py +++ b/audiocraft/models/builders.py @@ -23,7 +23,7 @@ MusicLMPattern, ParallelPatternProvider, UnrolledPatternProvider, - VALLEPattern, + CoarseFirstPattern, ) from ..modules.conditioners import ( BaseConditioner, @@ -172,7 +172,7 @@ def get_codebooks_pattern_provider(n_q: int, cfg: omegaconf.DictConfig) -> Codeb 'parallel': ParallelPatternProvider, 'delay': DelayedPatternProvider, 'unroll': UnrolledPatternProvider, - 'valle': VALLEPattern, + 'coarse_first': CoarseFirstPattern, 'musiclm': MusicLMPattern, } name = cfg.modeling @@ -196,7 +196,6 @@ def get_debug_compression_model(device='cpu', sample_rate: int = 32000): 'dimension': 32, 'ratios': ratios, } - print(seanet_kwargs) encoder = audiocraft.modules.SEANetEncoder(**seanet_kwargs) decoder = audiocraft.modules.SEANetDecoder(**seanet_kwargs) quantizer = qt.ResidualVectorQuantizer(dimension=32, bins=400, n_q=4) diff --git a/audiocraft/models/encodec.py b/audiocraft/models/encodec.py index 40d13301..a3bd9274 100644 --- a/audiocraft/models/encodec.py +++ b/audiocraft/models/encodec.py @@ -276,7 +276,7 @@ def forward(self, x: torch.Tensor) -> qt.QuantizedResult: def encode(self, x: torch.Tensor) -> tp.Tuple[torch.Tensor, tp.Optional[torch.Tensor]]: codes = self.model.encode(x, self.n_quantizers)[1] - return codes, None + return codes[:, :self.n_quantizers], None def decode(self, codes: torch.Tensor, scale: tp.Optional[torch.Tensor] = None): assert scale is None diff --git a/audiocraft/models/lm.py b/audiocraft/models/lm.py index 8cefd2c5..c4ea2e5e 100644 --- a/audiocraft/models/lm.py +++ b/audiocraft/models/lm.py @@ -314,7 +314,8 @@ def _sample_next_token(self, temp: float = 1.0, top_k: int = 0, top_p: float = 0.0, - cfg_coef: tp.Optional[float] = None) -> torch.Tensor: + cfg_coef: tp.Optional[float] = None, + two_step_cfg: tp.Optional[bool] = None) -> torch.Tensor: """Sample next token from the model given a sequence and a set of conditions. The model supports multiple sampling strategies (greedy sampling, softmax, top-k, top-p...). 
@@ -335,7 +336,8 @@ def _sample_next_token(self,
         B = sequence.shape[0]
         cfg_coef = self.cfg_coef if cfg_coef is None else cfg_coef
         model = self if self._fsdp is None else self._fsdp
-        if self.two_step_cfg and cfg_conditions != {}:
+        two_step_cfg = self.two_step_cfg if two_step_cfg is None else two_step_cfg
+        if two_step_cfg and cfg_conditions != {}:
             assert isinstance(cfg_conditions, tuple), type(cfg_conditions)
             condition_tensors, null_condition_tensors = cfg_conditions
             cond_logits = model(sequence, conditions=[], condition_tensors=condition_tensors)
@@ -493,7 +495,7 @@ def generate(self,
             # sample next token from the model, next token shape is [B, K, 1]
             next_token = self._sample_next_token(
                 curr_sequence, cfg_conditions, unconditional_state, use_sampling, temp, top_k, top_p,
-                cfg_coef=cfg_coef)
+                cfg_coef=cfg_coef, two_step_cfg=two_step_cfg)
             # ensure the tokens that should be masked are properly set to special_token_id
             # as the model never outputs special_token_id
             valid_mask = mask[..., offset:offset+1].expand(B, -1, -1)
diff --git a/audiocraft/models/musicgen.py b/audiocraft/models/musicgen.py
index 557d1196..10312302 100644
--- a/audiocraft/models/musicgen.py
+++ b/audiocraft/models/musicgen.py
@@ -52,6 +52,10 @@ def __init__(self, name: str, compression_model: CompressionModel, lm: LMModel,
         self.name = name
         self.compression_model = compression_model
         self.lm = lm
+        # Just to be safe, let's put everything in eval mode.
+        self.compression_model.eval()
+        self.lm.eval()
+
         if max_duration is None:
             if hasattr(lm, 'cfg'):
                 max_duration = lm.cfg.dataset.segment_duration  # type: ignore
diff --git a/audiocraft/modules/codebooks_patterns.py b/audiocraft/modules/codebooks_patterns.py
index 3cf3bb41..61362588 100644
--- a/audiocraft/modules/codebooks_patterns.py
+++ b/audiocraft/modules/codebooks_patterns.py
@@ -486,9 +486,14 @@ def get_pattern(self, timesteps: int) -> Pattern:
         return Pattern(out, n_q=self.n_q, timesteps=timesteps)
 
 
-class VALLEPattern(CodebooksPatternProvider):
-    """Almost VALL-E style pattern.
-    We further allow some delays for the codebooks other than the first one.
+class CoarseFirstPattern(CodebooksPatternProvider):
+    """First generates all of codebook #1 (i.e. the coarsest one), then the remaining
+    codebooks, potentially with delays.
+
+    ..Warning:: You must always generate the full training duration at test time, for instance
+        30 seconds, as otherwise the fine codebooks will start being generated in an unexpected
+        location. This is due to the non-causality of the remaining codebooks with respect to
+        the first one.
 
     Args:
         n_q (int): Number of codebooks.
diff --git a/audiocraft/modules/conditioners.py b/audiocraft/modules/conditioners.py
index d10ac8dc..046c3482 100644
--- a/audiocraft/modules/conditioners.py
+++ b/audiocraft/modules/conditioners.py
@@ -792,6 +792,8 @@ def __init__(self, dim: int, output_dim: int, device: str, attribute: str,
             import laion_clap  # type: ignore
         except ImportError:
             raise ImportError("Please install CLAP to use the CLAPEmbeddingConditioner: 'pip install laion_clap'")
+        warnings.warn("Sample rate for the CLAP conditioner was fixed in version 1.1.0 (from 44.1 to 48 kHz). "
" + "Please retrain all models.") checkpoint = AudioCraftEnvironment.resolve_reference_path(checkpoint) clap_tokenize = RobertaTokenizer.from_pretrained('roberta-base') clap_model = laion_clap.CLAP_Module(enable_fusion=enable_fusion, amodel=model_arch) diff --git a/audiocraft/optim/fsdp.py b/audiocraft/optim/fsdp.py index b3c1a55b..1090d3d7 100644 --- a/audiocraft/optim/fsdp.py +++ b/audiocraft/optim/fsdp.py @@ -143,8 +143,8 @@ def _name_without_fsdp_prefix(name: str) -> str: new_parts = [part for part in parts if part != FSDP_WRAPPED_MODULE] return '.'.join(new_parts) - def state_dict(self) -> tp.Dict[str, tp.Any]: # type: ignore - state = dict(super().state_dict()) + def state_dict(self, *args, **kwargs) -> tp.Dict[str, tp.Any]: # type: ignore + state = dict(super().state_dict(*args, **kwargs)) for key, value in list(state.items()): if is_sharded_tensor(value): del state[key] diff --git a/audiocraft/solvers/musicgen.py b/audiocraft/solvers/musicgen.py index bb615abf..2439da33 100644 --- a/audiocraft/solvers/musicgen.py +++ b/audiocraft/solvers/musicgen.py @@ -7,6 +7,7 @@ from pathlib import Path import time import typing as tp +import warnings import flashy import math @@ -226,7 +227,6 @@ def _compute_cross_entropy( ce = ce / K return ce, ce_per_codebook - @torch.no_grad() def _prepare_tokens_and_attributes( self, batch: tp.Tuple[torch.Tensor, tp.List[SegmentWithAttributes]], check_synchronization_points: bool = False @@ -243,6 +243,12 @@ def _prepare_tokens_and_attributes( with B the batch size, K the number of codebooks, T_s the token timesteps. Padding mask (torch.Tensor): Mask with valid positions in the tokens tensor, of shape [B, K, T_s]. """ + if self.model.training: + warnings.warn( + "Up to version 1.0.1, the _prepare_tokens_and_attributes was evaluated with `torch.no_grad()`. " + "This is inconsistent with how model were trained in the MusicGen paper. We removed the " + "`torch.no_grad()` in version 1.1.0. Small changes to the final performance are expected. " + "Really sorry about that.") if self._cached_batch_loader is None or self.current_stage != "train": audio, infos = batch audio = audio.to(self.device) @@ -533,7 +539,7 @@ def get_hydrated_conditions(meta: tp.List[SegmentWithAttributes]): rtf = 1. else: gen_unprompted_outputs = self.run_generate_step( - batch, gen_duration=target_duration, prompt_duration=prompt_duration, + batch, gen_duration=target_duration, prompt_duration=None, **self.generation_params) gen_unprompted_audio = gen_unprompted_outputs['gen_audio'].cpu() rtf = gen_unprompted_outputs['rtf'] diff --git a/audiocraft/utils/cache.py b/audiocraft/utils/cache.py index 2fccc0ac..f7f82064 100644 --- a/audiocraft/utils/cache.py +++ b/audiocraft/utils/cache.py @@ -57,7 +57,7 @@ class EmbeddingCache: specify the index corresponding to the current embedding in the object that can represent batch metadata. If not specified, will return the full embedding unmodified. 
""" - def __init__(self, cache_path: tp.Union[Path], device: tp.Union[str, torch.device], + def __init__(self, cache_path: tp.Union[str, Path], device: tp.Union[str, torch.device], compute_embed_fn: tp.Callable[[Path, tp.Any, int], torch.Tensor], extract_embed_fn: tp.Optional[tp.Callable[[torch.Tensor, tp.Any, int], torch.Tensor]] = None): self.cache_path = Path(cache_path) diff --git a/config/conditioner/clapemb2music.yaml b/config/conditioner/clapemb2music.yaml index 8500a826..d44ac774 100644 --- a/config/conditioner/clapemb2music.yaml +++ b/config/conditioner/clapemb2music.yaml @@ -23,7 +23,7 @@ conditioners: checkpoint: //reference/clap/music_audioset_epoch_15_esc_90.14.pt model_arch: 'HTSAT-base' enable_fusion: false - sample_rate: 44100 + sample_rate: 48000 max_audio_length: 10 audio_stride: 1 dim: 512 diff --git a/config/model/lm/audiogen_lm.yaml b/config/model/lm/audiogen_lm.yaml index 696f7462..d17e7a93 100644 --- a/config/model/lm/audiogen_lm.yaml +++ b/config/model/lm/audiogen_lm.yaml @@ -18,7 +18,7 @@ codebooks_pattern: delays: [0, 0, 0, 0] music_lm: group_by: 2 - valle: + coarse_first: delays: [0, 0, 0] transformer_lm: diff --git a/config/model/lm/musicgen_lm.yaml b/config/model/lm/musicgen_lm.yaml index 5bc87a62..be1fbc14 100644 --- a/config/model/lm/musicgen_lm.yaml +++ b/config/model/lm/musicgen_lm.yaml @@ -18,7 +18,7 @@ codebooks_pattern: delays: [0, 0, 0, 0] music_lm: group_by: 2 - valle: + coarse_first: delays: [0, 0, 0] transformer_lm: diff --git a/docs/MUSICGEN.md b/docs/MUSICGEN.md index 606ce858..d76c33a7 100644 --- a/docs/MUSICGEN.md +++ b/docs/MUSICGEN.md @@ -143,6 +143,10 @@ We provide a dummy dataset containing just a few examples for illustrative purpo Please read first the [TRAINING documentation](./TRAINING.md), in particular the Environment Setup section. + +**Warning:** As of version 1.1.0, a few breaking changes were introduced. Check the [CHANGELOG.md](../CHANGELOG.md) +file for more information. You might need to retrain some of your models. + ### Example configurations and grids We provide configurations to reproduce the released models and our research. diff --git a/tests/models/test_musicgen.py b/tests/models/test_musicgen.py index 65618a9e..2b32ac5d 100644 --- a/tests/models/test_musicgen.py +++ b/tests/models/test_musicgen.py @@ -56,3 +56,10 @@ def test_generate_long(self): wav = mg.generate( ['youpi', 'lapin dort']) assert list(wav.shape) == [2, 1, 32000 * 4] + + def test_generate_two_step_cfg(self): + mg = self.get_musicgen() + mg.set_generation_params(duration=2.0, extend_stride=2., two_step_cfg=True) + wav = mg.generate( + ['youpi', 'lapin dort']) + assert list(wav.shape) == [2, 1, 64000] From aa4e633668f4a4efc95de71bfc6ec777405cd380 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alexandre=20D=C3=A9fossez?= Date: Tue, 7 Nov 2023 13:44:37 +0100 Subject: [PATCH 2/3] actual release 1.1.0 (#345) * actual release * test * adding library name --- .github/actions/audiocraft_build/action.yml | 4 ++-- CHANGELOG.md | 2 +- audiocraft/__init__.py | 2 +- audiocraft/models/loaders.py | 5 ++++- 4 files changed, 8 insertions(+), 5 deletions(-) diff --git a/.github/actions/audiocraft_build/action.yml b/.github/actions/audiocraft_build/action.yml index b13c3626..b412cd02 100644 --- a/.github/actions/audiocraft_build/action.yml +++ b/.github/actions/audiocraft_build/action.yml @@ -21,8 +21,8 @@ runs: python3 -m venv env . 
env/bin/activate python -m pip install --upgrade pip - pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu - pip install --pre xformers + pip install torch torchvision torchaudio + pip install xformers pip install -e '.[dev]' - name: System Dependencies shell: bash diff --git a/CHANGELOG.md b/CHANGELOG.md index b8de9f7e..1b6b4c92 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,7 +5,7 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). -## [1.1.0a] - TBD +## [1.1.0] - 2023-11-06 Not using torchaudio anymore when writing audio files, relying instead directly on the commandline ffmpeg. Also not using it anymore for reading audio files, for similar reasons. diff --git a/audiocraft/__init__.py b/audiocraft/__init__.py index 130190f1..251afe7d 100644 --- a/audiocraft/__init__.py +++ b/audiocraft/__init__.py @@ -23,4 +23,4 @@ # flake8: noqa from . import data, modules, models -__version__ = '1.1.0a1' +__version__ = '1.1.0' diff --git a/audiocraft/models/loaders.py b/audiocraft/models/loaders.py index 7fd49d84..f02ba115 100644 --- a/audiocraft/models/loaders.py +++ b/audiocraft/models/loaders.py @@ -27,6 +27,7 @@ from omegaconf import OmegaConf, DictConfig import torch +import audiocraft from . import builders from .encodec import CompressionModel @@ -60,7 +61,9 @@ def _get_state_dict( else: assert filename is not None, "filename needs to be defined if using HF checkpoints" - file = hf_hub_download(repo_id=file_or_url_or_id, filename=filename, cache_dir=cache_dir) + file = hf_hub_download( + repo_id=file_or_url_or_id, filename=filename, cache_dir=cache_dir, + library_name="audiocraft", library_version=audiocraft.__version__) return torch.load(file, map_location=device) From 5905d2ef2526a38d4231096696218b5d4d55752b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alexandre=20D=C3=A9fossez?= Date: Wed, 8 Nov 2023 14:06:58 +0100 Subject: [PATCH 3/3] Release v1.2.0 (#347) * Stereo release (#26) * wip * fix bug in sample storage * type hint * wip * fix bug in fsdp with pytorch 2.0.1 * fix kld * stereo should be good * plop * plop * updating * updates * linter * fix tests * doc and grid for stereo * extra doc * extra docs * removed spurious nograd * changelog * revert back passt change * fix n_quantizers * changes * adding warning * extra warnings and tests * missing changelog * missing changes between audiogen and musicgen * doc * doc * plop * improvements to the demo * updating colab * forgot to update version * nits --- CHANGELOG.md | 4 + audiocraft/__init__.py | 2 +- .../musicgen_stereo_finetune_32khz.py | 57 +++++++ audiocraft/models/builders.py | 11 +- audiocraft/models/encodec.py | 113 ++++++++++++++ audiocraft/models/musicgen.py | 16 +- audiocraft/modules/conditioners.py | 11 +- audiocraft/utils/export_legacy.py | 36 +++-- audiocraft/utils/utils.py | 2 +- config/solver/musicgen/default.yaml | 10 ++ demos/musicgen_app.py | 147 +++++++++++++----- docs/MBD.md | 2 +- docs/MUSICGEN.md | 59 ++++++- model_cards/MUSICGEN_MODEL_CARD.md | 15 ++ 14 files changed, 417 insertions(+), 68 deletions(-) create mode 100644 audiocraft/grids/musicgen/musicgen_stereo_finetune_32khz.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 1b6b4c92..6036b72f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,10 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). 
+## [1.2.0a] - TBD
+
+Adding stereo models.
+
 
 ## [1.1.0] - 2023-11-06
 
diff --git a/audiocraft/__init__.py b/audiocraft/__init__.py
index 251afe7d..8b7acf22 100644
--- a/audiocraft/__init__.py
+++ b/audiocraft/__init__.py
@@ -23,4 +23,4 @@
 # flake8: noqa
 from . import data, modules, models
 
-__version__ = '1.1.0'
+__version__ = '1.2.0a1'
diff --git a/audiocraft/grids/musicgen/musicgen_stereo_finetune_32khz.py b/audiocraft/grids/musicgen/musicgen_stereo_finetune_32khz.py
new file mode 100644
index 00000000..2904e73d
--- /dev/null
+++ b/audiocraft/grids/musicgen/musicgen_stereo_finetune_32khz.py
@@ -0,0 +1,57 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+from pathlib import Path
+from ._explorers import LMExplorer
+from ...environment import AudioCraftEnvironment
+
+
+@LMExplorer
+def explorer(launcher):
+    partitions = AudioCraftEnvironment.get_slurm_partitions(['team', 'global'])
+    launcher.slurm_(gpus=32, partition=partitions)
+    launcher.bind_(solver='musicgen/musicgen_base_32khz')
+    # Replace this with the desired music dataset; it needs to be stereo.
+    launcher.bind_(dset='audio/example')
+
+    fsdp = {'autocast': False, 'fsdp.use': True}
+    medium = {'model/lm/model_scale': 'medium'}
+    large = {'model/lm/model_scale': 'large'}
+
+    cfg_low = {'classifier_free_guidance.training_dropout': 0.2}
+    wd_low = {'conditioners.description.t5.word_dropout': 0.2}
+
+    adam = {'optim.optimizer': 'adamw', 'optim.lr': 1e-4}
+
+    stereo = {
+        'codebooks_pattern.delay.delays': [0, 0, 1, 1, 2, 2, 3, 3],
+        'transformer_lm.n_q': 8,
+        'interleave_stereo_codebooks.use': True,
+        'channels': 2,
+    }
+
+    # You must follow the instructions in docs/MUSICGEN.md about the creation
+    # of the proper fine-tuning checkpoints. We will assume they are stored under
+    # ~/checkpoints/{model_name}.
+
+    checkpoints = Path.home() / 'checkpoints'
+
+    launcher.bind_(fsdp, stereo, {'optim.epochs': 100})
+
+    launcher.slurm_(gpus=32).bind_(label='32gpus')
+    with launcher.job_array():
+        sub = launcher.bind({'continue_from': str(checkpoints / 'stereo_finetune_musicgen-small.th')})
+        sub()
+
+    launcher.slurm_(gpus=64).bind_(label='64gpus')
+    with launcher.job_array():
+        sub = launcher.bind({'continue_from': str(checkpoints / 'stereo_finetune_musicgen-medium.th')})
+        sub(medium, adam)
+
+    launcher.slurm_(gpus=96).bind_(label='96gpus')
+    with launcher.job_array():
+        sub = launcher.bind({'continue_from': str(checkpoints / 'stereo_finetune_musicgen-large.th')})
+        sub(large, cfg_low, wd_low, adam, {'optim.max_norm': 3})
diff --git a/audiocraft/models/builders.py b/audiocraft/models/builders.py
index 272ce1b3..b7144874 100644
--- a/audiocraft/models/builders.py
+++ b/audiocraft/models/builders.py
@@ -15,7 +15,7 @@
 import omegaconf
 import torch
 
-from .encodec import CompressionModel, EncodecModel
+from .encodec import CompressionModel, EncodecModel, InterleaveStereoCompressionModel
 from .lm import LMModel
 from ..modules.codebooks_patterns import (
     CodebooksPatternProvider,
@@ -247,5 +247,12 @@ def get_debug_lm_model(device='cpu'):
 def get_wrapped_compression_model(
         compression_model: CompressionModel,
         cfg: omegaconf.DictConfig) -> CompressionModel:
-    # more to come.
+    if hasattr(cfg, 'interleave_stereo_codebooks'):
+        if cfg.interleave_stereo_codebooks.use:
+            kwargs = dict_from_config(cfg.interleave_stereo_codebooks)
+            kwargs.pop('use')
+            compression_model = InterleaveStereoCompressionModel(compression_model, **kwargs)
+    if hasattr(cfg, 'compression_model_n_q'):
+        if cfg.compression_model_n_q is not None:
+            compression_model.set_num_codebooks(cfg.compression_model_n_q)
     return compression_model
diff --git a/audiocraft/models/encodec.py b/audiocraft/models/encodec.py
index a3bd9274..d4e77a94 100644
--- a/audiocraft/models/encodec.py
+++ b/audiocraft/models/encodec.py
@@ -13,6 +13,7 @@
 from pathlib import Path
 import typing as tp
 
+from einops import rearrange
 import numpy as np
 import torch
 from torch import nn
@@ -391,3 +392,115 @@ def set_num_codebooks(self, n: int):
         if n not in self.possible_num_codebooks:
             raise ValueError(f"Allowed values for num codebooks: {self.possible_num_codebooks}")
         self._num_codebooks = n
+
+
+class InterleaveStereoCompressionModel(CompressionModel):
+    """Wraps a CompressionModel to support stereo inputs. The wrapped model
+    will be applied independently to the left and right channels, and both codebook
+    streams will be interleaved. If the wrapped model returns a representation `[B, K, T]` per
+    channel, then the output will be `[B, K * 2, T]` or `[B, K, T * 2]` depending on
+    `per_timestep`.
+
+    Args:
+        model (CompressionModel): Compression model to wrap.
+        per_timestep (bool): Whether to interleave on the timestep dimension
+            or on the codebooks dimension.
+    """
+    def __init__(self, model: CompressionModel, per_timestep: bool = False):
+        super().__init__()
+        self.model = model
+        self.per_timestep = per_timestep
+        assert self.model.channels == 1, "Wrapped model is expected to be for monophonic audio"
+
+    @property
+    def total_codebooks(self):
+        return self.model.total_codebooks
+
+    @property
+    def num_codebooks(self):
+        """Active number of codebooks used by the quantizer.
+
+        ..Warning:: this reports the number of codebooks after the interleaving
+            of the codebooks!
+        """
+        return self.model.num_codebooks if self.per_timestep else self.model.num_codebooks * 2
+
+    def set_num_codebooks(self, n: int):
+        """Set the active number of codebooks used by the quantizer.
+
+        ..Warning:: this sets the number of codebooks before the interleaving!
+        """
+        self.model.set_num_codebooks(n)
+
+    @property
+    def num_virtual_steps(self) -> float:
+        """Return the number of virtual steps, i.e. how many steps
+        one real step will be split into.
+        """
+        return 2 if self.per_timestep else 1
+
+    @property
+    def frame_rate(self) -> float:
+        return self.model.frame_rate * self.num_virtual_steps
+
+    @property
+    def sample_rate(self) -> int:
+        return self.model.sample_rate
+
+    @property
+    def channels(self) -> int:
+        return 2
+
+    @property
+    def cardinality(self):
+        """Cardinality of each codebook.
+ """ + return self.model.cardinality + + def forward(self, x: torch.Tensor) -> qt.QuantizedResult: + raise NotImplementedError("Not supported, use encode and decode.") + + def encode(self, x: torch.Tensor) -> tp.Tuple[torch.Tensor, tp.Optional[torch.Tensor]]: + B, C, T = x.shape + assert C == self.channels, f"Expecting stereo audio but audio num channels is {C}" + + indices_c0, scales_c0 = self.model.encode(x[:, 0, ...].unsqueeze(1)) + indices_c1, scales_c1 = self.model.encode(x[:, 1, ...].unsqueeze(1)) + indices = torch.stack([indices_c0, indices_c1], dim=0) + scales: tp.Optional[torch.Tensor] = None + if scales_c0 is not None and scales_c1 is not None: + scales = torch.stack([scales_c0, scales_c1], dim=1) + + if self.per_timestep: + indices = rearrange(indices, 'c b k t -> b k (t c)', c=2) + else: + indices = rearrange(indices, 'c b k t -> b (k c) t', c=2) + + return (indices, scales) + + def get_left_right_codes(self, codes: torch.Tensor) -> tp.Tuple[torch.Tensor, torch.Tensor]: + if self.per_timestep: + codes = rearrange(codes, 'b k (t c) -> c b k t', c=2) + else: + codes = rearrange(codes, 'b (k c) t -> c b k t', c=2) + return codes[0], codes[1] + + def decode(self, codes: torch.Tensor, scale: tp.Optional[torch.Tensor] = None): + B, K, T = codes.shape + assert T % self.num_virtual_steps == 0, "Provided codes' number of timesteps does not match" + assert K == self.num_codebooks, "Provided codes' number of codebooks does not match" + + scale_c0, scale_c1 = None, None + if scale is not None: + assert scale.size(0) == B and scale.size(1) == 2, f"Scale has unexpected shape: {scale.shape}" + scale_c0 = scale[0, ...] + scale_c1 = scale[1, ...] + + codes_c0, codes_c1 = self.get_left_right_codes(codes) + audio_c0 = self.model.decode(codes_c0, scale_c0) + audio_c1 = self.model.decode(codes_c1, scale_c1) + return torch.cat([audio_c0, audio_c1], dim=1) + + def decode_latent(self, codes: torch.Tensor): + """Decode from the discrete codes to continuous latent space.""" + raise NotImplementedError("Not supported by interleaved stereo wrapped models.") diff --git a/audiocraft/models/musicgen.py b/audiocraft/models/musicgen.py index 10312302..88ee13b6 100644 --- a/audiocraft/models/musicgen.py +++ b/audiocraft/models/musicgen.py @@ -12,11 +12,12 @@ import typing as tp import warnings +import omegaconf import torch from .encodec import CompressionModel from .lm import LMModel -from .builders import get_debug_compression_model, get_debug_lm_model +from .builders import get_debug_compression_model, get_debug_lm_model, get_wrapped_compression_model from .loaders import load_compression_model, load_lm_model from ..data.audio_utils import convert_audio from ..modules.conditioners import ConditioningAttributes, WavCondition @@ -52,18 +53,28 @@ def __init__(self, name: str, compression_model: CompressionModel, lm: LMModel, self.name = name self.compression_model = compression_model self.lm = lm + self.cfg: tp.Optional[omegaconf.DictConfig] = None # Just to be safe, let's put everything in eval mode. 
         self.compression_model.eval()
         self.lm.eval()
 
+        if hasattr(lm, 'cfg'):
+            cfg = lm.cfg
+            assert isinstance(cfg, omegaconf.DictConfig)
+            self.cfg = cfg
+
+        if self.cfg is not None:
+            self.compression_model = get_wrapped_compression_model(self.compression_model, self.cfg)
+
         if max_duration is None:
-            if hasattr(lm, 'cfg'):
+            if self.cfg is not None:
                 max_duration = lm.cfg.dataset.segment_duration  # type: ignore
             else:
                 raise ValueError("You must provide max_duration when building MusicGen directly")
         assert max_duration is not None
         self.max_duration: float = max_duration
         self.device = next(iter(lm.parameters())).device
+        self.generation_params: dict = {}
         self.set_generation_params(duration=15)  # 15 seconds by default
         self._progress_callback: tp.Optional[tp.Callable[[int, int], None]] = None
@@ -122,6 +133,7 @@ def get_pretrained(name: str = 'facebook/musicgen-melody', device=None):
         compression_model = load_compression_model(name, device=device)
         if 'self_wav' in lm.condition_provider.conditioners:
             lm.condition_provider.conditioners['self_wav'].match_len_on_eval = True
+            lm.condition_provider.conditioners['self_wav']._use_masking = False
         return MusicGen(name, compression_model, lm)
diff --git a/audiocraft/modules/conditioners.py b/audiocraft/modules/conditioners.py
index 046c3482..178957d1 100644
--- a/audiocraft/modules/conditioners.py
+++ b/audiocraft/modules/conditioners.py
@@ -469,6 +469,8 @@ class WaveformConditioner(BaseConditioner):
     def __init__(self, dim: int, output_dim: int, device: tp.Union[torch.device, str]):
         super().__init__(dim, output_dim)
         self.device = device
+        # If False, no masking is done; used in ChromaStemConditioner when completing a sample by periodicity.
+        self._use_masking = True
 
     def tokenize(self, x: WavCondition) -> WavCondition:
         wav, length, sample_rate, path, seek_time = x
@@ -496,13 +498,12 @@ def forward(self, x: WavCondition) -> ConditionType:
         embeds = embeds.to(self.output_proj.weight)
         embeds = self.output_proj(embeds)
 
-        if lengths is not None:
+        if lengths is not None and self._use_masking:
             lengths = lengths / self._downsampling_factor()
             mask = length_to_mask(lengths, max_len=embeds.shape[1]).int()  # type: ignore
         else:
-            mask = torch.ones_like(embeds)
-        embeds = (embeds * mask.unsqueeze(2).to(self.device))
-
+            mask = torch.ones_like(embeds[..., 0])
+        embeds = (embeds * mask.unsqueeze(-1))
         return embeds, mask
 
 
@@ -537,6 +538,8 @@ def __init__(self, output_dim: int, sample_rate: int, n_chroma: int, radix2_exp:
         self.autocast = TorchAutocast(enabled=device != 'cpu', device_type=self.device, dtype=torch.float32)
         self.sample_rate = sample_rate
         self.match_len_on_eval = match_len_on_eval
+        if match_len_on_eval:
+            self._use_masking = False
         self.duration = duration
         self.__dict__['demucs'] = pretrained.get_model('htdemucs').to(device)
         stem_sources: list = self.demucs.sources  # type: ignore
diff --git a/audiocraft/utils/export_legacy.py b/audiocraft/utils/export_legacy.py
index 52f145f3..367c3f3c 100644
--- a/audiocraft/utils/export_legacy.py
+++ b/audiocraft/utils/export_legacy.py
@@ -14,13 +14,21 @@
 from omegaconf import OmegaConf, DictConfig
 import torch
 
+from audiocraft import __version__
+
 
 def _clean_lm_cfg(cfg: DictConfig):
     OmegaConf.set_struct(cfg, False)
     # This used to be set automatically in the LM solver; we need a more robust solution
     # for the future.
     cfg['transformer_lm']['card'] = 2048
-    cfg['transformer_lm']['n_q'] = 4
+    n_q = 4
+    stereo_cfg = getattr(cfg, 'interleave_stereo_codebooks', None)
+    if stereo_cfg is not None and stereo_cfg.use:
+        if 'downsample' in stereo_cfg:
+            del stereo_cfg['downsample']
+        n_q = 8
+    cfg['transformer_lm']['n_q'] = n_q
     # Experimental params no longer supported.
     bad_params = ['spectral_norm_attn_iters', 'spectral_norm_ff_iters',
                   'residual_balancer_attn', 'residual_balancer_ff', 'layer_drop']
@@ -30,27 +38,33 @@ def _clean_lm_cfg(cfg: DictConfig):
     return cfg
 
 
-def export_encodec(checkpoint_path: tp.Union[Path, str], out_folder: tp.Union[Path, str]):
-    sig = Path(checkpoint_path).parent.name
-    assert len(sig) == 8, "Not a valid Dora signature"
+def export_encodec(checkpoint_path: tp.Union[Path, str], out_file: tp.Union[Path, str]):
     pkg = torch.load(checkpoint_path, 'cpu')
     new_pkg = {
         'best_state': pkg['ema']['state']['model'],
         'xp.cfg': OmegaConf.to_yaml(pkg['xp.cfg']),
+        # The following params were NOT exported for the first release of MusicGen.
+        'version': __version__,
+        'exported': True,
     }
-    out_file = Path(out_folder) / f'{sig}.th'
+    Path(out_file).parent.mkdir(exist_ok=True, parents=True)
     torch.save(new_pkg, out_file)
     return out_file
 
 
-def export_lm(checkpoint_path: tp.Union[Path, str], out_folder: tp.Union[Path, str]):
-    sig = Path(checkpoint_path).parent.name
-    assert len(sig) == 8, "Not a valid Dora signature"
+def export_lm(checkpoint_path: tp.Union[Path, str], out_file: tp.Union[Path, str]):
     pkg = torch.load(checkpoint_path, 'cpu')
+    if pkg['fsdp_best_state']:
+        best_state = pkg['fsdp_best_state']['model']
+    else:
+        best_state = pkg['best_state']['model']
     new_pkg = {
-        'best_state': pkg['fsdp_best_state']['model'],
-        'xp.cfg': OmegaConf.to_yaml(_clean_lm_cfg(pkg['xp.cfg']))
+        'best_state': best_state,
+        'xp.cfg': OmegaConf.to_yaml(_clean_lm_cfg(pkg['xp.cfg'])),
+        # The following params were NOT exported for the first release of MusicGen.
+        'version': __version__,
+        'exported': True,
     }
-    out_file = Path(out_folder) / f'{sig}.th'
+    Path(out_file).parent.mkdir(exist_ok=True, parents=True)
     torch.save(new_pkg, out_file)
     return out_file
diff --git a/audiocraft/utils/utils.py b/audiocraft/utils/utils.py
index 3135d70e..2c5799f8 100644
--- a/audiocraft/utils/utils.py
+++ b/audiocraft/utils/utils.py
@@ -185,7 +185,7 @@ def length_to_mask(lengths: torch.Tensor, max_len: tp.Optional[int] = None) -> t
     assert len(lengths.shape) == 1, "Length shape should be 1 dimensional."
     final_length = lengths.max().item() if not max_len else max_len
     final_length = max(final_length, 1)  # if all seqs are of len zero we don't want a zero-size tensor
-    return torch.arange(final_length)[None, :].to(lengths.device) < lengths[:, None]
+    return torch.arange(final_length, device=lengths.device)[None, :] < lengths[:, None]
 
 
 def hash_trick(word: str, vocab_size: int) -> int:
diff --git a/config/solver/musicgen/default.yaml b/config/solver/musicgen/default.yaml
index 59e01137..8bdf9c74 100644
--- a/config/solver/musicgen/default.yaml
+++ b/config/solver/musicgen/default.yaml
@@ -14,10 +14,20 @@ solver: musicgen
 sample_rate: ???
 channels: ???
 compression_model_checkpoint: ???
+# The following sets the number of codebooks on the underlying
+# compression model. This might differ from the actual value of n_q
+# given to the transformer when the model output is postprocessed, for instance
+# for stereo channels. If not provided, the default value for the compression model
+# will be used.
+compression_model_n_q: null tokens: padding_with_special_token: false +interleave_stereo_codebooks: + use: false + per_timestep: false + cache: path: write: false diff --git a/demos/musicgen_app.py b/demos/musicgen_app.py index 74c893e7..a10d52b5 100644 --- a/demos/musicgen_app.py +++ b/demos/musicgen_app.py @@ -9,24 +9,29 @@ import argparse from concurrent.futures import ProcessPoolExecutor +import logging import os from pathlib import Path import subprocess as sp +import sys from tempfile import NamedTemporaryFile import time import typing as tp import warnings +from einops import rearrange import torch import gradio as gr from audiocraft.data.audio_utils import convert_audio from audiocraft.data.audio import audio_write +from audiocraft.models.encodec import InterleaveStereoCompressionModel from audiocraft.models import MusicGen, MultiBandDiffusion MODEL = None # Last used model -IS_BATCHED = "facebook/MusicGen" in os.environ.get('SPACE_ID', '') +SPACE_ID = os.environ.get('SPACE_ID', '') +IS_BATCHED = "facebook/MusicGen" in SPACE_ID or 'musicgen-internal/musicgen_dev' in SPACE_ID print(IS_BATCHED) MAX_BATCH_SIZE = 12 BATCHED_DURATION = 15 @@ -91,6 +96,7 @@ def load_model(version='facebook/musicgen-melody'): global MODEL print("Loading model", version) if MODEL is None or MODEL.name != version: + MODEL = None # in case loading would crash MODEL = MusicGen.get_pretrained(version) @@ -101,7 +107,7 @@ def load_diffusion(): MBD = MultiBandDiffusion.get_mbd_musicgen() -def _do_predictions(texts, melodies, duration, progress=False, **gen_kwargs): +def _do_predictions(texts, melodies, duration, progress=False, gradio_progress=None, **gen_kwargs): MODEL.set_generation_params(duration=duration, **gen_kwargs) print("new batch", len(texts), texts, [None if m is None else (m[0], m[1].shape) for m in melodies]) be = time.time() @@ -119,18 +125,30 @@ def _do_predictions(texts, melodies, duration, progress=False, **gen_kwargs): melody = convert_audio(melody, sr, target_sr, target_ac) processed_melodies.append(melody) - if any(m is not None for m in processed_melodies): - outputs = MODEL.generate_with_chroma( - descriptions=texts, - melody_wavs=processed_melodies, - melody_sample_rate=target_sr, - progress=progress, - return_tokens=USE_DIFFUSION - ) - else: - outputs = MODEL.generate(texts, progress=progress, return_tokens=USE_DIFFUSION) + try: + if any(m is not None for m in processed_melodies): + outputs = MODEL.generate_with_chroma( + descriptions=texts, + melody_wavs=processed_melodies, + melody_sample_rate=target_sr, + progress=progress, + return_tokens=USE_DIFFUSION + ) + else: + outputs = MODEL.generate(texts, progress=progress, return_tokens=USE_DIFFUSION) + except RuntimeError as e: + raise gr.Error("Error while generating " + e.args[0]) if USE_DIFFUSION: - outputs_diffusion = MBD.tokens_to_wav(outputs[1]) + if gradio_progress is not None: + gradio_progress(1, desc='Running MultiBandDiffusion...') + tokens = outputs[1] + if isinstance(MODEL.compression_model, InterleaveStereoCompressionModel): + left, right = MODEL.compression_model.get_left_right_codes(tokens) + tokens = torch.cat([left, right]) + outputs_diffusion = MBD.tokens_to_wav(tokens) + if isinstance(MODEL.compression_model, InterleaveStereoCompressionModel): + assert outputs_diffusion.shape[1] == 1 # output is mono + outputs_diffusion = rearrange(outputs_diffusion, '(s b) c t -> b (s c) t', s=2) outputs = torch.cat([outputs[0], outputs_diffusion], dim=0) outputs = outputs.detach().cpu().float() pending_videos = [] @@ -154,15 +172,24 
@@ def _do_predictions(texts, melodies, duration, progress=False, **gen_kwargs): def predict_batched(texts, melodies): max_text_length = 512 texts = [text[:max_text_length] for text in texts] - load_model('facebook/musicgen-melody') + load_model('facebook/musicgen-stereo-melody') res = _do_predictions(texts, melodies, BATCHED_DURATION) return res -def predict_full(model, decoder, text, melody, duration, topk, topp, temperature, cfg_coef, progress=gr.Progress()): +def predict_full(model, model_path, decoder, text, melody, duration, topk, topp, temperature, cfg_coef, progress=gr.Progress()): global INTERRUPTING global USE_DIFFUSION INTERRUPTING = False + progress(0, desc="Loading model...") + model_path = model_path.strip() + if model_path: + if not Path(model_path).exists(): + raise gr.Error(f"Model path {model_path} doesn't exist.") + if not Path(model_path).is_dir(): + raise gr.Error(f"Model path {model_path} must be a folder containing " + "state_dict.bin and compression_state_dict_.bin.") + model = model_path if temperature < 0: raise gr.Error("Temperature must be >= 0.") if topk < 0: @@ -173,20 +200,26 @@ def predict_full(model, decoder, text, melody, duration, topk, topp, temperature topk = int(topk) if decoder == "MultiBand_Diffusion": USE_DIFFUSION = True + progress(0, desc="Loading diffusion model...") load_diffusion() else: USE_DIFFUSION = False load_model(model) + max_generated = 0 + def _progress(generated, to_generate): - progress((min(generated, to_generate), to_generate)) + nonlocal max_generated + max_generated = max(generated, max_generated) + progress((min(max_generated, to_generate), to_generate)) if INTERRUPTING: raise gr.Error("Interrupted.") MODEL.set_custom_progress_callback(_progress) videos, wavs = _do_predictions( [text], [melody], duration, progress=True, - top_k=topk, top_p=topp, temperature=temperature, cfg_coef=cfg_coef) + top_k=topk, top_p=topp, temperature=temperature, cfg_coef=cfg_coef, + gradio_progress=progress) if USE_DIFFUSION: return videos[0], wavs[0], videos[1], wavs[1] return videos[0], wavs[0], None, None @@ -231,8 +264,12 @@ def ui_full(launch_kwargs): _ = gr.Button("Interrupt").click(fn=interrupt, queue=False) with gr.Row(): model = gr.Radio(["facebook/musicgen-melody", "facebook/musicgen-medium", "facebook/musicgen-small", - "facebook/musicgen-large"], - label="Model", value="facebook/musicgen-melody", interactive=True) + "facebook/musicgen-large", "facebook/musicgen-melody-large", + "facebook/musicgen-stereo-small", "facebook/musicgen-stereo-medium", + "facebook/musicgen-stereo-melody", "facebook/musicgen-stereo-large", + "facebook/musicgen-stereo-melody-large"], + label="Model", value="facebook/musicgen-stereo-melody", interactive=True) + model_path = gr.Text(label="Model Path (custom models)") with gr.Row(): decoder = gr.Radio(["Default", "MultiBand_Diffusion"], label="Decoder", value="Default", interactive=True) @@ -249,7 +286,7 @@ def ui_full(launch_kwargs): diffusion_output = gr.Video(label="MultiBand Diffusion Decoder") audio_diffusion = gr.Audio(label="MultiBand Diffusion Decoder (wav)", type='filepath') submit.click(toggle_diffusion, decoder, [diffusion_output, audio_diffusion], queue=False, - show_progress=False).then(predict_full, inputs=[model, decoder, text, melody, duration, topk, topp, + show_progress=False).then(predict_full, inputs=[model, model_path, decoder, text, melody, duration, topk, topp, temperature, cfg_coef], outputs=[output, audio_output, diffusion_output, audio_diffusion]) radio.change(toggle_audio_src, radio, 
[melody], queue=False, show_progress=False)
@@ -260,37 +297,37 @@
             [
                 "An 80s driving pop song with heavy drums and synth pads in the background",
                 "./assets/bach.mp3",
-                "facebook/musicgen-melody",
+                "facebook/musicgen-stereo-melody",
                 "Default"
             ],
             [
                 "A cheerful country song with acoustic guitars",
                 "./assets/bolero_ravel.mp3",
-                "facebook/musicgen-melody",
+                "facebook/musicgen-stereo-melody",
                 "Default"
             ],
             [
                 "90s rock song with electric guitar and heavy drums",
                 None,
-                "facebook/musicgen-medium",
+                "facebook/musicgen-stereo-medium",
                 "Default"
             ],
             [
                 "a light and cheerful EDM track, with syncopated drums, airy pads, and strong emotions",
                 "./assets/bach.mp3",
-                "facebook/musicgen-melody",
+                "facebook/musicgen-stereo-melody",
                 "Default"
             ],
             [
                 "lofi slow bpm electro chill with organic samples",
                 None,
-                "facebook/musicgen-medium",
+                "facebook/musicgen-stereo-medium",
                 "Default"
            ],
            [
                 "Punk rock with loud drum and power guitar",
                 None,
-                "facebook/musicgen-medium",
+                "facebook/musicgen-stereo-medium",
                 "MultiBand_Diffusion"
             ],
         ],
@@ -302,8 +339,18 @@ def ui_full(launch_kwargs):
         ### More details
 
         The model will generate a short music extract based on the description you provided.
-        The model can generate up to 30 seconds of audio in one pass. It is now possible
-        to extend the generation by feeding back the end of the previous chunk of audio.
+        The model can generate up to 30 seconds of audio in one pass.
+
+        The model was trained with descriptions from a stock music catalog. Descriptions that work best
+        include some level of detail on the instruments present, along with an intended use case
+        (e.g. adding "perfect for a commercial" can somehow help).
+
+        Using one of the `melody` models (e.g. `musicgen-melody-*`), you can optionally provide reference audio
+        from which a broad melody will be extracted.
+        The model will then try to follow both the description and melody provided.
+        For best results, the melody should be 30 seconds long (I know, the samples we provide are not...)
+
+        It is now possible to extend the generation by feeding back the end of the previous chunk of audio.
         This can take a long time, and the model might lose consistency. The model might also
         decide at arbitrary positions that the song ends.
 
@@ -311,23 +358,23 @@ def ui_full(launch_kwargs):
         An overlap of 12 seconds is kept with the previously generated chunk, and 18 "new" seconds
         are generated each time.
 
-        We present 4 model variations:
+        We present 10 model variations:
         1. facebook/musicgen-melody -- a music generation model capable of generating music conditioned on text and melody inputs. **Note**, you can also use text only.
         2. facebook/musicgen-small -- a 300M transformer decoder conditioned on text only.
         3. facebook/musicgen-medium -- a 1.5B transformer decoder conditioned on text only.
         4. facebook/musicgen-large -- a 3.3B transformer decoder conditioned on text only.
+        5. facebook/musicgen-melody-large -- a 3.3B transformer decoder conditioned on text and melody.
+        6. facebook/musicgen-stereo-*: same as the previous models but fine-tuned to output stereo audio.
 
         We also present two ways of decoding the audio tokens:
-        1. Use the default GAN based compression model
-        2. Use MultiBand Diffusion from (paper linknano )
+        1. Use the default GAN-based compression model. It can suffer from artifacts, especially
+           for crashes, snares, etc.
+        2. Use [MultiBand Diffusion](https://arxiv.org/abs/2308.02560). This should improve the audio quality,
+           at an extra computational cost. When this is selected, we provide both the GAN-based decoded
+           audio and the one obtained with MBD.
 
-        When using `facebook/musicgen-melody`, you can optionally provide a reference audio from
-        which a broad melody will be extracted. The model will then try to follow both
-        the description and melody provided.
-
-        You can also use your own GPU or a Google Colab by following the instructions on our repo.
-        See [github.com/facebookresearch/audiocraft](https://github.com/facebookresearch/audiocraft)
+        See [github.com/facebookresearch/audiocraft](https://github.com/facebookresearch/audiocraft/blob/main/docs/MUSICGEN.md)
         for more details.
         """
     )
@@ -341,7 +388,7 @@ def ui_batched(launch_kwargs):
         """
         # MusicGen
 
-        This is the demo for [MusicGen](https://github.com/facebookresearch/audiocraft),
+        This is the demo for [MusicGen](https://github.com/facebookresearch/audiocraft/blob/main/docs/MUSICGEN.md),
         a simple and controllable model for music generation
         presented at: ["Simple and Controllable Music Generation"](https://huggingface.co/papers/2306.05284).
@@ -399,15 +446,27 @@ def ui_batched(launch_kwargs):
         gr.Markdown("""
         ### More details
 
-        The model will generate 12 seconds of audio based on the description you provided.
+        The model will generate 15 seconds of audio based on the description you provided.
+        The model was trained with descriptions from a stock music catalog. Descriptions that work best
+        include some level of detail on the instruments present, along with an intended use case
+        (e.g. adding "perfect for a commercial" can somehow help).
+
         You can optionally provide a reference audio from which a broad melody will be extracted.
         The model will then try to follow both the description and melody provided.
-        All samples are generated with the `melody` model.
+        For best results, the melody should be 30 seconds long (I know, the samples we provide are not...)
 
-        You can also use your own GPU or a Google Colab by following the instructions on our repo.
+        You can access more control (longer generation, more models, etc.) by clicking
+        the "Duplicate Space" button (you will then need a paid GPU from HuggingFace).
+        If you have a GPU, you can run the gradio demo locally (click the link to our repo below for more info).
+        Finally, you can get a GPU for free from Google
+        and run the demo in [a Google Colab](https://ai.honu.io/red/musicgen-colab).
 
-        See [github.com/facebookresearch/audiocraft](https://github.com/facebookresearch/audiocraft)
-        for more details.
+        See [github.com/facebookresearch/audiocraft](https://github.com/facebookresearch/audiocraft/blob/main/docs/MUSICGEN.md)
+        for more details. All samples are generated with the `stereo-melody` model.
         """)
 
     demo.queue(max_size=8 * 4).launch(**launch_kwargs)
@@ -454,6 +513,8 @@ def ui_batched(launch_kwargs):
     if args.share:
         launch_kwargs['share'] = args.share
 
+    logging.basicConfig(level=logging.INFO, stream=sys.stderr)
+
     # Show the interface
     if IS_BATCHED:
         global USE_DIFFUSION
diff --git a/docs/MBD.md b/docs/MBD.md
index 4288a89d..b6629184 100644
--- a/docs/MBD.md
+++ b/docs/MBD.md
@@ -113,5 +113,5 @@ Learn more about AudioCraft training pipelines in the [dedicated section](./TRAI
 
 See license information in the [README](../README.md).
 
-[arxiv]: https://dl.fbaipublicfiles.com/encodec/Diffusion/paper.pdf
+[arxiv]: https://arxiv.org/abs/2308.02560
 [mbd_samples]: https://ai.honu.io/papers/mbd/
diff --git a/docs/MUSICGEN.md b/docs/MUSICGEN.md
index d76c33a7..fb12e324 100644
--- a/docs/MUSICGEN.md
+++ b/docs/MUSICGEN.md
@@ -9,7 +9,7 @@ a small delay between the codebooks, we show we can predict them in parallel, thus having only 50 auto-regressive
 steps per second of audio.
 Check out our [sample page][musicgen_samples] or test the available demo!
 
-<a target="_blank" href="https://colab.research.google.com/drive/1JlTOjB-G0A2Hz3h8PK63vLZk4xdCI5QB?usp=sharing">
+<a target="_blank" href="https://ai.honu.io/red/musicgen-colab">
   <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab">
 </a>
@@ -38,7 +38,7 @@ We offer a number of ways to interact with MusicGen:
 1. A demo is also available on the [`facebook/MusicGen` Hugging Face Space](https://huggingface.co/spaces/facebook/MusicGen)
 (huge thanks to all the HF team for their support).
 2. You can run the extended demo on a Colab:
-[colab notebook](https://colab.research.google.com/drive/1JlTOjB-G0A2Hz3h8PK63vLZk4xdCI5QB?usp=sharing)
+[colab notebook](https://ai.honu.io/red/musicgen-colab)
 3. You can use the gradio demo locally by running [`python -m demos.musicgen_app --share`](../demos/musicgen_app.py).
 4. You can play with MusicGen by running the jupyter notebook at [`demos/musicgen_demo.ipynb`](../demos/musicgen_demo.ipynb) locally (if you have a GPU).
 5.
Finally, check out [@camenduru Colab page](https://github.com/camenduru/MusicGen-colab)
@@ -47,11 +47,18 @@ which is regularly updated with contributions from @camenduru and the community.
 
 ## API
 
-We provide a simple API and 4 pre-trained models. The pre trained models are:
+We provide a simple API and 10 pre-trained models. The pre-trained models are:
 - `facebook/musicgen-small`: 300M model, text to music only - [🤗 Hub](https://huggingface.co/facebook/musicgen-small)
 - `facebook/musicgen-medium`: 1.5B model, text to music only - [🤗 Hub](https://huggingface.co/facebook/musicgen-medium)
 - `facebook/musicgen-melody`: 1.5B model, text to music and text+melody to music - [🤗 Hub](https://huggingface.co/facebook/musicgen-melody)
 - `facebook/musicgen-large`: 3.3B model, text to music only - [🤗 Hub](https://huggingface.co/facebook/musicgen-large)
+- `facebook/musicgen-melody-large`: 3.3B model, text to music and text+melody to music - [🤗 Hub](https://huggingface.co/facebook/musicgen-melody-large)
+- `facebook/musicgen-stereo-*`: All the previous models fine-tuned for stereo generation -
+    [small](https://huggingface.co/facebook/musicgen-stereo-small),
+    [medium](https://huggingface.co/facebook/musicgen-stereo-medium),
+    [large](https://huggingface.co/facebook/musicgen-stereo-large),
+    [melody](https://huggingface.co/facebook/musicgen-stereo-melody),
+    [melody large](https://huggingface.co/facebook/musicgen-stereo-melody-large).
 
 We observe the best trade-off between quality and compute with the `facebook/musicgen-medium` or `facebook/musicgen-melody` model.
 In order to use MusicGen locally **you must have a GPU**. We recommend 16GB of memory, but smaller
@@ -209,6 +216,19 @@ dora run solver=musicgen/debug \
 **Warning:** You are responsible for setting the proper value for `transformer_lm.n_q` and `transformer_lm.card` (cardinality of the codebooks). You also have to update the codebook_pattern to match `n_q` as shown in the example for using DAC.
 
+### Training stereo models
+
+Set the option `interleave_stereo_codebooks.use` to `True` to activate stereo training, along with `channels=2`. Left and right channels will be
+encoded separately by the compression model, then their codebooks will be interleaved, e.g. the codebook order becomes
+`[1_L, 1_R, 2_L, 2_R, ...]`. You will also need to update the delays for the codebook patterns to match the number of codebooks, and the `n_q` value passed to the transformer LM:
+```
+dora run solver=musicgen/debug \
+    compression_model_checkpoint=//pretrained/facebook/encodec_32khz \
+    channels=2 interleave_stereo_codebooks.use=True \
+    transformer_lm.n_q=8 transformer_lm.card=2048 \
+    codebooks_pattern.delay.delays='[0, 0, 1, 1, 2, 2, 3, 3]'
+```
+
 ### Fine tuning existing models
 
 You can initialize your model to one of the pretrained models by using the `continue_from` argument, in particular
@@ -232,6 +252,39 @@ dora run solver=musicgen/musicgen_base_32khz model/lm/model_scale=medium continu
 If you decide to do so, make sure your checkpoint is saved with `torch.save` and contains a dict
 `{'best_state': {'model': model_state_dict_here}}`. Directly give the path to `continue_from` without a `//pretrained/` prefix.
 
+#### Fine tuning a mono model to stereo
+
+You will not be able to `continue_from` a mono model with stereo training, as the shape of the embeddings and output linears
+would not match. You can use the following snippet to prepare a proper fine-tuning checkpoint.
+
+```python
+from pathlib import Path
+import torch
+
+# Download the pretrained model, e.g. from
+# https://huggingface.co/facebook/musicgen-melody/blob/main/state_dict.bin
+
+model_name = 'musicgen-melody'
+root = Path.home() / 'checkpoints'
+# You are responsible for downloading the following checkpoint to the proper location.
+input_state_dict_path = root / model_name / 'state_dict.bin'
+state = torch.load(input_state_dict_path, 'cpu')
+bs = state['best_state']
+# There is a slight difference in format between training checkpoints and exported public checkpoints.
+# If you want to use your own mono models from one of your training checkpoints, follow the instructions
+# for exporting a model explained later on this page.
+assert 'model' not in bs, 'The following code is for using an exported pretrained model'
+nbs = dict(bs)
+for k in range(8):
+    # We will just copy the mono embeddings and linears twice, once for the left and once for the right channel.
+    nbs[f'linears.{k}.weight'] = bs[f'linears.{k//2}.weight']
+    nbs[f'emb.{k}.weight'] = bs[f'emb.{k//2}.weight']
+torch.save({'best_state': {'model': nbs}}, root / f'stereo_finetune_{model_name}.th')
+```
+
+Now, you can use `$HOME/checkpoints/stereo_finetune_musicgen-melody.th` as a `continue_from` target (without a `//pretrained` prefix!).
+
 ### Caching of EnCodec tokens
 
 It is possible to precompute the EnCodec tokens and other metadata.
diff --git a/model_cards/MUSICGEN_MODEL_CARD.md b/model_cards/MUSICGEN_MODEL_CARD.md
index 95431368..68e81d44 100644
--- a/model_cards/MUSICGEN_MODEL_CARD.md
+++ b/model_cards/MUSICGEN_MODEL_CARD.md
@@ -87,4 +87,19 @@ More information can be found in the paper [Simple and Controllable Music Genera
 
 **Use cases:** Users must be aware of the biases, limitations and risks of the model. MusicGen is a model developed for artificial intelligence research on controllable music generation. As such, it should not be used for downstream applications without further investigation and mitigation of risks.
 
+## Update: stereo models and large melody model
+
+We further release a set of stereophonic-capable models. Those were fine-tuned for 200k updates starting
+from the mono models. The training data is otherwise identical, and the capabilities and limitations are shared with the base models. The stereo models work by getting two streams of tokens from the EnCodec model, and interleaving those using
+the delay pattern. We also release a mono large model with melody conditioning capabilities. The list of new models
+is as follows:
+
+- facebook/musicgen-stereo-small
+- facebook/musicgen-stereo-medium
+- facebook/musicgen-stereo-large
+- facebook/musicgen-stereo-melody
+- facebook/musicgen-melody-large
+- facebook/musicgen-stereo-melody-large
+
 
 [arxiv]: https://arxiv.org/abs/2306.05284
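For reference, the changes across these three patches compose as follows at the public API level: loading a `musicgen-stereo-*` checkpoint wraps the mono EnCodec model in `InterleaveStereoCompressionModel`, and `two_step_cfg` passed to `set_generation_params` now reaches every sampling step instead of being overridden. Below is a minimal usage sketch, not part of the patches themselves; the output stem name is illustrative, and `facebook/musicgen-stereo-small` can be swapped for any of the checkpoints listed in the model card above.

```python
from audiocraft.models import MusicGen
from audiocraft.data.audio import audio_write

# Load one of the stereo checkpoints released in v1.2.0. The loader wraps the
# mono EnCodec model in InterleaveStereoCompressionModel behind the scenes.
model = MusicGen.get_pretrained('facebook/musicgen-stereo-small')

# two_step_cfg is now forwarded to every sampling step; before v1.1.0 the value
# passed here was silently overridden (see the lm.py fix in the first patch).
model.set_generation_params(duration=8, two_step_cfg=True)

# generate() returns a tensor of shape [B, C, T]; C == 2 for the stereo models.
wavs = model.generate(['lofi slow bpm electro chill with organic samples'])

for idx, one_wav in enumerate(wavs):
    # audio_write expects a [C, T] tensor; loudness normalization avoids clipping.
    audio_write(f'stereo_sample_{idx}', one_wav.cpu(), model.sample_rate, strategy='loudness')
```

As the docs above note, running this locally requires a GPU (16GB of memory recommended); on CPU, generation is prohibitively slow.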