Removed unnecessary duplicate stem selection param from demucs, tested single stem mode, updated docs, bumped version
nomadkaraoke · Feb 22, 2024 · a89f9ca · a89f9ca
1 parent 437d1a0
commit a89f9ca
Show file tree

Hide file tree

Showing 5 changed files with 25 additions and 24 deletions.
diff --git a/README.md b/README.md
@@ -117,10 +117,11 @@ If the GPU isn't being detected, make sure your docker runtime environment is pa
 You can use Audio Separator via the command line:
 
 ```sh
-usage: audio-separator [-h] [-v] [-d] [-e] [-l] [--log_level LOG_LEVEL] [-m MODEL_FILENAME] [--output_format OUTPUT_FORMAT] [--output_dir OUTPUT_DIR] [--model_file_dir MODEL_FILE_DIR] [--denoise] [--invert_spect]
+usage: audio-separator [-h] [-v] [-d] [-e] [-l] [--log_level LOG_LEVEL] [-m MODEL_FILENAME] [--output_format OUTPUT_FORMAT] [--output_dir OUTPUT_DIR] [--model_file_dir MODEL_FILE_DIR] [--invert_spect]
                        [--normalization NORMALIZATION] [--single_stem SINGLE_STEM] [--sample_rate SAMPLE_RATE] [--mdx_segment_size MDX_SEGMENT_SIZE] [--mdx_overlap MDX_OVERLAP] [--mdx_batch_size MDX_BATCH_SIZE]
-                       [--mdx_hop_length MDX_HOP_LENGTH] [--vr_batch_size VR_BATCH_SIZE] [--vr_window_size VR_WINDOW_SIZE] [--vr_aggression VR_AGGRESSION] [--vr_enable_tta] [--vr_high_end_process]
-                       [--vr_enable_post_process] [--vr_post_process_threshold VR_POST_PROCESS_THRESHOLD]
+                       [--mdx_hop_length MDX_HOP_LENGTH] [--mdx_enable_denoise] [--vr_batch_size VR_BATCH_SIZE] [--vr_window_size VR_WINDOW_SIZE] [--vr_aggression VR_AGGRESSION] [--vr_enable_tta]
+                       [--vr_high_end_process] [--vr_enable_post_process] [--vr_post_process_threshold VR_POST_PROCESS_THRESHOLD] [--demucs_stem DEMUCS_STEM] [--demucs_segment_size DEMUCS_SEGMENT_SIZE]
+                       [--demucs_shifts DEMUCS_SHIFTS] [--demucs_overlap DEMUCS_OVERLAP] [--demucs_segments_enabled DEMUCS_SEGMENTS_ENABLED]
                        [audio_file]
 
 Separate audio file into different stems.
@@ -145,7 +146,6 @@ Separation I/O Params:
   --model_file_dir MODEL_FILE_DIR                        model files directory (default: /tmp/audio-separator-models/). Example: --model_file_dir=/app/models
 
 Common Separation Parameters:
-  --denoise                                              enable denoising during separation (default: False). Example: --denoise
   --invert_spect                                         invert secondary stem using spectogram (default: False). Example: --invert_spect
   --normalization NORMALIZATION                          max peak amplitude to normalize input and output audio to (default: 0.9). Example: --normalization=0.7
   --single_stem SINGLE_STEM                              output only single stem, e.g. Instrumental, Vocals, Drums, Bass, Guitar, Piano, Other. Example: --single_stem=Instrumental
@@ -156,6 +156,7 @@ MDX Architecture Parameters:
   --mdx_overlap MDX_OVERLAP                              amount of overlap between prediction windows, 0.001-0.999. higher is better but slower (default: 0.25). Example: --mdx_overlap=0.25
   --mdx_batch_size MDX_BATCH_SIZE                        larger consumes more RAM but may process slightly faster (default: 1). Example: --mdx_batch_size=4
   --mdx_hop_length MDX_HOP_LENGTH                        usually called stride in neural networks, only change if you know what you're doing (default: 1024). Example: --mdx_hop_length=1024
+  --mdx_enable_denoise                                   enable denoising during separation (default: False). Example: --mdx_enable_denoise
 
 VR Architecture Parameters:
   --vr_batch_size VR_BATCH_SIZE                          number of batches to process at a time. higher = more RAM, slightly faster processing (default: 4). Example: --vr_batch_size=16
@@ -165,6 +166,13 @@ VR Architecture Parameters:
   --vr_high_end_process                                  mirror the missing frequency range of the output (default: False). Example: --vr_high_end_process
   --vr_enable_post_process                               identify leftover artifacts within vocal output; may improve separation for some songs (default: False). Example: --vr_enable_post_process
   --vr_post_process_threshold VR_POST_PROCESS_THRESHOLD  threshold for post_process feature: 0.1-0.3 (default: 0.2). Example: --vr_post_process_threshold=0.1
+
+Demucs Architecture Parameters:
+  --demucs_stem DEMUCS_STEM                              stem to extract from audio file, e.g. Vocals, Drums, Bass, Other (default: All Stems). Example: --demucs_stem=vocals
+  --demucs_segment_size DEMUCS_SEGMENT_SIZE              size of segments into which the audio is split, 1-100. higher = slower but better quality (default: Default). Example: --demucs_segment_size=256
+  --demucs_shifts DEMUCS_SHIFTS                          number of predictions with random shifts, higher = slower but better quality (default: 2). Example: --demucs_shifts=4
+  --demucs_overlap DEMUCS_OVERLAP                        overlap between prediction windows, 0.001-0.999. higher = slower but better quality (default: 0.25). Example: --demucs_overlap=0.25
+  --demucs_segments_enabled DEMUCS_SEGMENTS_ENABLED      enable segment-wise processing (default: True). Example: --demucs_segments_enabled=False
 ```
 
 Example:
@@ -234,12 +242,12 @@ output_file_paths_6 = separator.separate('audio3.wav')
 - secondary_stem_output_path: (Optional) The path for saving the secondary stem. Default: None
 - output_format: (Optional) Format to encode output files, any common format (WAV, MP3, FLAC, M4A, etc.). Default: WAV
 - normalization_threshold: (Optional) The threshold for audio normalization. Default: 0.9
-- enable_denoise: (Optional) Flag to enable or disable denoising as part of the separation process. Default: False
 - output_single_stem: (Optional) Output only a single stem, e.g. 'Instrumental', 'Vocals', 'Drums', 'Bass', 'Guitar', 'Piano', 'Other'. Default: None
 - invert_using_spec: (Optional) Flag to invert using spectrogram. Default: False
 - sample_rate: (Optional) Modify the sample rate of the output audio. Default: 44100
 - mdx_params: (Optional) MDX Architecture Specific Attributes & Defaults. Default: {"hop_length": 1024, "segment_size": 256, "overlap": 0.25, "batch_size": 1}
 - vr_params: (Optional) VR Architecture Specific Attributes & Defaults. Default: {"batch_size": 16, "window_size": 512, "aggression": 5, "enable_tta": False, "enable_post_process": False, "post_process_threshold": 0.2, "high_end_process": False}
+- demucs_params: (Optional) Demucs Architecture Specific Attributes & Defaults. Default: {"segment_size": "Default", "shifts": 2, "overlap": 0.25, "segments_enabled": True}
 
 ## Requirements 📋
 

diff --git a/audio_separator/separator/architectures/demucs_separator.py b/audio_separator/separator/architectures/demucs_separator.py
@@ -36,14 +36,6 @@ def __init__(self, common_config, arch_config):
 
         # Initializing user-configurable parameters, passed through from the CLI or Separator instance
 
-        # 'Select a stem for extraction with the chosen model:\n\n'
-        # '• All Stems - Extracts all available stems.\n'
-        # '• Vocals - Only the "vocals" stem.\n'
-        # '• Other - Only the "other" stem.\n'
-        # '• Bass - Only the "bass" stem.\n'
-        # '• Drums - Only the "drums" stem.'
-        self.selected_stem = arch_config.get("selected_stem", [CommonSeparator.ALL_STEMS])
-
         # Adjust segments to manage RAM or V-RAM usage:
         # - Smaller sizes consume less resources.
         # - Bigger sizes consume more resources, but may provide better results.
@@ -73,7 +65,7 @@ def __init__(self, common_config, arch_config):
         self.segments_enabled = arch_config.get("segments_enabled", True)
 
         self.logger.debug(f"Demucs arch params: segment_size={self.segment_size}, segments_enabled={self.segments_enabled}")
-        self.logger.debug(f"Demucs arch params: shifts={self.shifts}, overlap={self.overlap}, selected_stem={self.selected_stem}")
+        self.logger.debug(f"Demucs arch params: shifts={self.shifts}, overlap={self.overlap}")
 
         self.demucs_source_map = DEMUCS_4_SOURCE_MAPPER
 
@@ -147,6 +139,11 @@ def separate(self, audio_file_path):
 
         self.logger.debug("Processing for all stems...")
         for stem_name, stem_value in self.demucs_source_map.items():
+            if self.output_single_stem is not None:
+                if stem_name.lower() != self.output_single_stem.lower():
+                    self.logger.debug(f"Skipping writing stem {stem_name} as output_single_stem is set to {self.output_single_stem}...")
+                    continue
+
             stem_path = os.path.join(f"{self.audio_file_base}_({stem_name})_{self.model_name}.{self.output_format.lower()}")
             stem_source = source[stem_value].T
 

diff --git a/audio_separator/separator/separator.py b/audio_separator/separator/separator.py
@@ -77,7 +77,7 @@ def __init__(
         sample_rate=44100,
         mdx_params={"hop_length": 1024, "segment_size": 256, "overlap": 0.25, "batch_size": 1, "enable_denoise": False},
         vr_params={"batch_size": 16, "window_size": 512, "aggression": 5, "enable_tta": False, "enable_post_process": False, "post_process_threshold": 0.2, "high_end_process": False},
-        demucs_params={"selected_stem": CommonSeparator.ALL_STEMS, "segment_size": "Default", "shifts": 2, "overlap": 0.25, "segments_enabled": True},
+        demucs_params={"segment_size": "Default", "shifts": 2, "overlap": 0.25, "segments_enabled": True},
     ):
         self.logger = logging.getLogger(__name__)
         self.logger.setLevel(log_level)
@@ -123,8 +123,6 @@ def __init__(
 
         self.output_single_stem = output_single_stem
         if output_single_stem is not None:
-            if output_single_stem.lower() not in {"instrumental", "vocals"}:
-                raise ValueError("output_single_stem must be either 'instrumental' or 'vocals'")
             self.logger.debug(f"Single stem output requested, only one output file ({output_single_stem}) will be written")
 
         self.invert_using_spec = invert_using_spec
@@ -357,8 +355,8 @@ def list_supported_model_files(self):
             "VR": model_downloads_list["vr_download_list"],
             "MDX": model_downloads_list["mdx_download_list"],
             "Demucs": filtered_demucs_v4,
-            "MDX23": model_downloads_list["mdx23_download_list"],
-            "MDX23C": model_downloads_list["mdx23c_download_list"],
+            # "MDX23": model_downloads_list["mdx23_download_list"],
+            # "MDX23C": model_downloads_list["mdx23c_download_list"],
         }
         return model_files_grouped_by_type
 

diff --git a/audio_separator/utils/cli.py b/audio_separator/utils/cli.py
@@ -36,7 +36,7 @@ def main():
     common_params = parser.add_argument_group("Common Separation Parameters")
     common_params.add_argument("--invert_spect", action="store_true", help="invert secondary stem using spectogram (default: %(default)s). Example: --invert_spect")
     common_params.add_argument("--normalization", type=float, default=0.9, help="max peak amplitude to normalize input and output audio to (default: %(default)s). Example: --normalization=0.7")
-    common_params.add_argument("--single_stem", default=None, help="output only single stem, either instrumental or vocals. Example: --single_stem=instrumental")
+    common_params.add_argument("--single_stem", default=None, help="output only single stem, e.g. Instrumental, Vocals, Drums, Bass, Guitar, Piano, Other. Example: --single_stem=Instrumental")
     common_params.add_argument("--sample_rate", type=int, default=44100, help="modify the sample rate of the output audio (default: %(default)s). Example: --sample_rate=44100")
 
     mdx_params = parser.add_argument_group("MDX Architecture Parameters")
@@ -70,7 +70,6 @@ def main():
     vr_params.add_argument("--vr_post_process_threshold", type=float, default=0.2, help="threshold for post_process feature: 0.1-0.3 (default: %(default)s). Example: --vr_post_process_threshold=0.1")
 
     demucs_params = parser.add_argument_group("Demucs Architecture Parameters")
-    demucs_params.add_argument("--demucs_stem", default="All Stems", help="stem to extract from audio file, e.g. Vocals, Drums, Bass, Other (default: %(default)s). Example: --demucs_stem=vocals")
     demucs_params.add_argument(
         "--demucs_segment_size",
         type=str,
@@ -143,7 +142,6 @@ def main():
             "high_end_process": args.vr_high_end_process,
         },
         demucs_params={
-            "selected_stem": args.demucs_stem,
             "segment_size": args.demucs_segment_size,
             "shifts": args.demucs_shifts,
             "overlap": args.demucs_overlap,

diff --git a/pyproject.toml b/pyproject.toml
@@ -4,8 +4,8 @@ build-backend = "poetry.core.masonry.api"
 
 [tool.poetry]
 name = "audio-separator"
-version = "0.14.5"
-description = "Easy to use vocal separation, using MDX-Net models from UVR trained by @Anjok07"
+version = "0.15.0"
+description = "Easy to use audio stem separation, using various models from UVR trained primarily by @Anjok07"
 authors = ["Andrew Beveridge <[email protected]>"]
 license = "MIT"
 readme = "README.md"