Commit: Fix tests

BenAAndrew committed Nov 28, 2021
1 parent 9fcd00e commit 0bf3570
Showing 3 changed files with 14 additions and 12 deletions.
15 changes: 7 additions & 8 deletions tests/test_dataset.py
@@ -225,31 +225,26 @@ def test_clip_combiner():
def test_extend_existing_dataset():
    dataset_directory = "test-extend-dataset"
    audio_folder = os.path.join(dataset_directory, "wavs")
    unlabelled_path = os.path.join(dataset_directory, "unlabelled")
    unlabelled_folder = os.path.join(dataset_directory, "unlabelled")
    metadata_file = os.path.join(dataset_directory, "metadata.csv")
    os.makedirs(dataset_directory)
    os.makedirs(audio_folder)
    os.makedirs(unlabelled_folder)
    with open(metadata_file, "w") as f:
        pass

    audio_path = os.path.join("test_samples", "audio.wav")
    converted_audio_path = os.path.join("test_samples", "audio-converted.wav")
    text_path = os.path.join("test_samples", "text.txt")
    forced_alignment_path = os.path.join(dataset_directory, "align.json")
    label_path = os.path.join(dataset_directory, "metadata.csv")
    info_path = os.path.join(dataset_directory, "info.json")
    suffix = "extend"
    min_confidence = 1.0
    extend_existing_dataset(
        text_path=text_path,
        audio_path=audio_path,
        transcription_model=FakeTranscriptionModel(),
        forced_alignment_path=forced_alignment_path,
        output_path=audio_folder,
        unlabelled_path=unlabelled_path,
        label_path=label_path,
        output_folder=dataset_directory,
        suffix=suffix,
        info_path=info_path,
        min_confidence=min_confidence,
        combine_clips=False,
    )
@@ -258,6 +253,10 @@ def test_extend_existing_dataset():
        name.split(".")[0] + "-" + suffix + ".wav" for name in EXPECTED_CLIPS
    ], "Unexpected audio clips"

    assert os.listdir(unlabelled_folder) == [
        name.split(".")[0] + "-" + suffix + ".wav" for name in UNMATCHED_CLIPS
    ], "Unexpected unlabelled audio clips"

    with open(label_path) as f:
        lines = f.readlines()
    expected_text = [
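Both assertions build the expected filenames by inserting the suffix before the ".wav" extension. A minimal, self-contained sketch of that naming rule (the clip names below are illustrative stand-ins, not the repository's EXPECTED_CLIPS or UNMATCHED_CLIPS fixtures):

    # Sketch of the naming rule asserted above: each clip keeps its base name
    # and gains "-<suffix>" before the extension. Clip names are hypothetical.
    EXPECTED_CLIPS = ["0_2500.wav", "2500_5000.wav"]  # illustrative values only
    suffix = "extend"

    expected = [name.split(".")[0] + "-" + suffix + ".wav" for name in EXPECTED_CLIPS]
    assert expected == ["0_2500-extend.wav", "2500_5000-extend.wav"]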
9 changes: 6 additions & 3 deletions training/clean_text.py
@@ -36,7 +36,7 @@
}


def clean_text(text, symbols=DEFAULT_ALPHABET):
def clean_text(text, symbols=DEFAULT_ALPHABET, remove_invalid_characters=True):
"""
Cleans text. This includes:
- Replacing monetary terms (i.e. $ -> dollars)
@@ -49,8 +49,10 @@ def clean_text(text, symbols=DEFAULT_ALPHABET):
    ----------
    text : str
        Text to clean
    symbols : list
    symbols : list (optional)
        List of valid symbols in text (default is English alphabet & punctuation)
    remove_invalid_characters : bool (optional)
        Whether to remove characters not in symbols list (default is True)

    Returns
    -------
@@ -83,7 +85,8 @@ def clean_text(text, symbols=DEFAULT_ALPHABET):
    # Collapse whitespace
    text = re.sub(WHITESPACE_RE, " ", text)
    # Remove banned characters
    text = "".join([c for c in text if c in symbols])
    if remove_invalid_characters:
        text = "".join([c for c in text if c in symbols])
    return text


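The guarded filtering step above is the only behaviour the new flag changes. A stand-alone sketch of its effect, using a toy symbols list rather than the real DEFAULT_ALPHABET and omitting the earlier cleaning steps:

    # Stand-in for the guarded filtering step; "symbols" is a toy alphabet,
    # not the real DEFAULT_ALPHABET, and the currency/whitespace cleaning
    # that clean_text also performs is left out.
    def filter_symbols(text, symbols, remove_invalid_characters=True):
        if remove_invalid_characters:
            text = "".join([c for c in text if c in symbols])
        return text

    symbols = list("abcdefghijklmnopqrstuvwxyz ,!")
    print(filter_symbols("héllo, wörld!", symbols))         # "hllo, wrld!"
    print(filter_symbols("héllo, wörld!", symbols, False))  # "héllo, wörld!" (invalid characters kept)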
2 changes: 1 addition & 1 deletion training/utils.py
@@ -100,7 +100,7 @@ def validate_dataset(filepaths_and_text, dataset_directory, symbols):
    invalid_characters = set()
    wavs = os.listdir(dataset_directory)
    for filename, text in filepaths_and_text:
        text = clean_text(text, symbols)
        text = clean_text(text, remove_invalid_characters=False)
        if filename not in wavs:
            missing_files.add(filename)
        invalid_characters_for_row = get_invalid_characters(text, symbols)
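With remove_invalid_characters=False, the cleaned text still contains any offending characters, so the get_invalid_characters call a few lines below can report them. The helper's body is not part of this diff; a hypothetical minimal version, shown only to illustrate the intent:

    # Hypothetical sketch - the real get_invalid_characters in training/utils.py
    # is not shown in this diff and may differ.
    def get_invalid_characters(text, symbols):
        return {c for c in text if c not in symbols}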
