Skip to content

Commit

Permalink
Add VoxConverse recipe (#1142)
Browse files Browse the repository at this point in the history
* Add VoxConverse recipe

* Do not resplit the dataset by default

---------

Co-authored-by: Piotr Żelasko <[email protected]>
  • Loading branch information
flyingleafe and pzelasko authored Sep 14, 2023
1 parent b98521b commit 1389de4
Show file tree
Hide file tree
Showing 5 changed files with 180 additions and 0 deletions.
2 changes: 2 additions & 0 deletions docs/corpus.rst
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,8 @@ a CLI tool that create the manifests given a corpus directory.
- :func:`lhotse.recipes.prepare_vctk`
* - VoxCeleb
- :func:`lhotse.recipes.prepare_voxceleb`
* - VoxConverse
- :func:`lhotse.recipes.prepare_voxconverse`
* - VoxPopuli
- :func:`lhotse.recipes.prepare_voxpopuli`
* - WenetSpeech
Expand Down
1 change: 1 addition & 0 deletions lhotse/bin/modes/recipes/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@
from .uwb_atcc import *
from .vctk import *
from .voxceleb import *
from .voxconverse import *
from .voxpopuli import *
from .wenet_speech import *
from .xbmu_amdo31 import *
Expand Down
27 changes: 27 additions & 0 deletions lhotse/bin/modes/recipes/voxconverse.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
import click

from lhotse.bin.modes import download, prepare
from lhotse.recipes.voxconverse import download_voxconverse, prepare_voxconverse
from lhotse.utils import Pathlike


@download.command(context_settings={"show_default": True})
@click.argument("target_dir", type=click.Path())
@click.option(
    "--force-download",
    is_flag=True,
    default=False,
    help="Force download",
)
def voxconverse(target_dir: Pathlike, force_download=False):
    """VoxConverse dataset download."""
    # NOTE: click shows the docstring above as the command help, so it is
    # kept short; all the work is delegated to the recipe function.
    download_voxconverse(corpus_dir=target_dir, force_download=force_download)


@prepare.command(context_settings={"show_default": True})
@click.argument("corpus_dir", type=click.Path(exists=True, dir_okay=True))
@click.argument("output_dir", type=click.Path())
@click.option(
    "--split-test",
    is_flag=True,
    default=False,
    help="Split test part into dev and test parts",
)
def voxconverse(corpus_dir: Pathlike, output_dir: Pathlike, split_test: bool = False):
    """VoxConverse data preparation."""
    # NOTE: click shows the docstring above as the command help, so it is
    # kept short; manifest creation is delegated to the recipe function.
    prepare_voxconverse(
        corpus_dir=corpus_dir,
        output_dir=output_dir,
        split_test=split_test,
    )
1 change: 1 addition & 0 deletions lhotse/recipes/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@
from .uwb_atcc import download_uwb_atcc, prepare_uwb_atcc
from .vctk import download_vctk, prepare_vctk
from .voxceleb import download_voxceleb1, download_voxceleb2, prepare_voxceleb
from .voxconverse import download_voxconverse, prepare_voxconverse
from .voxpopuli import download_voxpopuli, prepare_voxpopuli
from .wenet_speech import prepare_wenet_speech
from .xbmu_amdo31 import download_xbmu_amdo31, prepare_xbmu_amdo31
Expand Down
149 changes: 149 additions & 0 deletions lhotse/recipes/voxconverse.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
"""
VoxConverse is an audio-visual diarisation dataset consisting of multispeaker clips of human speech, extracted from YouTube videos.
Updates and additional information about the dataset can be found on the VoxConverse website (https://www.robots.ox.ac.uk/~vgg/data/voxconverse/index.html).
"""

import json
import logging
import re
import shutil
import zipfile
from collections import defaultdict
from concurrent.futures import ProcessPoolExecutor
from pathlib import Path
from typing import Dict, Iterable, Optional, Union
from urllib.error import HTTPError

from tqdm.auto import tqdm

from lhotse import fix_manifests, validate_recordings_and_supervisions
from lhotse.audio import Recording, RecordingSet
from lhotse.supervision import AlignmentItem, SupervisionSegment, SupervisionSet
from lhotse.utils import Pathlike, is_module_available, resumable_download

DEV_AUDIO_ZIP = (
"https://www.robots.ox.ac.uk/~vgg/data/voxconverse/data/voxconverse_dev_wav.zip"
)
TEST_AUDIO_ZIP = (
"https://www.robots.ox.ac.uk/~vgg/data/voxconverse/data/voxconverse_test_wav.zip"
)
ANNOTATIONS_ZIP = "https://github.com/joonson/voxconverse/archive/master.zip"


def _download_and_unpack(
    url: str,
    archive_path: Path,
    extract_dir: Path,
    nested_dir: Path,
    target_dir: Path,
    force_download: bool = False,
) -> None:
    """
    Download a zip archive, extract it, and flatten one nested directory.

    :param url: location of the zip archive.
    :param archive_path: where the downloaded archive is stored locally.
    :param extract_dir: directory into which the archive is extracted.
    :param nested_dir: directory created by the extraction whose contents
        should be moved one level up.
    :param target_dir: directory that receives the contents of ``nested_dir``.
    :param force_download: forwarded to ``resumable_download`` so that a
        leftover archive from a previous run is not reused.
    """
    resumable_download(url, archive_path, force_download=force_download)
    with zipfile.ZipFile(archive_path) as zip_f:
        zip_f.extractall(extract_dir)

    # Merge the nested directory into its destination, then drop the now
    # redundant nested copy.
    shutil.copytree(nested_dir, target_dir, dirs_exist_ok=True)
    shutil.rmtree(nested_dir)


def download_voxconverse(
    corpus_dir: Pathlike,
    force_download: bool = False,
) -> None:
    """
    Download and unpack the VoxConverse audio (dev and test) and annotations.

    After a successful run ``corpus_dir`` contains ``dev`` and ``test``
    directories with the extracted audio, plus the contents of the annotation
    repository merged into ``corpus_dir`` itself. A ``.completed`` marker file
    is written at the end so that later calls become no-ops.

    :param corpus_dir: directory in which the corpus is stored; created
        (including parents) if it does not exist.
    :param force_download: when ``True``, download and unpack everything even
        if the ``.completed`` marker is present. The flag is also forwarded to
        ``resumable_download``.
    """
    corpus_dir = Path(corpus_dir)
    corpus_dir.mkdir(parents=True, exist_ok=True)
    completed_detector = corpus_dir / ".completed"

    if not completed_detector.is_file() or force_download:
        dev_zip = corpus_dir / "dev.zip"
        test_zip = corpus_dir / "test.zip"
        annotations_zip = corpus_dir / "annotations.zip"

        print("Downloading VoxConverse dev set")
        # The dev archive unpacks into an "audio" sub-directory.
        _download_and_unpack(
            DEV_AUDIO_ZIP,
            dev_zip,
            extract_dir=corpus_dir / "dev",
            nested_dir=corpus_dir / "dev/audio",
            target_dir=corpus_dir / "dev",
            force_download=force_download,
        )

        print("Downloading VoxConverse test set")
        # The test archive unpacks into a "voxconverse_test_wav" sub-directory.
        _download_and_unpack(
            TEST_AUDIO_ZIP,
            test_zip,
            extract_dir=corpus_dir / "test",
            nested_dir=corpus_dir / "test/voxconverse_test_wav",
            target_dir=corpus_dir / "test",
            force_download=force_download,
        )

        print("Downloading VoxConverse annotations")
        # The annotation archive unpacks into "voxconverse-master"; its
        # contents are merged into the corpus root.
        _download_and_unpack(
            ANNOTATIONS_ZIP,
            annotations_zip,
            extract_dir=corpus_dir,
            nested_dir=corpus_dir / "voxconverse-master",
            target_dir=corpus_dir,
            force_download=force_download,
        )

        # cleanup
        for archive in (dev_zip, test_zip, annotations_zip):
            archive.unlink()
        completed_detector.touch()

    print("Done")


def prepare_voxconverse(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    split_test: bool = False,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Build recording and supervision manifests for VoxConverse.

    Audio is read from ``corpus_dir / "dev"`` and ``corpus_dir / "test"``.
    Each ``*.wav`` file is expected to have an RTTM file with the same name
    (``.rttm`` extension) next to it. Every ``SPEAKER`` entry of that file
    becomes one supervision segment.

    :param corpus_dir: root directory of the downloaded corpus.
    :param output_dir: when given, the manifests of each subset are written
        there as ``voxconverse_recordings_<subset>.jsonl.gz`` and
        ``voxconverse_supervisions_<subset>.jsonl.gz``. The directory is
        created if missing.
    :param split_test: off by default, in which case the original "dev" and
        "test" parts are kept as they are. When ``True``, the original "dev"
        part is exposed as "train" and the (larger) "test" part is cut in
        half, in sorted file order, into "dev" and "test".
    :return: a dict mapping subset name to a dict with the keys
        ``"recordings"`` and ``"supervisions"``.
    """
    corpus_dir = Path(corpus_dir).absolute()

    # Prepare the output location once instead of once per subset.
    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    dev_wavs = sorted((corpus_dir / "dev").glob("*.wav"))
    test_wavs = sorted((corpus_dir / "test").glob("*.wav"))
    if split_test:
        half = len(test_wavs) // 2
        splits = {
            "train": dev_wavs,
            "dev": test_wavs[:half],
            "test": test_wavs[half:],
        }
    else:
        splits = {"dev": dev_wavs, "test": test_wavs}

    manifests = {}
    for subset, wavs in splits.items():
        recordings = []
        supervisions = []
        for wav_file in wavs:
            recordings.append(Recording.from_file(wav_file))
            # Swap only the final extension so that a stem containing dots
            # ("a.b.wav" -> "a.b.rttm") still matches the recording id.
            rttm_file = wav_file.with_suffix(".rttm")
            for ix, (start, duration, speaker) in enumerate(_read_rttm(rttm_file)):
                supervisions.append(
                    SupervisionSegment(
                        id=f"{wav_file.stem}-{ix}",
                        recording_id=wav_file.stem,
                        start=start,
                        duration=duration,
                        channel=0,
                        language="en",
                        speaker=speaker,
                    )
                )

        recording_set = RecordingSet.from_recordings(recordings)
        supervision_set = SupervisionSet.from_segments(supervisions)
        recording_set, supervision_set = fix_manifests(recording_set, supervision_set)
        validate_recordings_and_supervisions(recording_set, supervision_set)

        if output_dir is not None:
            recording_set.to_file(
                output_dir / f"voxconverse_recordings_{subset}.jsonl.gz"
            )
            supervision_set.to_file(
                output_dir / f"voxconverse_supervisions_{subset}.jsonl.gz"
            )

        manifests[subset] = {
            "recordings": recording_set,
            "supervisions": supervision_set,
        }

    return manifests


def _read_rttm(filename):
    """
    Lazily yield ``(start, duration, speaker)`` for each SPEAKER line of an RTTM file.

    Lines that do not start with ``SPEAKER`` (after stripping surrounding
    whitespace) are skipped. Only the onset (4th), duration (5th) and speaker
    name (8th) columns are read, so the number of trailing columns does not
    matter.

    :param filename: path to the RTTM file.
    :return: a generator of ``(float, float, str)`` tuples.
    :raises ValueError: if a SPEAKER line has fewer than 8 columns, or its
        onset/duration cannot be parsed as a float.
    """
    with open(filename, "r", encoding="utf-8") as f:
        # Iterate the file object directly instead of loading every line.
        for line_no, raw_line in enumerate(f, start=1):
            line = raw_line.strip()
            if not line.startswith("SPEAKER"):
                continue
            fields = line.split()
            if len(fields) < 8:
                raise ValueError(
                    f"{filename}:{line_no}: malformed RTTM SPEAKER line: {line!r}"
                )
            yield float(fields[3]), float(fields[4]), fields[7]

0 comments on commit 1389de4

Please sign in to comment.