Merge pull request #246 from m-bain/v3

V3
m-bain · May 13, 2023 · d8a2b4f · d8a2b4f
2 parents 46b4162 + 9ffb7e7
commit d8a2b4f
Show file tree

Hide file tree

Showing 7 changed files with 18 additions and 119 deletions.
diff --git a/Dockerfile b/Dockerfile
diff --git a/README.md b/README.md
@@ -32,12 +32,12 @@
 <!-- <h2 align="left", id="what-is-it">What is it 🔎</h2> -->
 
 
-This repository provides fast automatic speaker recognition (70x realtime with large-v2) with word-level timestamps and speaker diarization.
+This repository provides fast automatic speech recognition (70x realtime with large-v2) with word-level timestamps and speaker diarization.
 
 - ⚡️ Batched inference for 70x realtime transcription using whisper large-v2
 - 🪶 [faster-whisper](https://github.com/guillaumekln/faster-whisper) backend, requires <8GB gpu memory for large-v2 with beam_size=5
 - 🎯 Accurate word-level timestamps using wav2vec2 alignment
-- 👯‍♂️ Multispeaker ASR using speaker diarization from [pyannote-audio](https://github.com/pyannote/pyannote-audio) (labels each segment/word with speaker ID) 
+- 👯‍♂️ Multispeaker ASR using speaker diarization from [pyannote-audio](https://github.com/pyannote/pyannote-audio) (speaker ID labels) 
 - 🗣️ VAD preprocessing, reduces hallucination & batching with no WER degradation
 
 
@@ -74,9 +74,9 @@ GPU execution requires the NVIDIA libraries cuBLAS 11.x and cuDNN 8.x to be inst
 
 ### 2. Install PyTorch 2.0, e.g. for Linux and Windows with CUDA 11.7:
 
-`pip3 install torch torchvision torchaudio`
+`conda install pytorch==2.0.0 torchvision==0.15.0 torchaudio==2.0.0 pytorch-cuda=11.7 -c pytorch -c nvidia`
 
-See other methods [here.](https://pytorch.org/get-started/locally/)
+See other methods [here.](https://pytorch.org/get-started/previous-versions/#v200)
 
 ### 3. Install this repo
 

diff --git a/notebooks/whisperx.ipynb b/notebooks/whisperx.ipynb
diff --git a/setup.py b/setup.py
@@ -6,8 +6,8 @@
 setup(
     name="whisperx",
     py_modules=["whisperx"],
-    version="3.1.0",
-    description="Time-Accurate Automatic Speech Recognition.",
+    version="3.1.1",
+    description="Time-Accurate Automatic Speech Recognition using Whisper.",
     readme="README.md",
     python_requires=">=3.8",
     author="Max Bain",

diff --git a/whisperx/alignment.py b/whisperx/alignment.py
@@ -261,6 +261,10 @@ def align(
                 word_text = "".join(word_chars["char"].tolist()).strip()
                 if len(word_text) == 0:
                     continue
+
+                # don't use space characters for alignment: drop them before computing the word's start, end and score
+                word_chars = word_chars[word_chars["char"] != " "]
+
                 word_start = word_chars["start"].min()
                 word_end = word_chars["end"].max()
                 word_score = round(word_chars["score"].mean(), 3)

diff --git a/whisperx/asr.py b/whisperx/asr.py
@@ -14,7 +14,7 @@
 from .types import TranscriptionResult, SingleSegment
 
 def load_model(whisper_arch, device, compute_type="float16", asr_options=None, language=None,
-               vad_options=None, model=None):
+               vad_options=None, model=None, task="transcribe"):
     '''Load a Whisper model for inference.
     Args:
         whisper_arch: str - The name of the Whisper model to load.
@@ -31,7 +31,7 @@ def load_model(whisper_arch, device, compute_type="float16", asr_options=None, l
 
     model = WhisperModel(whisper_arch, device=device, compute_type=compute_type)
     if language is not None:
-        tokenizer = faster_whisper.tokenizer.Tokenizer(model.hf_tokenizer, model.model.is_multilingual, task="transcribe", language=language)
+        tokenizer = faster_whisper.tokenizer.Tokenizer(model.hf_tokenizer, model.model.is_multilingual, task=task, language=language)
     else:
         print("No language specified, language will be first be detected for each audio file (increases inference time).")
         tokenizer = None