Speech to text implementation

dataforgoodfr · Jun 10, 2022 · a98bb63 · a98bb63
1 parent bb476a7
commit a98bb63
Show file tree

Hide file tree

Showing 5 changed files with 155 additions and 31 deletions.
diff --git a/notebooks/audio/.env.example b/notebooks/audio/.env.example
@@ -1 +1,4 @@
-path_to_video=<path_to_video_file>
+path_to_extract=<path_to_extract_file>
+path_to_audio=<path_to_audio_file>
+path_to_full_movie=<path_to_full_movie_file>
+path_to_trailer=<path_to_trailer_file>
diff --git a/notebooks/audio/README.md b/notebooks/audio/README.md
@@ -17,4 +17,4 @@ Ce qui implique de changer également la version de tensorflow. Pour éviter de
     * Windows : (`set-executionpolicy unrestricted` si besoin) `.\venv\Scripts\activate`
   * Installez les bibliothèques indiquées dans le requirements.txt
   * Exécutez normalement le code python
-  * Quittez venv avec `deactivate`
+  * Quittez venv avec `deactivate`
diff --git a/notebooks/audio/gender_identification.py b/notebooks/audio/gender_identification.py
@@ -1,17 +1,17 @@
 import os
 from dotenv import load_dotenv
 from inaSpeechSegmenter import Segmenter
-# import streamlit as st
 import pandas as pd
-# from inaSpeechSegmenter.export_funcs import seg2csv, seg2textgrid
+import speech_recognition as sr
 
 
 class Movie:
-    def __init__(self, path_to_file):
+    def __init__(self, path_to_file, path_to_audio):
         self.title = path_to_file.split(sep='\\')[-1].split(sep='.')[0]
         self.media = path_to_file
+        self.audio = path_to_audio
         self.gendered_audio_seg = self.segment()  # Dataframe
-        self.dialogues = None
+        self.dialogues = self.run_speech_to_text()
         self.speaking_time = self.compute_speaking_time_allocation()
 
     def __str__(self):
@@ -40,7 +40,7 @@ def search_gender_tag(self, time):  # Give a time in seconds
         return gender
 
     def compute_speaking_time_allocation(self):
-        speaking_time = {'male':0, 'female':0}
+        speaking_time = {'male': 0, 'female': 0}
         dif = pd.Series(self.gendered_audio_seg['end']-self.gendered_audio_seg['start'], name='time_frame')
         totaldf = pd.concat([self.gendered_audio_seg['gender'], dif], axis=1)
         for i in totaldf.index:
@@ -50,12 +50,45 @@ def compute_speaking_time_allocation(self):
                 speaking_time['female'] += float(totaldf['time_frame'][i])
         return speaking_time
 
+    def decode_speech(self, start_time=None, end_time=None, language="en-US"):
+        r = sr.Recognizer()
+        # r.pause_threshold = 3
+        # r.dynamic_energy_adjustment_damping = 0.5
+        # language can be "fr-FR"
+
+        with sr.WavFile(self.audio) as source:
+            if start_time is None and end_time is None:
+                audio_text = r.record(source)
+            else:
+                audio_text = r.record(source, duration=end_time - start_time, offset=start_time)
+
+            # recognize_() method will throw a request error if the API is unreachable, hence using exception handling
+            try:
+                # using google speech recognition
+                text = r.recognize_google(audio_text, language=language)
+                print('Converting audio transcripts into text ...')
+                return text
+
+            except:
+                print('Sorry.. run again...')
+
+    def run_speech_to_text(self):
+        transcript = []
+        for i in self.gendered_audio_seg.index:
+            transcript.append(self.decode_speech(start_time=self.gendered_audio_seg['start'][i],
+                                                 end_time=self.gendered_audio_seg['end'][i],
+                                                 language='fr-FR'))
+        transcription = pd.concat([self.gendered_audio_seg['gender'], pd.Series(transcript, name="transcription")],
+                                  axis=1)
+        return transcription
 
 
 if __name__ == '__main__':
     load_dotenv()
-    path_to_video = os.getenv("path_to_voice", "./")
-    movie = Movie(path_to_video)
+    path_to_video = os.getenv("path_to_extract", "./")
+    audio = os.getenv("path_to_audio", "./")
+    movie = Movie(path_to_video, audio)
+    print(movie.dialogues)
     # """Pour convertir en tests :"""
     # gender_of_time_45 = movie.search_gender_tag(45)  # None
     # gender_of_time_60 = movie.search_gender_tag(60)  # Male

diff --git a/notebooks/audio/media_tools.py b/notebooks/audio/media_tools.py
@@ -2,7 +2,6 @@
 from moviepy.video.io.ffmpeg_tools import ffmpeg_extract_subclip
 import moviepy.editor as mp
 from dotenv import load_dotenv
-import speech_recognition as sr
 
 
 def cut_and_save(movie_path, start, end, target_name):
@@ -17,26 +16,9 @@ def separate_voice_and_music(file):
     os.system('spleeter separate -o ../../../ -f "{instrument}/{filename}.{codec}" ' + file)
 
 
-def decode_speech(wave_file, start_time=None, end_time=None, language="en-US"):
-    r = sr.Recognizer()
-    # r.pause_threshold = 3
-    # r.dynamic_energy_adjustment_damping = 0.5
-
-    with sr.WavFile(wave_file) as source:
-        if start_time is None and end_time is None:
-            audio_text = r.record(source)
-        else:
-            audio_text = r.record(source, duration=end_time - start_time, offset=start_time)
-
-        # recognize_() method will throw a request error if the API is unreachable, hence using exception handling
-        try:
-            # using google speech recognition
-            text = r.recognize_google(audio_text, language=language)
-            print('Converting audio transcripts into text ...')
-            return text
-
-        except:
-            print('Sorry.. run again...')
+def extract_audio_from_movie(file, extension='.wav'):
+    clip = import_as_clip(file)
+    clip.audio.write_audiofile(file.split(sep='.')[0] + extension)
 
 
 if __name__ == '__main__':
@@ -46,7 +28,8 @@ def decode_speech(wave_file, start_time=None, end_time=None, language="en-US"):
     path_to_extract = os.getenv("path_to_extract", "./")
     path_to_trailer = os.getenv("path_to_trailer", "./")
 
-    separate_voice_and_music(path_to_extract)
+    extract_audio_from_movie(path_to_extract)
+    # separate_voice_and_music(path_to_extract)
 
     # cut_and_save(path_to_full_movie, 2115, 2491, path_to_extract)
 

diff --git a/notebooks/audio/requirements.txt b/notebooks/audio/requirements.txt
@@ -0,0 +1,105 @@
+absl-py==0.15.0
+anyio==3.6.1
+appdirs==1.4.4
+astunparse==1.6.3
+audioread==2.1.9
+cachetools==5.2.0
+certifi==2022.5.18.1
+cffi==1.15.0
+charset-normalizer==2.0.12
+click==7.1.2
+colorama==0.4.4
+cycler==0.11.0
+decorator==4.4.2
+docopt==0.6.2
+ffmpeg-python==0.2.0
+flatbuffers==1.12
+fonttools==4.33.3
+future==0.18.2
+gast==0.4.0
+google-auth==2.7.0
+google-auth-oauthlib==0.4.6
+google-pasta==0.2.0
+grpcio==1.34.1
+h11==0.12.0
+h2==4.1.0
+h5py==3.1.0
+hpack==4.0.0
+httpcore==0.13.7
+httpx==0.19.0
+hyperframe==6.0.1
+idna==3.3
+imageio==2.19.3
+imageio-ffmpeg==0.4.7
+importlib-metadata==4.11.4
+inaSpeechSegmenter==0.7.3
+joblib==1.1.0
+keras==2.9.0
+keras-nightly==2.5.0.dev2021032900
+Keras-Preprocessing==1.1.2
+kiwisolver==1.4.2
+libclang==14.0.1
+librosa==0.8.0
+llvmlite==0.36.0
+Markdown==3.3.7
+matplotlib==3.5.2
+moviepy==1.0.3
+munkres==1.1.4
+networkx==2.8.3
+norbert==0.2.1
+numba==0.53.1
+numpy==1.19.5
+oauthlib==3.2.0
+opt-einsum==3.3.0
+packaging==21.3
+pandas==1.4.2
+Pillow==9.1.1
+pooch==1.6.0
+proglog==0.1.10
+protobuf==3.19.4
+pyannote.algorithms==0.8
+pyannote.core==4.4
+pyannote.parser==0.8
+pyasn1==0.4.8
+pyasn1-modules==0.2.8
+pycparser==2.21
+pyparsing==3.0.9
+Pyro4==4.82
+pytextgrid==0.1.4
+python-dateutil==2.8.2
+python-dotenv==0.20.0
+pytz==2022.1
+PyWavelets==1.3.0
+requests==2.28.0
+requests-oauthlib==1.3.1
+resampy==0.2.2
+rfc3986==1.5.0
+rsa==4.8
+scikit-image==0.19.2
+scikit-learn==1.1.1
+scipy==1.8.1
+serpent==1.40
+simplejson==3.17.6
+six==1.15.0
+sniffio==1.2.0
+sortedcollections==2.1.0
+sortedcontainers==2.4.0
+SoundFile==0.10.3.post1
+SpeechRecognition==3.8.1
+spleeter==2.3.0
+tensorboard==2.9.1
+tensorboard-data-server==0.6.1
+tensorboard-plugin-wit==1.8.1
+tensorflow==2.5.0
+tensorflow-estimator==2.5.0
+termcolor==1.1.0
+threadpoolctl==3.1.0
+tifffile==2022.5.4
+tqdm==4.64.0
+typer==0.3.2
+typing-extensions==3.7.4.3
+urllib3==1.26.9
+Werkzeug==2.1.2
+wrapt==1.12.1
+xarray==2022.3.0
+zipp==3.8.0