Skip to content

Commit

Permalink
Speech to text implementation
Browse files Browse the repository at this point in the history
  • Loading branch information
Lokhia committed Jun 10, 2022
1 parent bb476a7 commit a98bb63
Show file tree
Hide file tree
Showing 5 changed files with 155 additions and 31 deletions.
5 changes: 4 additions & 1 deletion notebooks/audio/.env.example
Original file line number Diff line number Diff line change
@@ -1 +1,4 @@
path_to_video=<path_to_video_file>
path_to_extract=<path_to_extract_file>
path_to_audio=<path_to_audio_file>
path_to_full_movie=<path_to_full_movie_file>
path_to_trailer=<path_to_trailer_file>
2 changes: 1 addition & 1 deletion notebooks/audio/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,4 +17,4 @@ Ce qui implique de changer également la version de tensorflow. Pour éviter de
* Windows : (`set-executionpolicy unrestricted` si besoin) `.\venv\Scripts\activate`
* Installez les bibliothèques indiquées dans le requirements.txt
* Exécutez normalement le code python
* Quittez venv avec `deactivate`
* Quittez venv avec `deactivate`
47 changes: 40 additions & 7 deletions notebooks/audio/gender_identification.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,17 @@
import os
from dotenv import load_dotenv
from inaSpeechSegmenter import Segmenter
# import streamlit as st
import pandas as pd
# from inaSpeechSegmenter.export_funcs import seg2csv, seg2textgrid
import speech_recognition as sr


class Movie:
def __init__(self, path_to_file):
def __init__(self, path_to_file, path_to_audio):
self.title = path_to_file.split(sep='\\')[-1].split(sep='.')[0]
self.media = path_to_file
self.audio = path_to_audio
self.gendered_audio_seg = self.segment() # Dataframe
self.dialogues = None
self.dialogues = self.run_speech_to_text()
self.speaking_time = self.compute_speaking_time_allocation()

def __str__(self):
Expand Down Expand Up @@ -40,7 +40,7 @@ def search_gender_tag(self, time): # Give a time in seconds
return gender

def compute_speaking_time_allocation(self):
speaking_time = {'male':0, 'female':0}
speaking_time = {'male': 0, 'female': 0}
dif = pd.Series(self.gendered_audio_seg['end']-self.gendered_audio_seg['start'], name='time_frame')
totaldf = pd.concat([self.gendered_audio_seg['gender'], dif], axis=1)
for i in totaldf.index:
Expand All @@ -50,12 +50,45 @@ def compute_speaking_time_allocation(self):
speaking_time['female'] += float(totaldf['time_frame'][i])
return speaking_time

def decode_speech(self, start_time=None, end_time=None, language="en-US"):
    """Transcribe a slice of ``self.audio`` with Google speech recognition.

    Args:
        start_time: Offset in seconds at which reading starts; ``None``
            starts at the beginning of the file.
        end_time: Time in seconds at which reading stops; ``None`` reads
            up to the end of the file.
        language: Language tag handed to the recognizer, for example
            ``"en-US"`` or ``"fr-FR"``.

    Returns:
        The recognized text, or ``None`` when the recognizer could not
        understand the audio or the recognition service failed.
    """
    recognizer = sr.Recognizer()
    # recognizer.pause_threshold = 3
    # recognizer.dynamic_energy_adjustment_damping = 0.5

    # Treat each bound independently so that supplying only one of them no
    # longer ends in arithmetic on ``None``.
    offset = start_time
    duration = None if end_time is None else end_time - (start_time or 0)

    with sr.WavFile(self.audio) as source:
        audio_data = recognizer.record(source, duration=duration, offset=offset)

    try:
        text = recognizer.recognize_google(audio_data, language=language)
    except (sr.UnknownValueError, sr.RequestError):
        # Only the recognizer's own failures are treated as "no transcript";
        # anything else is a real error and is allowed to propagate.
        print('Sorry.. run again...')
        return None

    print('Converting audio transcripts into text ...')
    return text

def run_speech_to_text(self):
    """Transcribe every gender-tagged audio segment, in French.

    Returns:
        A DataFrame with one row per row of ``self.gendered_audio_seg``:
        its ``gender`` column next to a ``transcription`` column holding
        whatever ``decode_speech`` returned for that segment.
    """
    segments = self.gendered_audio_seg
    transcript = [
        self.decode_speech(start_time=segments['start'][i],
                           end_time=segments['end'][i],
                           language='fr-FR')
        for i in segments.index
    ]
    # Reuse the segment index: a bare Series would get a fresh 0..n-1 index,
    # and concat(axis=1) aligns on labels, which scatters the rows whenever
    # the segments are not numbered 0..n-1.
    transcription = pd.Series(transcript, index=segments.index, name="transcription")
    return pd.concat([segments['gender'], transcription], axis=1)


if __name__ == '__main__':
load_dotenv()
path_to_video = os.getenv("path_to_voice", "./")
movie = Movie(path_to_video)
path_to_video = os.getenv("path_to_extract", "./")
audio = os.getenv("path_to_audio", "./")
movie = Movie(path_to_video, audio)
print(movie.dialogues)
# """Pour convertir en tests :"""
# gender_of_time_45 = movie.search_gender_tag(45) # None
# gender_of_time_60 = movie.search_gender_tag(60) # Male
Expand Down
27 changes: 5 additions & 22 deletions notebooks/audio/media_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
from moviepy.video.io.ffmpeg_tools import ffmpeg_extract_subclip
import moviepy.editor as mp
from dotenv import load_dotenv
import speech_recognition as sr


def cut_and_save(movie_path, start, end, target_name):
Expand All @@ -17,26 +16,9 @@ def separate_voice_and_music(file):
os.system('spleeter separate -o ../../../ -f "{instrument}/{filename}.{codec}" ' + file)


def decode_speech(wave_file, start_time=None, end_time=None, language="en-US"):
    """Transcribe a slice of a WAV file with Google speech recognition.

    Args:
        wave_file: Path of the audio file to read.
        start_time: Offset in seconds at which reading starts; ``None``
            starts at the beginning of the file.
        end_time: Time in seconds at which reading stops; ``None`` reads
            up to the end of the file.
        language: Language tag handed to the recognizer.

    Returns:
        The recognized text, or ``None`` when the recognizer could not
        understand the audio or the recognition service failed.
    """
    recognizer = sr.Recognizer()
    # recognizer.pause_threshold = 3
    # recognizer.dynamic_energy_adjustment_damping = 0.5

    # Treat each bound independently so that supplying only one of them no
    # longer ends in arithmetic on ``None``.
    offset = start_time
    duration = None if end_time is None else end_time - (start_time or 0)

    with sr.WavFile(wave_file) as source:
        audio_data = recognizer.record(source, duration=duration, offset=offset)

    try:
        text = recognizer.recognize_google(audio_data, language=language)
    except (sr.UnknownValueError, sr.RequestError):
        # Only the recognizer's own failures are treated as "no transcript";
        # anything else is a real error and is allowed to propagate.
        print('Sorry.. run again...')
        return None

    print('Converting audio transcripts into text ...')
    return text
def extract_audio_from_movie(file, extension='.wav'):
    """Write the audio track of a video to a file with the same stem.

    Args:
        file: Path of the video to read.
        extension: Suffix, leading dot included, of the audio file written.
    """
    import os  # local import keeps this block self-contained

    clip = import_as_clip(file)
    # splitext removes only the final suffix; splitting on the first '.'
    # truncated paths such as './clip.mp4' (to '') or 'film.v2.mp4'.
    audio_path = os.path.splitext(file)[0] + extension
    clip.audio.write_audiofile(audio_path)


if __name__ == '__main__':
Expand All @@ -46,7 +28,8 @@ def decode_speech(wave_file, start_time=None, end_time=None, language="en-US"):
path_to_extract = os.getenv("path_to_extract", "./")
path_to_trailer = os.getenv("path_to_trailer", "./")

separate_voice_and_music(path_to_extract)
extract_audio_from_movie(path_to_extract)
# separate_voice_and_music(path_to_extract)

# cut_and_save(path_to_full_movie, 2115, 2491, path_to_extract)

Expand Down
105 changes: 105 additions & 0 deletions notebooks/audio/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
absl-py==0.15.0
anyio==3.6.1
appdirs==1.4.4
astunparse==1.6.3
audioread==2.1.9
cachetools==5.2.0
certifi==2022.5.18.1
cffi==1.15.0
charset-normalizer==2.0.12
click==7.1.2
colorama==0.4.4
cycler==0.11.0
decorator==4.4.2
docopt==0.6.2
ffmpeg-python==0.2.0
flatbuffers==1.12
fonttools==4.33.3
future==0.18.2
gast==0.4.0
google-auth==2.7.0
google-auth-oauthlib==0.4.6
google-pasta==0.2.0
grpcio==1.34.1
h11==0.12.0
h2==4.1.0
h5py==3.1.0
hpack==4.0.0
httpcore==0.13.7
httpx==0.19.0
hyperframe==6.0.1
idna==3.3
imageio==2.19.3
imageio-ffmpeg==0.4.7
importlib-metadata==4.11.4
inaSpeechSegmenter==0.7.3
joblib==1.1.0
keras==2.9.0
keras-nightly==2.5.0.dev2021032900
Keras-Preprocessing==1.1.2
kiwisolver==1.4.2
libclang==14.0.1
librosa==0.8.0
llvmlite==0.36.0
Markdown==3.3.7
matplotlib==3.5.2
moviepy==1.0.3
munkres==1.1.4
networkx==2.8.3
norbert==0.2.1
numba==0.53.1
numpy==1.19.5
oauthlib==3.2.0
opt-einsum==3.3.0
packaging==21.3
pandas==1.4.2
Pillow==9.1.1
pooch==1.6.0
proglog==0.1.10
protobuf==3.19.4
pyannote.algorithms==0.8
pyannote.core==4.4
pyannote.parser==0.8
pyasn1==0.4.8
pyasn1-modules==0.2.8
pycparser==2.21
pyparsing==3.0.9
Pyro4==4.82
pytextgrid==0.1.4
python-dateutil==2.8.2
python-dotenv==0.20.0
pytz==2022.1
PyWavelets==1.3.0
requests==2.28.0
requests-oauthlib==1.3.1
resampy==0.2.2
rfc3986==1.5.0
rsa==4.8
scikit-image==0.19.2
scikit-learn==1.1.1
scipy==1.8.1
serpent==1.40
simplejson==3.17.6
six==1.15.0
sniffio==1.2.0
sortedcollections==2.1.0
sortedcontainers==2.4.0
SoundFile==0.10.3.post1
SpeechRecognition==3.8.1
spleeter==2.3.0
tensorboard==2.9.1
tensorboard-data-server==0.6.1
tensorboard-plugin-wit==1.8.1
tensorflow==2.5.0
tensorflow-estimator==2.5.0
termcolor==1.1.0
threadpoolctl==3.1.0
tifffile==2022.5.4
tqdm==4.64.0
typer==0.3.2
typing-extensions==3.7.4.3
urllib3==1.26.9
Werkzeug==2.1.2
wrapt==1.12.1
xarray==2022.3.0
zipp==3.8.0

0 comments on commit a98bb63

Please sign in to comment.