Feature/voice analysis #30

Open · wants to merge 25 commits into base: main

Changes from 1 commit · 25 commits total

Commits
5ec016a
Adapt artpech's imports for ina speech segmenter to python and poetry…
Lokhia Jun 6, 2022
b0d00fd
Gendered audio segmentation based on inaSpeechSegmenter and some expe…
Lokhia Jun 10, 2022
db5f243
Speaking time according to gender implemented
Lokhia Jun 10, 2022
eac60f9
Speech to text implementation
Lokhia Jun 10, 2022
1429e72
Extract to csv now available
Lokhia Jun 10, 2022
f9ffc8e
Doing full movie pipeline
Lokhia Jun 11, 2022
bfb61a1
Add Whisper automatic speech recognition
DnzzL Feb 2, 2023
fe3f487
Extract from notebook
DnzzL Feb 1, 2023
29a6e08
[FIX] Output length + types
DnzzL Feb 15, 2023
ff0f003
Small renaming and adding docstrings
Lokhia Feb 22, 2023
63d235d
Poetry update and some audio tests
Lokhia Feb 22, 2023
35043f4
Refactoring audio processing package - transcriber management
Lokhia Feb 22, 2023
3b95203
Refactoring audio processing - gender segmenter management
Lokhia Feb 23, 2023
e6f0b5f
Refactoring audio processing - dialogue tagger management
Lokhia Feb 23, 2023
9da1078
Refactoring audio processing - Audio Processor, main and poetry depen…
Lokhia Feb 23, 2023
7dee943
Merge remote-tracking branch 'origin/main' into feature/voice_analysis
Lokhia Feb 23, 2023
4ca2cd4
Archive previous audio notebook work
Lokhia Feb 26, 2023
907cdfe
Update and properly merge pyproject from main
Lokhia Feb 26, 2023
493e531
Minor changes in speech to text to make it work with default API key :)
Lokhia Feb 26, 2023
70d32bb
Updating poetry lock and toml
Lokhia Feb 27, 2023
1cfab8f
Transform all audio code into a functional library with tutorial
Lokhia Mar 7, 2023
c620f9a
Include US English profile and tutorial
Lokhia Mar 7, 2023
af4dfd0
Added whisper API in the pipeline
TheoLvs Mar 8, 2023
73e5269
Updated demo
TheoLvs Mar 27, 2023
2bda1d5
Remove deprecated code
DnzzL Oct 17, 2023
Extract from notebook
Add comments
DnzzL committed Feb 13, 2023
commit fe3f487a6229571a786f5764a80a7898206aaf83
86 changes: 86 additions & 0 deletions bechdelai/audio/gender_identifier.py
@@ -0,0 +1,86 @@
import pandas as pd
import speech_recognition as sr
from dotenv import load_dotenv
from inaSpeechSegmenter import Segmenter


class GenderAudioIdentifier:
def __init__(self, path_to_file, path_to_audio):
self.title = path_to_file.split(sep='\\')[-1].split(sep='.')[0]
self.media = path_to_file
self.audio = path_to_audio
self.gendered_audio_seg = self.segment() # Dataframe
self.dialogues = self.run_speech_to_text()
self.speaking_time = self.compute_speaking_time_allocation()

def __str__(self):
return "Film : {}".format(self.title)

def __repr__(self):
return self.title

def segment(self):
seg = Segmenter(vad_engine='sm', energy_ratio=0.05)
# energy ratio : the higher, the more selective ; vad_engine : works better with sm than smn
segment = seg(self.media)
return pd.DataFrame(list(filter(lambda x: x[0] == 'male' or x[0] == 'female', segment)),
columns=['gender', 'start', 'end'])

    def search_gender_tag(self, time: int):
        """Return the gender tag of the segment containing `time` (in seconds), or None."""
        if time > self.gendered_audio_seg['end'].tail(1).item():
            return None
        gender = None
        for i in self.gendered_audio_seg.index:
            if self.gendered_audio_seg['start'][i] < time < self.gendered_audio_seg['end'][i]:
                gender = self.gendered_audio_seg['gender'][i]
        return gender

def compute_speaking_time_allocation(self):
speaking_time = {'male': 0, 'female': 0}
dif = pd.Series(self.gendered_audio_seg['end'] - self.gendered_audio_seg['start'], name='time_frame')
totaldf = pd.concat([self.gendered_audio_seg['gender'], dif], axis=1)
for i in totaldf.index:
if totaldf['gender'][i] == 'male':
speaking_time['male'] += float(totaldf['time_frame'][i])
if totaldf['gender'][i] == 'female':
speaking_time['female'] += float(totaldf['time_frame'][i])
return speaking_time

def decode_speech(self, start_time=None, end_time=None, language="en-US"):
r = sr.Recognizer()
# r.pause_threshold = 3
# r.dynamic_energy_adjustment_damping = 0.5
# language can be "fr-FR"

with sr.WavFile(self.audio) as source:
if start_time is None and end_time is None:
audio_text = r.record(source)
else:
audio_text = r.record(source, duration=end_time - start_time, offset=start_time)

        # recognize_google() raises RequestError if the API is unreachable
        # and UnknownValueError if the speech cannot be transcribed
        try:
            # use the Google Web Speech API
            print('Converting audio transcript into text ...')
            text = r.recognize_google(audio_text, language=language)
            return text

        except (sr.RequestError, sr.UnknownValueError):
            print('Speech recognition failed, please run again.')

def run_speech_to_text(self):
transcript = []
for i in self.gendered_audio_seg.index:
transcript.append(self.decode_speech(start_time=self.gendered_audio_seg['start'][i],
end_time=self.gendered_audio_seg['end'][i],
language='fr-FR'))
transcription = pd.concat([self.gendered_audio_seg['gender'], pd.Series(transcript, name="transcription")],
axis=1)
return transcription

def export_to_csv(self, file_path: str):
result = pd.concat([self.gendered_audio_seg, self.dialogues['transcription']], axis=1)
result.to_csv(path_or_buf=file_path, sep=";", header=True, index=False)
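
A minimal usage sketch for review purposes (the paths are placeholders; note that segmentation, speech-to-text and speaking-time computation all run eagerly in __init__, so construction can take several minutes on a full movie):

from bechdelai.audio.gender_identifier import GenderAudioIdentifier

identifier = GenderAudioIdentifier(
    path_to_file="movies/trailer.mp4",    # media file fed to inaSpeechSegmenter
    path_to_audio="movies/trailer.wav",   # extracted audio fed to speech_recognition
)

print(identifier.speaking_time)           # e.g. {'male': 83.4, 'female': 21.7}
print(identifier.search_gender_tag(42))   # gender speaking at t=42s, or None
identifier.export_to_csv("trailer_voice_analysis.csv")
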
37 changes: 37 additions & 0 deletions bechdelai/audio/speech_recognition.py
@@ -0,0 +1,37 @@
from transformers import pipeline


class SpeechRecognition:
"""Speech recognition model for audio files."""

def __init__(self, model_name="openai/whisper-small"):
"""Initialize speech recognition model.

Args:
language (str): target language
task (str): transcribe for same language or translate to another language
model_name (str): Whisper model name. Defaults to "openai/whisper-small".
"""
self.pipe = pipeline(
task="automatic-speech-recognition",
model=model_name,
chunk_length_s=30,
stride_length_s=(5, 5),
return_timestamps=True,
)

def transcribe(self, audio_path, language, task="transcribe"):
"""Transcribe audio file.

Args:
audio_path (str): Path to the audio file
language (str): target language
task (str): transcribe for same language or translate to another language

Returns:
Dict: Transcribed text
"""
self.pipe.model.config.forced_decoder_ids = (
self.pipe.tokenizer.get_decoder_prompt_ids(language=language, task=task)
)
return self.pipe(audio_path)
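
A minimal usage sketch for the new wrapper, matching the call made in the notebook below (the audio path is a placeholder; the Hugging Face pipeline downloads openai/whisper-small on first use):

from bechdelai.audio.speech_recognition import SpeechRecognition

sr = SpeechRecognition()                       # loads openai/whisper-small
result = sr.transcribe("trailer.wav", language="fr", task="transcribe")

print(result["text"])                          # full transcript
for chunk in result["chunks"]:                 # timestamped segments (return_timestamps=True)
    print(chunk["timestamp"], chunk["text"])
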
50 changes: 50 additions & 0 deletions bechdelai/audio/utils.py
@@ -0,0 +1,50 @@
import os

import moviepy.editor as mp
from moviepy.video.io.ffmpeg_tools import ffmpeg_extract_subclip


def cut_and_save(movie_path: str, start: float, end: float, target_name: str) -> None:

Collaborator comment: We should add all of these video utils to the bechdelai.video.video.Video class.

Collaborator comment: Same as before, we can just drop this for now; these are just functions I occasionally needed.

Collaborator comment: Any update? :)

"""This function cuts a video from the start to the end time and saves it as target_name.

Args:
movie_path (str): The path to the video file.
start (float): The start time in seconds.
end (float): The end time in seconds.
target_name (str): The file name of the new video file.

Returns:
None
"""
return ffmpeg_extract_subclip(movie_path, start, end, targetname=target_name)


def import_as_clip(path_to_video: str) -> mp.VideoFileClip:
"""Imports a video file as a VideoFileClip object.

Args:
path_to_video (str): Path to a video file.

Returns:
mp.VideoFileClip: VideoFileClip object.
"""
return mp.VideoFileClip(path_to_video)


def separate_voice_and_music(file: str) -> None:
    """Split an audio file into voice and accompaniment stems using spleeter.

    Note: does not work on files longer than 700 seconds.
    """
    os.system('spleeter separate -d 700.0 -o ../../../ -f "{instrument}/{filename}.{codec}" ' + file)


def extract_audio_from_movie(file: str, extension: str = '.wav') -> None:
"""Extract the audio from a movie and save it to a file.

The audio is saved in the same directory as the movie.

Args:
file (str): The name of the movie file to extract the audio from.
extension (str): The file extension of the audio file to save.
"""
clip = import_as_clip(file)
clip.audio.write_audiofile(file.split(sep='.')[0] + extension)
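
A short sketch of how these helpers chain together (file names are placeholders; spleeter must be installed and on the PATH for separate_voice_and_music):

from bechdelai.audio.utils import cut_and_save, extract_audio_from_movie, import_as_clip

movie = "movies/trailer.mp4"                                  # placeholder path

clip = import_as_clip(movie)
print(clip.duration)                                          # full length in seconds

cut_and_save(movie, start=0, end=60, target_name="movies/trailer_first_minute.mp4")
extract_audio_from_movie("movies/trailer_first_minute.mp4")   # writes movies/trailer_first_minute.wav
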
162 changes: 162 additions & 0 deletions notebooks/audio/whisper_example.ipynb
@@ -0,0 +1,162 @@
{
Collaborator comment: The notebook is out of date, isn't it? The imports no longer work.

Collaborator comment: No idea, @DnzzL?

"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"%load_ext autoreload\n",
"%autoreload 2"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from bechdelai.data.youtube import download_youtube_video"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"youtube_trailer_url = \"https://www.youtube.com/watch?v=EzWIsGqeoVQ\"\n",
"output_filename = \"raid.mp4\"\n",
"youtube_language = \"fr-FR\""
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Task Completed!\n"
]
}
],
"source": [
"download_youtube_video(youtube_trailer_url, output_filename, youtube_language)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/thomas/miniconda3/envs/bechdelai/lib/python3.9/site-packages/transformers/generation/utils.py:1273: UserWarning: Neither `max_length` nor `max_new_tokens` has been set, `max_length` will default to 448 (`generation_config.max_length`). Controlling `max_length` via the config is deprecated and `max_length` will be removed from the config in v5 of Transformers -- we recommend using `max_new_tokens` to control the maximum length of the generation.\n",
" warnings.warn(\n"
]
},
{
"data": {
"text/plain": [
"{'text': \" Leur Raid, l'élite de la police, des super-agences sur-entraînées. Leur devise, servir sans faillir. Suivant. Bonjour. Ah, c'est pour le secrétairelien, l'infirmiereau ou la cantine ? Je suis là pour le groupe d'intervention du Raid. C'est un danger pour le groupe, une gonzesse. Une femme froissale. Ça crie, ça chiale, ça se pète les ondes, ça se poince les cheveux dans le casque. Bon, on a une femme, c'est comme ça. Oh non ! Puis ça, tout le temps, envie de pisser. On n'a pas les combinaisons adaptées pour les pisseuses. Effectivement, vous n'êtes pas du tout macho. Je me suis trompé. Mais ? Vous êtes 16. Y a 4 lits par chambre. Faites-moi tout de suite 4 groupes de 4, s'il vous plaît. Alors ça, c'est un groupe de 16. I've been looking for her file. She's got very high quality. Do you know that? That's annoying. Hide it. And the fact that she wears the same name as the Minister of the Interior? But it has nothing to do with it. It's her daughter, but it has no relationship. I warn you, on the first occasion, I'll fire her. We have to go get her. No, it's good, look, she's still making bubbles. En casion, je la fiers. Il faut aller la chercher là. Non, c'est beau, regarde, elle fait encore des bulles. Pardon ! Elle a, elle est éliminée, elle a... Ah bah non. L'ennemi est neutralisé là. She's finished. Oh, no. The enemy is neutralized.\",\n",
" 'chunks': [{'text': \" Leur Raid, l'élite de la police, des super-agences sur-entraînées.\",\n",
" 'timestamp': (0.0, 5.0)},\n",
" {'text': ' Leur devise, servir sans faillir.', 'timestamp': (5.0, 8.0)},\n",
" {'text': ' Suivant.', 'timestamp': (8.0, 9.0)},\n",
" {'text': ' Bonjour.', 'timestamp': (9.0, 10.0)},\n",
" {'text': \" Ah, c'est pour le secrétairelien, l'infirmiereau ou la cantine ?\",\n",
" 'timestamp': (10.0, 12.0)},\n",
" {'text': \" Je suis là pour le groupe d'intervention du Raid.\",\n",
" 'timestamp': (12.0, 14.0)},\n",
" {'text': \" C'est un danger pour le groupe, une gonzesse.\",\n",
" 'timestamp': (14.0, 16.0)},\n",
" {'text': ' Une femme froissale.', 'timestamp': (16.0, 17.0)},\n",
" {'text': ' Ça crie, ça chiale, ça se pète les ondes, ça se poince les cheveux dans le casque.',\n",
" 'timestamp': (17.0, 22.0)},\n",
" {'text': \" Bon, on a une femme, c'est comme ça.\", 'timestamp': (22.0, 24.0)},\n",
" {'text': ' Oh non ! Puis ça, tout le temps, envie de pisser.',\n",
" 'timestamp': (24.0, 26.0)},\n",
" {'text': \" On n'a pas les combinaisons adaptées pour les pisseuses.\",\n",
" 'timestamp': (26.0, 28.0)},\n",
" {'text': \" Effectivement, vous n'êtes pas du tout macho.\",\n",
" 'timestamp': (28.0, 30.0)},\n",
" {'text': ' Je me suis trompé.', 'timestamp': (30.0, 31.0)},\n",
" {'text': ' Mais ?', 'timestamp': (31.0, 32.0)},\n",
" {'text': ' Vous êtes 16.', 'timestamp': (32.0, 33.0)},\n",
" {'text': ' Y a 4 lits par chambre.', 'timestamp': (33.0, 35.0)},\n",
" {'text': \" Faites-moi tout de suite 4 groupes de 4, s'il vous plaît.\",\n",
" 'timestamp': (35.0, 37.0)},\n",
" {'text': \" Alors ça, c'est un groupe de 16.\", 'timestamp': (37.0, 42.0)},\n",
" {'text': \" I've been looking for her file. She's got very high quality.\",\n",
" 'timestamp': (42.0, 45.0)},\n",
" {'text': ' Do you know that?', 'timestamp': (45.0, 46.0)},\n",
" {'text': \" That's annoying.\", 'timestamp': (46.0, 47.0)},\n",
" {'text': ' Hide it.', 'timestamp': (47.0, 48.0)},\n",
" {'text': ' And the fact that she wears the same name as the Minister of the Interior?',\n",
" 'timestamp': (48.0, 50.0)},\n",
" {'text': ' But it has nothing to do with it.', 'timestamp': (50.0, 51.0)},\n",
" {'text': \" It's her daughter, but it has no relationship.\",\n",
" 'timestamp': (51.0, 53.0)},\n",
" {'text': \" I warn you, on the first occasion, I'll fire her.\",\n",
" 'timestamp': (53.0, 55.0)},\n",
" {'text': ' We have to go get her.', 'timestamp': (55.0, 56.0)},\n",
" {'text': \" No, it's good, look, she's still making bubbles.\",\n",
" 'timestamp': (56.0, 59.0)},\n",
" {'text': ' En casion, je la fiers.', 'timestamp': (59.0, 60.0)},\n",
" {'text': ' Il faut aller la chercher là.', 'timestamp': (60.0, 61.0)},\n",
" {'text': \" Non, c'est beau, regarde, elle fait encore des bulles.\",\n",
" 'timestamp': (61.0, 63.0)},\n",
" {'text': ' Pardon !', 'timestamp': (63.0, 63.5)},\n",
" {'text': ' Elle a, elle est éliminée, elle a...', 'timestamp': (63.5, 65.0)},\n",
" {'text': ' Ah bah non.', 'timestamp': (65.0, 65.5)},\n",
" {'text': \" L'ennemi est neutralisé là.\", 'timestamp': (65.5, 66.5)},\n",
" {'text': \" She's finished.\", 'timestamp': (66.5, 68.3)},\n",
" {'text': ' Oh, no. The enemy is neutralized.', 'timestamp': (68.3, 70.7)}]}"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from bechdelai.audio.speech_recognition import SpeechRecognition\n",
"\n",
"sr = SpeechRecognition()\n",
"sr.transcribe(output_filename, \"fr\")\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "bechdelai",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.16"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "31ffc711ab2ee07bd298f523dc1dd63ebc15cb1e136e0e7de381fff9c93dfdff"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -6,7 +6,7 @@ authors = ["Théo Alves Da Costa <[email protected]>"]
license = "MIT"

[tool.poetry.dependencies]
python = ">=3.8,<3.10"
python = ">=3.8,<3.11"
jupyter = "^1.0.0"
pandas = "^1.3.4"
numpy = "^1.21.3"