From 51836c5a0b025e1b03c356c7e1f48e9872ae5037 Mon Sep 17 00:00:00 2001 From: ericguizzo Date: Wed, 29 Sep 2021 16:22:21 +0200 Subject: [PATCH] cog-ified --- README.md | 48 ++++++++++++++++++++++++++++++++++++++++++++++-- cog.yaml | 18 ++++++++++++++++++ predict.py | 28 ++++++++++++++++++++++++++++ 3 files changed, 92 insertions(+), 2 deletions(-) create mode 100644 cog.yaml create mode 100644 predict.py diff --git a/README.md b/README.md index b074a629..fa8078bc 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,8 @@ # Speech Emotion Recognition ## Introduction + + + - This repository handles building and training Speech Emotion Recognition System. - The basic idea behind this tool is to build and train/test a suited machine learning ( as well as deep learning ) algorithm that could recognize and detects human emotions from speech. - This is useful for many industry fields such as making product recommendations, affective computing, etc. @@ -7,6 +10,7 @@ ## Requirements - **Python 3.6+** ### Python Packages +- **tensorflow** - **librosa==0.6.3** - **numpy** - **pandas** @@ -38,7 +42,7 @@ Feature extraction is the main part of the speech emotion recognition system. It In this repository, we have used the most used features that are available in [librosa](https://github.com/librosa/librosa) library including: - [MFCC](https://en.wikipedia.org/wiki/Mel-frequency_cepstrum) -- Chromagram +- Chromagram - MEL Spectrogram Frequency (mel) - Contrast - Tonnetz (tonal centroid features) @@ -102,6 +106,7 @@ print("Prediction:", rec.predict("data/tess_ravdess/validation/Actor_25/25_01_01 Prediction: neutral Prediction: sad ``` +You can pass any audio file, if it's not in the appropriate format (16000Hz and mono channel), then it'll be automatically converted, make sure you have `ffmpeg` installed in your system and added to *PATH*. 
## Example 2: Using RNNs for 5 Emotions ```python from deep_emotion_recognition import DeepEmotionRecognizer @@ -143,6 +148,45 @@ true_neutral 3.846154 8.974360 82.051285 2.564103 true_ps 2.564103 0.000000 1.282051 83.333328 12.820514 true_happy 20.512821 2.564103 2.564103 2.564103 71.794876 ``` +## Example 3: Not Passing any Model and Removing the Custom Dataset +Below code initializes `EmotionRecognizer` with 3 chosen emotions while removing Custom dataset, and setting `balance` to `False`: +```python +from emotion_recognition import EmotionRecognizer +# initialize instance, this will take a bit the first time executed +# as it'll extract the features and calls determine_best_model() automatically +# to load the best performing model on the picked dataset +rec = EmotionRecognizer(emotions=["angry", "neutral", "sad"], balance=False, verbose=1, custom_db=False) +# it will be trained, so no need to train this time +# get the accuracy on the test set +print(rec.confusion_matrix()) +# predict angry audio sample +prediction = rec.predict('data/validation/Actor_10/03-02-05-02-02-02-10_angry.wav') +print(f"Prediction: {prediction}") +``` +**Output:** +``` +[+] Best model determined: RandomForestClassifier with 93.454% test accuracy + + predicted_angry predicted_neutral predicted_sad +true_angry 98.275864 1.149425 0.574713 +true_neutral 0.917431 88.073395 11.009174 +true_sad 6.250000 1.875000 91.875000 + +Prediction: angry +``` +You can print the number of samples on each class: +```python +rec.get_samples_by_class() +``` +**Output:** +``` + train test total +angry 910 174 1084 +neutral 650 109 759 +sad 862 160 1022 +total 2422 443 2865 +``` +In this case, the dataset is only from TESS and RAVDESS, and not balanced, you can pass `True` to `balance` on the `EmotionRecognizer` instance to balance the data. 
## Algorithms Used This repository can be used to build machine learning classifiers as well as regressors for the case of 3 emotions {'sad': 0, 'neutral': 1, 'happy': 2} and the case of 5 emotions {'angry': 1, 'sad': 2, 'neutral': 3, 'ps': 4, 'happy': 5} ### Classifiers @@ -207,4 +251,4 @@ plot_histograms(classifiers=True) **Output:** -

A Histogram shows different algorithms metric results on different data sizes as well as time consumed to train/predict.

\ No newline at end of file +

A histogram showing the metric results of different algorithms on different data sizes, as well as the time consumed to train/predict.

diff --git a/cog.yaml b/cog.yaml new file mode 100644 index 00000000..ee4fbd9d --- /dev/null +++ b/cog.yaml @@ -0,0 +1,18 @@ +build: + python_version: "3.6" + gpu: false + python_packages: + - pandas==1.1.5 + - numpy==1.17.3 + - wave==0.0.2 + - sklearn==0.0 + - librosa==0.6.3 + - soundfile==0.9.0 + - tqdm==4.28.1 + - matplotlib==2.2.3 + - pyaudio==0.2.11 + - numba==0.48 + system_packages: + - "ffmpeg" + - "portaudio19-dev" +predict: "predict.py:EmoPredictor" diff --git a/predict.py b/predict.py new file mode 100644 index 00000000..e80fbbeb --- /dev/null +++ b/predict.py @@ -0,0 +1,28 @@ +import json +import os +import tempfile +from pathlib import Path + +import cog +from emotion_recognition import EmotionRecognizer + + +class EmoPredictor(cog.Predictor): + def setup(self): + """Load the emotion recognition model and (quickly) train it""" + # self.rec = EmotionRecognizer(None, emotions=["boredom", "neutral"], features=["mfcc"]) + self.rec = EmotionRecognizer( + None, + emotions=["sad", "neutral", "happy"], + features=["mfcc"], + probability=True, + ) + # evaluate all models in `grid` folder and determine the best one in terms of test accuracy + self.rec.determine_best_model() + + @cog.input("input", type=Path, help="Speech audio file") + def predict(self, input): + """Compute emotion prediction""" + prediction = self.rec.predict_proba(str(input)) + + return prediction