diff --git a/src/ASR-with-Speech-Sentiment-Analysis-Text-Summarizer/Speech_Sentiment_Analysis/demo.py b/src/ASR-with-Speech-Sentiment-Analysis-Text-Summarizer/Speech_Sentiment_Analysis/demo.py
deleted file mode 100644
index e879bf8..0000000
--- a/src/ASR-with-Speech-Sentiment-Analysis-Text-Summarizer/Speech_Sentiment_Analysis/demo.py
+++ /dev/null
@@ -1,36 +0,0 @@
-import torch
-from neuralnet.model import neuralnet # Call model instance from neuralnet
-
-from utils import get_features
-
-# Load the trained model
-model = neuralnet(input_size=1, output_shape=6)
-checkpoint = torch.load("model/sentiment-model-19-0.07.ckpt", map_location=torch.device('cpu'))
-
-# Evaluate model
-model.eval()
-model.load_state_dict(checkpoint['state_dict'])
-
-labels = ["angry", "disgust", "fear", "happy", "neutral", "sad"]
-
-# Perform inference
-audio_path = ['sample/' + label + '.wav' for label in labels] # just a sampling loop instead of taking one sample at a time
-
-for audio in audio_path:
-    features = get_features(audio)
-    # print(features.shape, features.dtype)
-    input_tensor = torch.tensor(features, dtype=torch.float32).unsqueeze(0).unsqueeze(1) # Add extra batch dimension and channel dimension
-    # print(input_tensor.shape, input_tensor.dtype)
-
-    with torch.inference_mode():
-        output = model(input_tensor)
-
-    # Convert output to probabilities and get predicted class
-    probabilities = torch.softmax(output, dim=1)
-    predicted_class = torch.argmax(probabilities, dim=1).item()
-
-    # Calculate confidence
-    confidence = probabilities[0, predicted_class].item()
-
-    # Print predicted class
-    print(f"Path: {audio}\nPredicted class: {labels[predicted_class]} --- Confidence: {confidence:.3f}\n")
\ No newline at end of file
diff --git a/src/ASR-with-Speech-Sentiment-Analysis-Text-Summarizer/Speech_Sentiment_Analysis/engine.py b/src/ASR-with-Speech-Sentiment-Analysis-Text-Summarizer/Speech_Sentiment_Analysis/engine.py
new file mode 100644
index 0000000..7c533d4
--- /dev/null
+++ b/src/ASR-with-Speech-Sentiment-Analysis-Text-Summarizer/Speech_Sentiment_Analysis/engine.py
@@ -0,0 +1,64 @@
+import numpy as np
+import joblib
+import librosa
+import torch
+
+from sklearn.preprocessing import StandardScaler
+from neuralnet.model import HybridModel
+from feature import getMELspectrogram, splitIntoChunks
+
+
+EMOTIONS = {
+    1: 'neutral',
+    2: 'calm',
+    3: 'happy',
+    4: 'sad',
+    5: 'angry',
+    6: 'fear',
+    7: 'disgust',
+    0: 'surprise'
+}
+
+scaler = StandardScaler()
+model = HybridModel(len(EMOTIONS))
+model.load_state_dict(torch.load("model/speech_sentiment.pt", map_location=torch.device('cpu')))
+SAMPLE_RATE = 48000
+scaler = joblib.load('model/scaler.pkl')
+
+def process_audio(audio_file_path):
+    global scaler
+    chunked_spec = []
+
+    # Load audio file
+    audio, sample_rate = librosa.load(audio_file_path, sr=SAMPLE_RATE, duration=3)
+    signal = np.zeros((int(SAMPLE_RATE * 3),))
+    signal[:len(audio)] = audio
+    mel_spectrogram = getMELspectrogram(signal, SAMPLE_RATE)
+    chunks = splitIntoChunks(mel_spectrogram, win_size=128, stride=64)
+
+    chunked_spec.append(chunks)
+    chunks = np.stack(chunked_spec, axis=0)
+    chunks = np.expand_dims(chunks, axis=2)
+
+    # Reshape the chunks
+    chunks = np.reshape(chunks, newshape=(1, -1))
+    chunks_scaled = scaler.transform(chunks)
+    chunks_scaled = np.reshape(chunks_scaled, newshape=(1, 7, 1, 128, 128))
+
+    # Convert to tensor for model input
+    chunks_tensor = torch.tensor(chunks_scaled).float()
+
+    # Model inference
+    with torch.inference_mode():
+        model.eval()
+        _, output_softmax, _ = model(chunks_tensor)
+        predictions = torch.argmax(output_softmax, dim=1)
+        print(predictions)
+        predicted_emotion = EMOTIONS[predictions.item()]
+
+    print(f"Predicted Emotion: {predicted_emotion}")
+    return predicted_emotion
+
+file_path = "fear.wav"
+process_audio(file_path)
+
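The new engine.py above feeds HybridModel a tensor of shape (1, 7, 1, 128, 128): one batch item, seven time chunks, one channel, and 128 x 128 mel-spectrogram patches. Since the scaler loaded from model/scaler.pkl presumably expects flattened vectors, the chunks are reshaped to (1, -1) before scaler.transform and then reshaped back. A minimal sketch of that round trip, with a freshly fitted StandardScaler standing in for the pickled one:

# Illustration only: the flatten -> scale -> reshape round trip used in engine.py.
import numpy as np
from sklearn.preprocessing import StandardScaler

chunks = np.random.rand(1, 7, 1, 128, 128).astype(np.float32)  # stand-in for the stacked mel chunks
flat = chunks.reshape(1, -1)                                    # (1, 114688) feature vector per clip
scaler = StandardScaler().fit(flat)                             # engine.py loads the fitted scaler from model/scaler.pkl instead
scaled = scaler.transform(flat).reshape(1, 7, 1, 128, 128)      # back to the model's expected input shape
assert scaled.shape == (1, 7, 1, 128, 128)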
diff --git a/src/ASR-with-Speech-Sentiment-Analysis-Text-Summarizer/Speech_Sentiment_Analysis/chunk.py b/src/ASR-with-Speech-Sentiment-Analysis-Text-Summarizer/Speech_Sentiment_Analysis/feature.py
similarity index 100%
rename from src/ASR-with-Speech-Sentiment-Analysis-Text-Summarizer/Speech_Sentiment_Analysis/chunk.py
rename to src/ASR-with-Speech-Sentiment-Analysis-Text-Summarizer/Speech_Sentiment_Analysis/feature.py
diff --git a/src/ASR-with-Speech-Sentiment-Analysis-Text-Summarizer/Speech_Sentiment_Analysis/main.py b/src/ASR-with-Speech-Sentiment-Analysis-Text-Summarizer/Speech_Sentiment_Analysis/main.py
deleted file mode 100644
index ebfb4ea..0000000
--- a/src/ASR-with-Speech-Sentiment-Analysis-Text-Summarizer/Speech_Sentiment_Analysis/main.py
+++ /dev/null
@@ -1,121 +0,0 @@
-import librosa
-import numpy as np
-import torch
-from IPython.display import Audio, display
-import os
-import joblib
-from architecture import HybridModel # Import your model architecture
-from chunk_and_spectogram import getMELspectrogram, splitIntoChunks # Import the functions
-
-EMOTIONS = {
-    1: 'neutral',
-    2: 'calm',
-    3: 'happy',
-    4: 'sad',
-    5: 'angry',
-    6: 'fear',
-    7: 'disgust',
-    0: 'surprise'
-}
-
-# Load your trained model
-LOAD_PATH = os.path.join(os.getcwd(), 'models')
-model = HybridModel(len(EMOTIONS))
-
-# UNCOMMENT THE CODE LINE 1 TO 2 AND COMMENT THE CODE BELOW LINE 3 TO 4 IF YOU PLAN ON RUNNING THE MODEL ON GPU
-
-# 1
-# Load model weights and move to the appropriate device
-# model.load_state_dict(torch.load(os.path.join(LOAD_PATH, '/content/speech_sentiment_asr.pt')))
-# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-# model.to(device) # Move the model to the GPU or keep it on CPU
-# print('Model is loaded from {}'.format(os.path.join(LOAD_PATH, 'speech_sentiment_asr.pt')))
-# 2
-
-# 3
-# Load model weights and move to the appropriate device (CPU version)
-model.load_state_dict(torch.load(os.path.join(LOAD_PATH, '/content/speech_sentiment_asr.pt'), map_location=torch.device('cpu')))
-device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-model.to(device) # Move the model to the GPU or keep it on CPU
-print('Model is loaded from {}'.format(os.path.join(LOAD_PATH, 'speech_sentiment_asr.pt')))
-# 4
-
-SAMPLE_RATE = 48000
-DURATION = 3
-NUM_MEL_BINS = 128
-
-# Load your fitted scaler
-scaler = joblib.load('/content/scaler.pkl')
-
-def process_audio(audio_file_path):
-    """
-    Process the audio file, convert to MEL spectrogram, split into chunks, scale and make predictions.
-    """
-    # Load audio file
-    audio, sample_rate = librosa.load(audio_file_path, sr=SAMPLE_RATE)
-
-    # Ensure the audio length is the desired target length
-    target_length = SAMPLE_RATE * DURATION
-    if len(audio) > target_length:
-        audio = audio[:target_length]
-    else:
-        audio = np.pad(audio, (0, target_length - len(audio)), 'constant')
-
-    # Compute MEL spectrogram
-    mel_spectrogram = getMELspectrogram(audio, SAMPLE_RATE)
-    print(f"Mel Spectrogram Shape: {mel_spectrogram.shape}")
-
-    # Split into chunks
-    chunks = splitIntoChunks(mel_spectrogram, win_size=128, stride=64)
-    print(f"Chunks Shape Before Scaling: {chunks.shape}")
-
-    # Pad or truncate to 7 chunks
-    num_chunks = chunks.shape[0]
-    print(f"Number of Chunks: {num_chunks}")
-    if num_chunks < 7:
-        padding = np.zeros((7 - num_chunks, 128, 128))
-        chunks = np.concatenate((chunks, padding), axis=0)
-    elif num_chunks > 7:
-        chunks = chunks[:7]
-
-    # Prepare chunks for model input
-    chunks = chunks[np.newaxis, :] # Add batch dimension
-    chunks = np.expand_dims(chunks, axis=1) # Add channel dimension (for CNN)
-    chunks_reshaped = chunks.reshape(1, 7, 1, 128, 128)
-    print(f"Chunks Shape After Reshaping: {chunks_reshaped.shape}")
-
-    # Scale the chunks
-    chunks_scaled = scaler.transform(chunks_reshaped.reshape(1, -1))
-    chunks_scaled = chunks_scaled.reshape(1, 7, 1, 128, 128)
-    print(f"Chunks Shape After Scaling: {chunks_scaled.shape}")
-
-    # Convert to tensor for model input
-    chunks_tensor = torch.tensor(chunks_scaled, device=device).float()
-
-    # Make predictions with the model
-    with torch.no_grad():
-        model.eval()
-        _, output_softmax, _ = model(chunks_tensor)
-        predictions = torch.argmax(output_softmax, dim=1)
-        predicted_emotion = EMOTIONS[predictions.item()]
-
-    # Display the audio
-    display(Audio(audio_file_path))
-
-    # Print the predicted emotion
-    print(f"Predicted Emotion: {predicted_emotion}")
-
-    return predicted_emotion
-
-# Take input audio file from user
-
-print("NOTE: IT YOU HAVE MP3 FILE, THEN PLEASE RUN MP3_TO_WAV.PY SCRIPT TO CONVERT MP3 TO WAV FIRST, THEN ONLY RUN TIS SCRIPT")
-
-print("\n")
-print("\n")
-print("\n")
-
-file_path = input("Enter the path to your .wav file: ")
-
-# Process the audio and predict emotion
-process_audio(file_path)
diff --git a/src/ASR-with-Speech-Sentiment-Analysis-Text-Summarizer/Speech_Sentiment_Analysis/utils.py b/src/ASR-with-Speech-Sentiment-Analysis-Text-Summarizer/Speech_Sentiment_Analysis/utils.py
deleted file mode 100644
index ddd77a5..0000000
--- a/src/ASR-with-Speech-Sentiment-Analysis-Text-Summarizer/Speech_Sentiment_Analysis/utils.py
+++ /dev/null
@@ -1,42 +0,0 @@
-""" Function Script for preprocessing audio data and extract features """
-
-import librosa
-import numpy as np
-
-
-# Zero Crossing Rate
-# Reference: https://librosa.org/doc/latest/generated/librosa.feature.zero_crossing_rate.html#librosa.feature.zero_crossing_rate
-def zcr(data, frame_length=2048, hop_length=512):
-    zcr = librosa.feature.zero_crossing_rate(y=data, frame_length=frame_length, hop_length=hop_length)
-    return np.squeeze(zcr)
-
-# RMS Energy
-# Reference: https://librosa.org/doc/latest/generated/librosa.feature.rms.html#librosa.feature.rms
-def rmse(data, frame_length=2048, hop_length=512):
-    rmse = librosa.feature.rms(y=data, frame_length=frame_length, hop_length=hop_length)
-    return np.squeeze(rmse)

-# MFCC
-# Reference: https://librosa.org/doc/latest/generated/librosa.feature.mfcc.html
-def mfcc(data, sr, frame_length=2048, hop_length=512, flatten: bool = True):
-    mfcc_feature = librosa.feature.mfcc(y=data, sr=sr)
-    return np.squeeze(mfcc_feature.T) if not flatten else np.ravel(mfcc_feature.T)
-
-# Feature Extraction of ZCR, RMS, MFCC
-def extract_features(data, sr, frame_length=2048, hop_length=512):
-    result = np.array([])
-    result = np.hstack((result,
-                        zcr(data, frame_length, hop_length),
-                        rmse(data, frame_length, hop_length),
-                        mfcc(data, sr, frame_length, hop_length)
-                        ))
-    return result
-
-
-""" Original """
-def get_features(path):
-    data, sampling_rate = librosa.load(path, duration=2.5, offset=0.6)
-    # print('Data and sampling rate:', data.shape, sampling_rate)
-    result = extract_features(data, sampling_rate)
-    # print(result.shape)
-    return result
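The deleted demo.py/utils.py path classified flattened ZCR, RMS and MFCC vectors, whereas the new engine.py relies on the chunked mel spectrograms provided by feature.py (renamed from chunk.py; its contents are not shown in this diff). A minimal sketch of helpers with the same call signatures, assuming 128 mel bins and a hop length that yields seven 128-frame windows from a 3-second, 48 kHz clip; the real implementations in feature.py may differ:

# Hypothetical stand-ins for the feature.py helpers imported by engine.py.
import numpy as np
import librosa

def getMELspectrogram(audio, sample_rate):
    # 128 mel bins to match the 128 x 128 chunks used above (assumed parameters)
    mel = librosa.feature.melspectrogram(y=audio, sr=sample_rate, n_mels=128,
                                         n_fft=1024, hop_length=256)
    return librosa.power_to_db(mel, ref=np.max)

def splitIntoChunks(mel_spec, win_size, stride):
    # Slide a win_size-frame window across time with the given stride
    frames = mel_spec.shape[1]
    starts = range(0, frames - win_size + 1, stride)
    return np.stack([mel_spec[:, s:s + win_size] for s in starts])

With these assumed parameters, a 3-second clip at 48 kHz gives roughly 563 frames, and a 128-frame window with stride 64 produces exactly the seven chunks that engine.py reshapes into (1, 7, 1, 128, 128).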