-
Notifications
You must be signed in to change notification settings - Fork 0
/
word_recognition.py
105 lines (84 loc) · 4.36 KB
/
word_recognition.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
import os
import numpy as np
from typing import Tuple, Optional, Dict, List
from feature_extraction import extract_features
from utils import load_speakers_from_folder
from dtw import calculate_dtw_cost
def recognize_word(
    input_file: str,
    sound_database: str,
    mel_cost_threshold: float = 0.2,
    confidence_threshold: float = 0.8,
    input_speaker_name: str = '',
) -> Tuple[Optional[str], str, Dict[str, List[Tuple[str, float]]]]:
    """
    Recognize a word from an input audio file by comparing it with a sound database.

    For each feature type ('mfcc', 'lpc', 'mel') the input is compared against
    every reference recording with ``calculate_dtw_cost``; the costs are averaged
    per word, each feature type votes for its lowest-cost word, and the votes are
    combined with fixed weights (mfcc=2, mel=2, lpc=1).

    Args:
        input_file (str): Path to the input audio file.
        sound_database (str): Path to the folder containing speaker audio files.
            The vocabulary is the set of prefixes before the first '-' in the
            names of the entries directly inside this folder.
        mel_cost_threshold (float): Threshold for Mel spectrogram cost to classify
            a word as unknown. Default is 0.2.
        confidence_threshold (float): Minimum confidence (winning vote weight
            divided by total vote weight) required to classify a word.
            Default is 0.8.
        input_speaker_name (str): Name of the input speaker to exclude from matching.
            Default is an empty string.

    Returns:
        tuple:
            - final_word (Optional[str]): The recognized word or None if confidence is
              below the threshold or Mel cost exceeds the threshold.
            - predicted_word (str): The final predicted word, regardless of confidence.
            - sorted_dtw_results (dict): For each feature type (key), a list of
              (word, mean_cost) tuples sorted by ascending cost. Words with no
              reference recording get a cost of ``np.inf``. Whether the costs are
              normalized depends on ``calculate_dtw_cost`` (not visible here).

    Raises:
        ValueError: If no vocabulary word can be derived from ``sound_database``.
    """
    feature_types = ['mfcc', 'lpc', 'mel']
    weights = {'mfcc': 2, 'mel': 2, 'lpc': 1}

    # Load speaker files and derive the vocabulary from the database entries.
    speaker_files = load_speakers_from_folder(sound_database)
    vocab = np.unique([x.split('-')[0] for x in os.listdir(sound_database)])
    # Build the lookup set once: testing membership against the NumPy array
    # would run an element-wise comparison for every reference file.
    known_words = set(vocab)
    word_costs = {feature: {word: [] for word in vocab} for feature in feature_types}

    # Calculate DTW costs for each feature type.
    for feature_type in feature_types:
        input_features = extract_features(input_file, feature_type)
        costs_for_feature = word_costs[feature_type]
        for speaker in speaker_files:
            # Never match the input against the input speaker's own recordings.
            if input_speaker_name == speaker:
                continue
            for word in speaker_files[speaker]:
                if word not in known_words:
                    continue
                file_path = speaker_files[speaker][word]
                ref_features = extract_features(file_path, feature_type)
                cost = calculate_dtw_cost(input_features, ref_features)
                costs_for_feature[word].append(cost)

    # Without any vocabulary there is nothing to vote for; fail with a clear
    # message instead of the opaque "empty sequence" error raised by min().
    # The check sits here (not earlier) so errors raised while loading the
    # database or the input file still surface first.
    if not known_words:
        raise ValueError(
            f"No vocabulary words found in sound database {sound_database!r}"
        )

    # Mean cost per word for each feature type; words without any reference
    # recording get an infinite cost so they can never win.
    mean_costs = {
        feature: {
            word: (np.mean(costs) if costs else np.inf)
            for word, costs in word_costs[feature].items()
        }
        for feature in feature_types
    }

    # Sort mean costs for each feature type (cheapest first).
    sorted_dtw_results = {
        feature: sorted(mean_costs[feature].items(), key=lambda x: x[1])
        for feature in feature_types
    }

    # Each feature type predicts the word with the lowest mean cost.
    predictions = {
        feature: min(mean_costs[feature], key=mean_costs[feature].get)
        for feature in feature_types
    }

    # Weighted voting. An empty-string word (falsy) is not counted.
    word_scores = {word: 0 for word in vocab}
    for feature_type, predicted_word in predictions.items():
        if predicted_word:
            word_scores[predicted_word] += weights[feature_type]

    # Final word selection based on scores.
    final_word = max(word_scores, key=word_scores.get)
    total_score = sum(word_scores.values())
    confidence = word_scores[final_word] / total_score if total_score > 0 else 0

    # Confidence thresholding.
    if confidence < confidence_threshold:
        return None, final_word, sorted_dtw_results

    # Reject the result when the best Mel cost exceeds the threshold.
    # NOTE: this checks the Mel feature's own best word. With the default
    # confidence threshold (0.8 of a total weight of 5) the winner needs both
    # the mfcc and mel votes, so that word is final_word; with a lower
    # threshold the two can differ.
    mel_best_word = predictions['mel']
    if mel_best_word:
        mel_best_cost = mean_costs['mel'][mel_best_word]
        if mel_best_cost > mel_cost_threshold:
            return None, final_word, sorted_dtw_results

    return final_word, final_word, sorted_dtw_results