Skip to content

Commit

Permalink
docs: adding PESQ example to gallery
Browse files Browse the repository at this point in the history
  • Loading branch information
Borda committed Sep 18, 2024
1 parent fd11f33 commit b42f3bb
Show file tree
Hide file tree
Showing 3 changed files with 125 additions and 5 deletions.
113 changes: 113 additions & 0 deletions examples/audio/pesq.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
"""
PESQ Metric Calculation for Speech Enhancement
==============================================
In this notebook, we will calculate the Perceptual Evaluation of Speech Quality (PESQ) score
to assess the improvement in speech quality after applying a basic noise reduction technique.
PESQ is widely used in speech enhancement, telecommunications, and VoIP to evaluate the
perceived quality of speech signals.
"""

#%%
# Import necessary libraries
import torch
import torchaudio
from torchmetrics.audio import PerceptualEvaluationSpeechQuality
import matplotlib.pyplot as plt
import numpy as np

#%%
# Generate Synthetic Clean and Noisy Audio Signals
# We'll generate a clean sine wave (representing a clean speech signal) and add white noise to simulate the noisy version.

def generate_sine_wave(frequency, duration, sample_rate, amplitude=0.5):
    """Return a pure sine tone of the requested frequency and duration.

    Args:
        frequency: tone frequency in Hz.
        duration: clip length in seconds.
        sample_rate: samples per second.
        amplitude: peak amplitude of the generated tone.
    """
    num_samples = int(sample_rate * duration)
    time_axis = torch.linspace(0, duration, num_samples)
    return amplitude * torch.sin(2 * np.pi * frequency * time_axis)

def add_noise(waveform, noise_factor=0.05):
    """Return ``waveform`` corrupted with scaled white Gaussian noise.

    Args:
        waveform: input audio tensor.
        noise_factor: scale applied to the standard-normal noise samples.
    """
    white_noise = torch.randn(waveform.size())
    return waveform + noise_factor * white_noise

# Parameters for the synthetic audio clip used throughout the example.
sample_rate = 16000 # 16 kHz, a typical sampling rate for speech processing
duration = 3 # clip length in seconds
frequency = 440 # A4 note; stands in for a simple speech-like tone

# Clean reference signal: a pure sine tone.
clean_waveform = generate_sine_wave(frequency, duration, sample_rate)

# Degraded signal: the same tone with additive white Gaussian noise.
noisy_waveform = add_noise(clean_waveform)


#%%
# Apply Basic Noise Reduction Technique
# In this step, we apply a simple spectral gating method for noise reduction using torchaudio's
# `spectrogram` method. This is to simulate the enhancement of noisy speech.

def reduce_noise(noisy_signal, sample_rate, threshold=0.2):
    """Suppress noise via simple spectral gating and resynthesize the audio.

    Spectrogram bins whose value does not exceed ``threshold`` are zeroed,
    then the gated spectrogram is inverted back to a waveform with the
    Griffin-Lim algorithm.

    Args:
        noisy_signal: the degraded audio tensor.
        sample_rate: sampling rate in Hz (accepted for API symmetry; not
            used by this simple implementation).
        threshold: cutoff below which spectrogram bins are silenced.
    """
    to_spectrogram = torchaudio.transforms.Spectrogram()
    to_waveform = torchaudio.transforms.GriffinLim()

    spectrogram = to_spectrogram(noisy_signal)
    # Hard gate: keep only bins strictly above the threshold.
    gated = spectrogram * (spectrogram > threshold)
    return to_waveform(gated)


# Run the noisy signal through the spectral-gating denoiser defined above.
enhanced_waveform = reduce_noise(noisy_waveform, sample_rate)

#%%
# Initialize the PESQ Metric
# PESQ can be computed in two modes: 'wb' (wideband) or 'nb' (narrowband).
# Here, we are using 'wb' mode for wideband speech quality evaluation.
pesq_metric = PerceptualEvaluationSpeechQuality(fs=sample_rate, mode='wb')

#%%
# Compute PESQ Scores
# We will calculate the PESQ scores for both the noisy and enhanced versions
# compared to the clean signal.
# torchmetrics expects ``(preds, target)``: the degraded/processed signal
# first and the clean reference second. Higher scores indicate better quality.

# PESQ requires both signals to have the same number of samples, and the
# Griffin-Lim resynthesis is not guaranteed to return exactly the original
# length, so trim all signals to a common length before scoring.
common_len = min(clean_waveform.shape[-1], enhanced_waveform.shape[-1])

pesq_noisy = pesq_metric(noisy_waveform[..., :common_len], clean_waveform[..., :common_len])
pesq_enhanced = pesq_metric(enhanced_waveform[..., :common_len], clean_waveform[..., :common_len])

print(f"PESQ Score for Noisy Audio: {pesq_noisy.item():.4f}")
print(f"PESQ Score for Enhanced Audio: {pesq_enhanced.item():.4f}")

#%%
# Visualize the waveforms
# Plot the clean, noisy, and enhanced signals stacked vertically so the effect
# of the added noise and of the denoising step is easy to compare.
fig, axes = plt.subplots(3, 1, figsize=(12, 9))

panels = [
    (clean_waveform, "Clean Audio Waveform (Sine Wave)", None),
    (noisy_waveform, f"Noisy Audio Waveform (PESQ: {pesq_noisy.item():.4f})", "orange"),
    (enhanced_waveform, f"Enhanced Audio Waveform (PESQ: {pesq_enhanced.item():.4f})", "green"),
]
for ax, (signal, title, color) in zip(axes, panels):
    ax.plot(signal.numpy(), color=color)
    ax.set_title(title)
    ax.set_xlabel("Time")
    ax.set_ylabel("Amplitude")

# Avoid overlapping labels between the stacked subplots.
fig.tight_layout()
plt.show()
12 changes: 7 additions & 5 deletions examples/audio/signal_to_noise_ratio.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,10 @@
import torch
from torchmetrics.audio import SignalNoiseRatio

# Set seed for reproducibility
torch.manual_seed(42)
np.random.seed(42)


# %%
# Generate a clean signal (simulating a high-quality recording)


def generate_clean_signal(length: int = 1000) -> Tuple[np.ndarray, np.ndarray]:
"""Generate a clean signal (sine wave)"""
t = np.linspace(0, 1, length)
Expand All @@ -32,6 +29,8 @@ def generate_clean_signal(length: int = 1000) -> Tuple[np.ndarray, np.ndarray]:

# %%
# Add Gaussian noise to the signal to simulate the noisy environment


def add_noise(signal: np.ndarray, noise_level: float = 0.5) -> np.ndarray:
"""Add Gaussian noise to the signal."""
noise = noise_level * np.random.randn(signal.shape[0])
Expand All @@ -40,6 +39,8 @@ def add_noise(signal: np.ndarray, noise_level: float = 0.5) -> np.ndarray:

# %%
# Apply FFT to filter out the noise


def fft_denoise(noisy_signal: np.ndarray, threshold: float) -> np.ndarray:
"""Denoise the signal using FFT."""
freq_domain = np.fft.fft(noisy_signal) # Filter frequencies using FFT
Expand All @@ -50,6 +51,7 @@ def fft_denoise(noisy_signal: np.ndarray, threshold: float) -> np.ndarray:

# %%
# Generate and plot clean, noisy, and denoised signals to visualize the reconstruction

length = 1000
t, clean_signal = generate_clean_signal(length)
noisy_signal = add_noise(clean_signal, noise_level=0.5)
Expand Down
5 changes: 5 additions & 0 deletions examples/image/clip_score.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

# %%
# Get sample images

images = {
"astronaut": astronaut(),
"cat": cat(),
Expand All @@ -27,6 +28,7 @@

# %%
# Define hypothetical captions for the images

captions = [
"A photo of an astronaut.",
"A photo of a cat.",
Expand All @@ -35,6 +37,7 @@

# %%
# Define the models for CLIPScore

models = [
"openai/clip-vit-base-patch16",
# "openai/clip-vit-base-patch32",
Expand All @@ -44,6 +47,7 @@

# %%
# Collect scores for each image-caption pair

score_results = []
for model in models:
clip_score = CLIPScore(model_name_or_path=model)
Expand All @@ -54,6 +58,7 @@

# %%
# Create an animation to display the scores

fig, (ax_img, ax_table) = plt.subplots(1, 2, figsize=(10, 5))


Expand Down

0 comments on commit b42f3bb

Please sign in to comment.