Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Replace torchaudio with pydub #381

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
feat: Replace torchaudio with pydub
refactor: Removed unnecessary dependencies

Removed Requirements
- python-dateutil
- tiktoken
- torchaudio
- scipy
- tokenizers
- huggingface-hub
- sentence-transformers
- optimum[onnxruntime]

Major Changes in This Commit
- torchaudio to pydub
	- bolna/helpers/utils.py
		- save_audio_file_to_s3
		- resample
		- pcm_to_wav_bytes
		- wav_bytes_to_pcm
	- bolna/synthesizer/basesynthesizer
		- resample
- sklearn to np
	- bolna/memory/cache/vector_cache
		- __get_top_cosine_similarity_doc
h3110Fr13nd committed Aug 24, 2024

Verified

This commit was created on GitHub.com and signed with GitHub’s verified signature.
commit 1a7bc469d376951c198695e292e45b23b36d4e2e
3 changes: 1 addition & 2 deletions bolna/helpers/analytics_helpers.py
Original file line number Diff line number Diff line change
@@ -2,7 +2,6 @@
import os
from datetime import datetime, timezone
from dotenv import load_dotenv
from dateutil import parser
import copy
from .utils import format_messages
from .logger_config import configure_logger
@@ -80,7 +79,7 @@ def update_execution_details(current_high_level_assistant_analytics_data, run_de

def update_historical_values(arr, current_run_val, last_updated_at, should_increment, multiplier = 0, interval_minutes=1440):
now = datetime.now(timezone.utc)
last_updated_datetime = parser.isoparse(last_updated_at)
last_updated_datetime = datetime.fromisoformat(last_updated_at)
difference_in_minutes = (now - last_updated_datetime).total_seconds() / 60

if not arr or len(arr) == 0:
113 changes: 50 additions & 63 deletions bolna/helpers/utils.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import datetime
import json
import asyncio
import math
import re
import copy
import hashlib
@@ -11,9 +10,6 @@
import wave
import numpy as np
import aiofiles
import torch
import torchaudio
from scipy.io import wavfile
from botocore.exceptions import BotoCoreError, ClientError
from aiobotocore.session import AioSession
from contextlib import AsyncExitStack
@@ -90,12 +86,9 @@ def float32_to_int16(float_audio):

def wav_bytes_to_pcm(wav_bytes):
    """Extract the raw PCM payload from an in-memory WAV file.

    Args:
        wav_bytes: Complete WAV file contents (header + frames).

    Returns:
        bytes: Raw 16-bit PCM frame data.
    """
    wav_buffer = io.BytesIO(wav_bytes)
    audio_segment = AudioSegment.from_file(wav_buffer, format="wav")
    if audio_segment.sample_width != 2:
        # The scipy implementation this replaced converted float32 WAV data
        # down to int16 before returning; keep that contract for callers that
        # expect 16-bit PCM.  NOTE(review): confirm pydub decodes float32
        # WAVs in this environment (it may require an ffmpeg backend).
        audio_segment = audio_segment.set_sample_width(2)
    return audio_segment.raw_data


# def wav_bytes_to_pcm(wav_bytes):
@@ -337,15 +330,18 @@ def yield_chunks_from_memory(audio_bytes, chunk_size=512):
yield audio_bytes[i:i + chunk_size]


def pcm_to_wav_bytes(pcm_data, sample_rate=16000, num_channels=1, sample_width=2):
    """Wrap raw PCM bytes in a WAV container.

    Args:
        pcm_data: Raw little-endian PCM samples.
        sample_rate: Frames per second of the source audio.
        num_channels: Number of interleaved channels.
        sample_width: Bytes per sample (2 = 16-bit).

    Returns:
        bytes: A complete WAV file (header + frames).
    """
    # Pad to a whole frame.  The previous code only padded odd byte counts,
    # which is equivalent for the default mono/16-bit case but produces
    # misaligned frames for other widths or channel counts.
    frame_size = sample_width * num_channels
    remainder = len(pcm_data) % frame_size
    if remainder:
        pcm_data += b"\x00" * (frame_size - remainder)

    # The stdlib wave module writes the RIFF header directly; no audio
    # backend (pydub/ffmpeg) is needed just to prepend a header.
    buffer = io.BytesIO()
    with wave.open(buffer, "wb") as wav_file:
        wav_file.setnchannels(num_channels)
        wav_file.setsampwidth(sample_width)
        wav_file.setframerate(sample_rate)
        wav_file.writeframes(pcm_data)
    return buffer.getvalue()


@@ -359,16 +355,16 @@ def convert_audio_to_wav(audio_bytes, source_format = 'flac'):
return buffer.getvalue()


def resample(audio_bytes, target_sample_rate, format="mp3"):
    """Resample encoded audio to ``target_sample_rate``.

    Args:
        audio_bytes: Encoded audio bytes in the container given by ``format``.
        target_sample_rate: Desired frame rate in Hz.
        format: Container/codec of ``audio_bytes`` (default "mp3").

    Returns:
        bytes: WAV-encoded audio at the target rate, or the ORIGINAL bytes
        (still in the source format) when the rates already match.
        NOTE(review): the two paths return different container formats —
        confirm callers tolerate a format that depends on the input rate.
    """
    audio_buffer = io.BytesIO(audio_bytes)
    audio_segment = AudioSegment.from_file(audio_buffer, format=format)
    orig_sample_rate = audio_segment.frame_rate
    if orig_sample_rate == target_sample_rate:
        # Fast path: nothing to do, hand back the caller's bytes untouched.
        return audio_bytes
    logger.info(f"Resampling from {orig_sample_rate} to {target_sample_rate}")
    resampled_audio = audio_segment.set_frame_rate(target_sample_rate)
    audio_buffer = io.BytesIO()
    resampled_audio.export(audio_buffer, format="wav")
    return audio_buffer.getvalue()


@@ -450,61 +446,52 @@ async def write_request_logs(message, run_id):
else:
await log_file.write(log_string)

async def save_audio_file_to_s3(conversation_recording, sampling_rate=24000, assistant_id=None, run_id=None):
    """Mix the user (input) and assistant (output) recordings and upload the
    combined WAV to S3.

    Args:
        conversation_recording: Dict with 'input' (webm user audio under
            ["data"]), 'output' (list of WAV frames, each with 'data',
            'start_time' and 'duration') and 'metadata' (with a 'started'
            timestamp).
        sampling_rate: Target frame rate for the stored file.
        assistant_id: Prefix for the S3 object key.
        run_id: Run identifier; the part after '#' is appended to the key.

    Returns:
        str: Public URL of the stored recording.
    """
    last_frame_end_time = conversation_recording['output'][0]['start_time']
    logger.info(f"LENGTH OF OUTPUT AUDIO {len(conversation_recording['output'])}")
    # Silence covering the gap between conversation start and the first
    # assistant frame (seconds -> milliseconds for pydub).
    initial_gap = (last_frame_end_time - conversation_recording["metadata"]["started"]) * 1000
    logger.info(f"Initial gap {initial_gap}")

    combined_audio = AudioSegment.silent(duration=initial_gap, frame_rate=sampling_rate)

    for i, frame in enumerate(conversation_recording['output']):
        frame_start_time = frame['start_time']
        logger.info(f"Processing frame {i}, last frame end time = {last_frame_end_time}, frame start time = {frame_start_time}")

        # Insert silence for any gap between consecutive assistant frames.
        if last_frame_end_time < frame_start_time:
            gap_duration_samples = frame_start_time - last_frame_end_time
            silence = AudioSegment.silent(duration=gap_duration_samples * 1000, frame_rate=sampling_rate)
            combined_audio += silence

        last_frame_end_time = frame_start_time + frame['duration']
        frame_as = AudioSegment.from_file(io.BytesIO(frame['data']), format="wav")
        combined_audio += frame_as

    # User side of the conversation, brought to the target frame rate.
    webm_segment = AudioSegment.from_file(io.BytesIO(conversation_recording['input']["data"]), format="webm")
    webm_segment = webm_segment.set_frame_rate(sampling_rate)

    # Round-trip the assistant mix through a WAV export so both sides are in
    # the same decoded representation before overlaying.
    audio_segment_bytes = io.BytesIO()
    combined_audio = combined_audio.set_frame_rate(sampling_rate)
    combined_audio.export(audio_segment_bytes, format="wav")
    audio_segment_bytes.seek(0)
    combined_audio_segment = AudioSegment.from_file(audio_segment_bytes, format="wav").set_channels(1)

    # Pad the shorter side with silence so the overlay covers both fully.
    if len(webm_segment) > len(combined_audio_segment):
        combined_audio_segment += AudioSegment.silent(duration=len(webm_segment) - len(combined_audio_segment))
    elif len(webm_segment) < len(combined_audio_segment):
        webm_segment += AudioSegment.silent(duration=len(combined_audio_segment) - len(webm_segment))
    webm_segment = webm_segment.set_channels(1)

    # NOTE(review): overlay() mixes both sides into ONE mono track, whereas
    # the torchaudio code this replaced wrote a 2-channel file with user and
    # assistant on separate channels — confirm the mono mix is intended.
    stereo_audio_segment = webm_segment.overlay(combined_audio_segment)
    audio_buffer = io.BytesIO()
    stereo_audio_segment.export(audio_buffer, format="wav")
    audio_buffer.seek(0)

    key = f'{assistant_id + run_id.split("#")[1]}.wav'
    logger.info(f"Storing in {RECORDING_BUCKET_URL}{key}")
    await store_file(bucket_name=RECORDING_BUCKET_NAME, file_key=key, file_data=audio_buffer, content_type="audio/wav")
    return f'{RECORDING_BUCKET_URL}{key}'

def list_number_of_wav_files_in_directory(directory):
1 change: 0 additions & 1 deletion bolna/helpers/vad.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import os
import subprocess
import requests
import torch
import numpy as np
11 changes: 3 additions & 8 deletions bolna/memory/cache/vector_cache.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,9 @@

from bolna.helpers.logger_config import configure_logger
from bolna.memory.cache.base_cache import BaseCache
from typing import List
import numpy as np
from fastembed import TextEmbedding
from sentence_transformers import util
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

logger = configure_logger(__name__)

@@ -23,11 +20,9 @@ def set(self, documents):
)

def __get_top_cosine_similarity_doc(self, query_embedding):
    """Return the cached document whose embedding is most cosine-similar to
    ``query_embedding``.

    Args:
        query_embedding: 1-D numpy vector for the query.

    Returns:
        The entry of ``self.documents`` with the highest cosine similarity.
    """
    # Normalise both sides so a plain dot product equals cosine similarity
    # (pure numpy; replaces the sklearn/sentence-transformers dependency).
    # NOTE(review): a zero-norm embedding would divide by zero here —
    # confirm embeddings from the model are never all-zero.
    query_norm = query_embedding / np.linalg.norm(query_embedding)
    embeddings_norm = self.embeddings / np.linalg.norm(self.embeddings, axis=1)[:, np.newaxis]
    similarities = np.dot(embeddings_norm, query_norm)
    most_similar_index = np.argmax(similarities)
    return self.documents[most_similar_index]

2 changes: 1 addition & 1 deletion bolna/synthesizer/azure_synthesizer.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import os
from dotenv import load_dotenv
from bolna.helpers.logger_config import configure_logger
from bolna.helpers.utils import convert_audio_to_wav, create_ws_data_packet, pcm_to_wav_bytes, resample, wav_bytes_to_pcm
from bolna.helpers.utils import create_ws_data_packet, wav_bytes_to_pcm
from bolna.memory.cache.inmemory_scalar_cache import InmemoryScalarCache
from .base_synthesizer import BaseSynthesizer
import azure.cognitiveservices.speech as speechsdk
11 changes: 5 additions & 6 deletions bolna/synthesizer/base_synthesizer.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import io
import torchaudio
from bolna.helpers.logger_config import configure_logger
import asyncio
from pydub import AudioSegment

logger = configure_logger(__name__)

@@ -29,12 +29,11 @@ def get_synthesized_characters(self):
return 0

def resample(self, audio_bytes):
    """Downsample synthesizer output to 8 kHz mono WAV.

    Args:
        audio_bytes: Encoded audio bytes (container auto-detected by pydub).

    Returns:
        bytes: WAV data at 8000 Hz, single channel.
    """
    audio_segment = AudioSegment.from_file(io.BytesIO(audio_bytes))
    audio_segment = audio_segment.set_frame_rate(8000)
    # NOTE(review): the torchaudio version this replaced preserved the
    # source channel count; forcing mono here is a behaviour change —
    # confirm downstream consumers expect single-channel audio.
    audio_segment = audio_segment.set_channels(1)
    audio_buffer = io.BytesIO()
    audio_segment.export(audio_buffer, format="wav")
    audio_buffer.seek(0)
    return audio_buffer.read()
2 changes: 1 addition & 1 deletion bolna/synthesizer/elevenlabs_synthesizer.py
Original file line number Diff line number Diff line change
@@ -11,7 +11,7 @@
from bolna.memory.cache.inmemory_scalar_cache import InmemoryScalarCache
from .base_synthesizer import BaseSynthesizer
from bolna.helpers.logger_config import configure_logger
from bolna.helpers.utils import convert_audio_to_wav, create_ws_data_packet, pcm_to_wav_bytes, resample
from bolna.helpers.utils import convert_audio_to_wav, create_ws_data_packet, resample


logger = configure_logger(__name__)
2 changes: 1 addition & 1 deletion bolna/synthesizer/melo_synthesizer.py
Original file line number Diff line number Diff line change
@@ -3,7 +3,7 @@
import os
from dotenv import load_dotenv
from bolna.helpers.logger_config import configure_logger
from bolna.helpers.utils import create_ws_data_packet, resample, wav_bytes_to_pcm
from bolna.helpers.utils import create_ws_data_packet, wav_bytes_to_pcm
from bolna.memory.cache.inmemory_scalar_cache import InmemoryScalarCache
from .base_synthesizer import BaseSynthesizer
import json
2 changes: 1 addition & 1 deletion bolna/synthesizer/openai_synthesizer.py
Original file line number Diff line number Diff line change
@@ -2,7 +2,7 @@
import os
from dotenv import load_dotenv
from bolna.helpers.logger_config import configure_logger
from bolna.helpers.utils import convert_audio_to_wav, create_ws_data_packet, pcm_to_wav_bytes, resample
from bolna.helpers.utils import convert_audio_to_wav, create_ws_data_packet, resample
from .base_synthesizer import BaseSynthesizer
from openai import AsyncOpenAI
import io
4 changes: 2 additions & 2 deletions bolna/synthesizer/polly_synthesizer.py
Original file line number Diff line number Diff line change
@@ -4,7 +4,7 @@
from aiobotocore.session import AioSession
from contextlib import AsyncExitStack
from bolna.helpers.logger_config import configure_logger
from bolna.helpers.utils import convert_audio_to_wav, create_ws_data_packet, pcm_to_wav_bytes, resample
from bolna.helpers.utils import convert_audio_to_wav, create_ws_data_packet
from bolna.memory.cache.inmemory_scalar_cache import InmemoryScalarCache
from .base_synthesizer import BaseSynthesizer

@@ -14,7 +14,7 @@

class PollySynthesizer(BaseSynthesizer):
def __init__(self, voice, language, audio_format="pcm", sampling_rate=8000, stream=False, engine="neural",
buffer_size=400, speaking_rate = "100%", volume = "0dB", caching= True, **kwargs):
buffer_size=400, speaking_rate="100%", volume="0dB", caching=True, **kwargs):
super().__init__(stream, buffer_size)
self.engine = engine
self.format = self.get_format(audio_format.lower())
4 changes: 0 additions & 4 deletions bolna/transcriber/bodhi_transcriber.py
Original file line number Diff line number Diff line change
@@ -2,8 +2,6 @@
from audioop import ulaw2lin
import traceback
import uuid
import numpy as np
import torch
import websockets
import os
import json
@@ -14,9 +12,7 @@
from .base_transcriber import BaseTranscriber
from bolna.helpers.logger_config import configure_logger
from bolna.helpers.utils import create_ws_data_packet
import ssl

torch.set_num_threads(1)

logger = configure_logger(__name__)
load_dotenv()
3 changes: 0 additions & 3 deletions bolna/transcriber/deepgram_transcriber.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import asyncio
import traceback
import numpy as np
import torch
import websockets
import os
import json
@@ -13,8 +12,6 @@
from bolna.helpers.logger_config import configure_logger
from bolna.helpers.utils import create_ws_data_packet

torch.set_num_threads(1)

logger = configure_logger(__name__)
load_dotenv()

9 changes: 1 addition & 8 deletions bolna/transcriber/whisper_transcriber.py
Original file line number Diff line number Diff line change
@@ -1,26 +1,19 @@
import asyncio
# from asyncio.base_tasks import tasks
import traceback
import numpy as np
import torch
import websockets
import os
import json
import time
from .base_transcriber import BaseTranscriber
from bolna.helpers.logger_config import configure_logger
from bolna.helpers.utils import create_ws_data_packet, int2float
from bolna.helpers.vad import VAD
from bolna.helpers.utils import create_ws_data_packet
from audioop import ulaw2lin, ratecv
import json
import os
import time
from queue import Queue
from websockets.exceptions import *

import uvloop
asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
torch.set_num_threads(1)

logger = configure_logger(__name__)

12 changes: 2 additions & 10 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
aiobotocore==2.9.0
aiofiles==23.2.1
aiohttp==3.9.1
aiohttp==3.9.5
azure-cognitiveservices-speech==1.38.0
daily-python==0.9.1
fastapi==0.108.0
@@ -10,20 +10,12 @@ numpy==1.26.1
openai>=1.10.0
pydantic==2.5.3
pydub==0.25.1
python-dateutil==2.8.2
python-dotenv==1.0.0
redis==5.0.1
requests==2.31.0
tiktoken>=0.6.0
torchaudio==2.0.1
twilio==8.9.0
uvicorn==0.22.0
websockets==10.4
onnxruntime>=1.16.3
scipy==1.11.4
uvloop==0.19.0
tokenizers==0.15.2
huggingface-hub==0.20.1
semantic-router==0.0.46
sentence-transformers==3.0.1
optimum[onnxruntime]
semantic-router==0.0.58