diff --git a/runner/app/pipelines/audio_to_text.py b/runner/app/pipelines/audio_to_text.py index 0ffd546c..ca50b837 100644 --- a/runner/app/pipelines/audio_to_text.py +++ b/runner/app/pipelines/audio_to_text.py @@ -120,13 +120,11 @@ def __init__(self, model_id: str): def __call__(self, audio: UploadFile, duration: float, **kwargs) -> List[File]: audioBytes = audio.file.read() - - # Convert M4A/MP4 files for pipeline compatibility. - if ( - os.path.splitext(audio.filename)[1].lower().lstrip(".") - in INCOMPATIBLE_EXTENSIONS - ): - audioBytes = self._audio_converter.convert(audioBytes, "mp3") + #re-encode audio to match pre-processing done in transformers. + # pipeline accepts np.ndarray and does not convert it again. String file path and bytes are converted to np.ndarray in the pipeline. + #https://github.com/huggingface/transformers/blob/47c29ccfaf56947d845971a439cbe75a764b63d7/src/transformers/pipelines/automatic_speech_recognition.py#L353 + #https://github.com/huggingface/transformers/blob/47c29ccfaf56947d845971a439cbe75a764b63d7/src/transformers/pipelines/audio_utils.py#L10 + audio_array = self._audio_converter.to_ndarray(audioBytes) # Adjust batch size and chunk length based on timestamps and duration. # NOTE: Done to prevent CUDA OOM errors for large audio files. 
@staticmethod
def to_ndarray(input_bytes: bytes) -> np.ndarray:
    """Decode an audio file's bytes into a mono, 16 kHz, float32 NumPy array.

    Mirrors the pre-processing transformers applies to string paths / raw
    bytes (``ffmpeg_read``), so the ASR pipeline accepts the array as-is and
    does not convert it again. Inspired by faster-whisper's ``decode_audio``:
    https://github.com/SYSTRAN/faster-whisper/blob/d889345e071de21a83bdae60ba4b07110cfd0696/faster_whisper/audio.py

    Args:
        input_bytes: The audio file as bytes to convert.

    Returns:
        The decoded audio as a 1-D float32 array scaled to [-1.0, 1.0).
        Input with no decodable audio frames yields an empty array.

    Raises:
        AudioConversionError: If the bytes cannot be opened or decoded.
    """
    resampler = av.audio.resampler.AudioResampler(
        format="s16",  # signed 16-bit samples -> dtype is always int16 below
        layout="mono",
        rate=16000,
    )

    pcm_buffer = BytesIO()
    input_container = None  # guard: av.open may raise before assignment
    try:
        input_container = av.open(BytesIO(input_bytes), mode="r")
        for stream in input_container.streams.audio:
            for frame in input_container.decode(stream):
                # resample() returns a list of frames (PyAV >= 9); keep them
                # all instead of only index 0.
                for resampled in resampler.resample(frame):
                    pcm_buffer.write(resampled.to_ndarray())
        # Drain samples still buffered inside the resampler so the tail of
        # the audio is not truncated.
        for resampled in resampler.resample(None):
            pcm_buffer.write(resampled.to_ndarray())
    except Exception as e:
        # Chain the cause so the original decode error stays visible.
        raise AudioConversionError(
            f"Error during audio conversion to numpy array: {e}"
        ) from e
    finally:
        if input_container is not None:
            input_container.close()

    # The "s16" resampler format guarantees int16 samples; normalize to
    # float32 exactly like transformers' audio_utils.ffmpeg_read.
    audio_array = np.frombuffer(pcm_buffer.getbuffer(), dtype=np.int16)
    return audio_array.astype(np.float32) / 32768.0