feat: add audio embedding operator using CLAP model
Chaithanya512 committed Aug 22, 2024
1 parent 91f604b commit 233cfd4
Showing 4 changed files with 1,513 additions and 0 deletions.
53 changes: 53 additions & 0 deletions src/core/operators/audio_vec_embedding_clap.py
@@ -0,0 +1,53 @@
"""Operator to get audio representation using LAION-CLAP - https://github.com/LAION-AI/CLAP. """

def initialize(param):
"""
Initializes the operator.
Args:
param (dict): A dict to initialize and load the the model.
"""
global model
global librosa
global np
global contextmanager
global os

import numpy as np
import librosa
from contextlib import contextmanager
import os
import laion_clap

model = laion_clap.CLAP_Module()
model.load_ckpt() # load the best checkpoint (HTSAT model) in the paper.
print("model successfully downloaded")


def run(audio_file):
    """
    Runs the operator and computes inference on the audio file.

    Args:
        audio_file (dict): `AudioFactory` file object.

    Returns:
        audio_emb (numpy.ndarray): A 512-length vector embedding representing the audio.
    """
    audio = audio_file["path"]

    @contextmanager
    def audio_load(fname):
        # Load the audio at 48 kHz (the sample rate CLAP expects),
        # then delete the file once the embedding has been computed.
        a, _ = librosa.load(fname, sr=48000)
        try:
            yield a
        finally:
            os.remove(fname)

    with audio_load(audio) as audio_var:
        query_audio = audio_var.reshape(1, -1)
        audio_emb = model.get_audio_embedding_from_data(x=query_audio, use_tensor=False)
        audio_emb = audio_emb.reshape(-1)
        return audio_emb
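
A minimal usage sketch for the new operator (illustrative, not part of this commit). The import path is inferred from the file location shown in the diff, and `sample.wav` is a hypothetical local file; since `run` deletes the file it is given, the sketch works on a copy.

import shutil

from core.operators import audio_vec_embedding_clap  # assumed import path, based on src/core/operators/

# Load the CLAP checkpoint once; `param` is unused by this operator.
audio_vec_embedding_clap.initialize(param={})

# `run` removes the file after embedding, so copy the hypothetical sample first.
shutil.copy("sample.wav", "sample_copy.wav")
embedding = audio_vec_embedding_clap.run({"path": "sample_copy.wav"})
print(embedding.shape)  # expected: (512,)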
3 changes: 3 additions & 0 deletions src/core/operators/audio_vec_embedding_clap_requirements.in
@@ -0,0 +1,3 @@
laion-clap==1.1.6
librosa==0.10.2.post1
torchvision==0.19.0