Skip to content

Commit

Permalink
Merge pull request #372 from Chaithanya512/audio_vec_emb_clap
Browse files Browse the repository at this point in the history
feat: audio embedding operator using CLAP model
  • Loading branch information
aatmanvaidya authored Aug 23, 2024
2 parents 91f604b + 233cfd4 commit faa9727
Show file tree
Hide file tree
Showing 4 changed files with 1,513 additions and 0 deletions.
53 changes: 53 additions & 0 deletions src/core/operators/audio_vec_embedding_clap.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
"""Operator to get audio representation using LAION-CLAP - https://github.com/LAION-AI/CLAP."""

def initialize(param):
    """
    Prepares the operator: imports its dependencies and loads the CLAP model.

    The imported modules and the loaded model are published as module-level
    globals so that `run` can use them afterwards.

    Args:
        param (dict): Operator parameters. This function does not read them.
    """
    global model, librosa, np, contextmanager, os

    # Dependencies are imported here rather than at module import time, so
    # they are only required once the operator is actually initialised.
    import os
    from contextlib import contextmanager

    import numpy as np
    import librosa
    import laion_clap

    model = laion_clap.CLAP_Module()
    # Called without arguments, load_ckpt() loads the library's default
    # pretrained checkpoint (the best HTSAT model reported in the paper).
    model.load_ckpt()
    print("model successfully downloaded")


def run(audio_file):
    """
    Runs the operator and computes the CLAP embedding of an audio file.

    The file at `audio_file["path"]` is deleted once processing ends,
    whether loading and inference succeed or raise. `initialize` must have
    been called first, since it provides `model`, `librosa` and `os`.

    Args:
        audio_file (dict): `AudioFactory` file object; its "path" entry is
            the location of the audio file to embed.
    Returns:
        audio_emb (numpy.ndarray): A 512-length vector embedding representing the audio.
    """
    audio = audio_file["path"]

    try:
        # Loading happens inside the try block so the file is also removed
        # when decoding fails, not only when the embedding step fails.
        waveform, _ = librosa.load(audio, sr=48000)
        # The model expects a batch of waveforms: shape (1, num_samples).
        query_audio = waveform.reshape(1, -1)
        audio_emb = model.get_audio_embedding_from_data(
            x=query_audio, use_tensor=False
        )
    finally:
        os.remove(audio)

    # Flatten the (1, N) batch output into a 1-D vector.
    return audio_emb.reshape(-1)
3 changes: 3 additions & 0 deletions src/core/operators/audio_vec_embedding_clap_requirements.in
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
laion-clap==1.1.6
librosa==0.10.2.post1
torchvision==0.19.0
Loading

0 comments on commit faa9727

Please sign in to comment.