feat: add audio embedding operator using CLAP model
Chaithanya512 committed Aug 22, 2024
1 parent 91f604b commit 233cfd4
Showing 4 changed files with 1,513 additions and 0 deletions.
53 changes: 53 additions & 0 deletions src/core/operators/audio_vec_embedding_clap.py
@@ -0,0 +1,53 @@
"""Operator to get audio representation using LAION-CLAP - https://github.com/LAION-AI/CLAP. """

def initialize(param):
"""
Initializes the operator.
Args:
param (dict): A dict to initialize and load the the model.
"""
global model
global librosa
global np
global contextmanager
global os

import numpy as np
import librosa
from contextlib import contextmanager
import os
import laion_clap

model = laion_clap.CLAP_Module()
model.load_ckpt() # load the best checkpoint (HTSAT model) in the paper.
print("model successfully downloaded")


def run(audio_file):
    """
    Runs the operator and computes inference on the audio file.

    Args:
        audio_file (dict): `AudioFactory` file object.

    Returns:
        audio_emb (numpy.ndarray): A 512-length vector embedding representing the audio.
    """
    audio = audio_file["path"]

    @contextmanager
    def audio_load(fname):
        # Load the audio at 48 kHz (the sample rate CLAP expects),
        # then delete the file once the embedding has been computed.
        a, _ = librosa.load(fname, sr=48000)
        try:
            yield a
        finally:
            os.remove(fname)

    with audio_load(audio) as audio_var:
        query_audio = audio_var.reshape(1, -1)
        audio_emb = model.get_audio_embedding_from_data(x=query_audio, use_tensor=False)
        audio_emb = audio_emb.reshape(-1)
        return audio_emb
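
A minimal usage sketch for the new operator (illustrative, not part of this commit). The import path is inferred from the file location shown in the diff, and `sample.wav` is a hypothetical local file; since `run` deletes the file it is given, the sketch works on a copy.

import shutil

from core.operators import audio_vec_embedding_clap  # assumed import path, based on src/core/operators/

# Load the CLAP checkpoint once; `param` is unused by this operator.
audio_vec_embedding_clap.initialize(param={})

# `run` removes the file after embedding, so copy the hypothetical sample first.
shutil.copy("sample.wav", "sample_copy.wav")
embedding = audio_vec_embedding_clap.run({"path": "sample_copy.wav"})
print(embedding.shape)  # expected: (512,)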
3 changes: 3 additions & 0 deletions src/core/operators/audio_vec_embedding_clap_requirements.in
@@ -0,0 +1,3 @@
laion-clap==1.1.6
librosa==0.10.2.post1
torchvision==0.19.0