diff --git a/cog.yaml b/cog.yaml index 170baa9..292f8c2 100644 --- a/cog.yaml +++ b/cog.yaml @@ -1,19 +1,17 @@ build: gpu: true - python_version: "3.8" + python_version: 3.8 system_packages: - - "libgl1-mesa-glx" - - "libglib2.0-0" - - "libsndfile1-dev" - - "ffmpeg" + - libsndfile1-dev + - ffmpeg + - libfluidsynth3 + python_packages: - - "ipython==7.30.1" - - "numpy==1.21.4" + - numpy==1.21.4 run: - pip install -U pip - pip install --upgrade cython - - pip install omnizart - - apt-get update && apt-get install -y fluidsynth - pip install pyfluidsynth + - pip install git+https://github.com/e7mac/omnizart.git -predict: "scripts/predict.py:Predictor" +predict: scripts/predict.py:Predictor diff --git a/pyproject.toml b/pyproject.toml index c65b1ab..5673b3f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,7 +37,7 @@ vamp = "^1.1.0" urllib3 = "1.26.4" spleeter = "^2.3.0" mir_eval = "^0.6" -tensorflow = "2.5.0" +tensorflow = "2.13.0" [tool.poetry.dev-dependencies] pytest = "^5.2" diff --git a/scripts/predict.py b/scripts/predict.py index 01ebdaf..a4555ce 100644 --- a/scripts/predict.py +++ b/scripts/predict.py @@ -9,7 +9,8 @@ import shutil from pathlib import Path -import cog +from cog import BaseModel, BasePredictor, Path, Input +from typing import Optional import scipy.io.wavfile as wave from omnizart.remote import download_large_file_from_google_drive @@ -20,8 +21,13 @@ from omnizart.vocal import app as vapp from omnizart.vocal_contour import app as vcapp +class Output(BaseModel): + midi: Path + wav: Optional[Path] + csv: Optional[Path] + +class Predictor(BasePredictor): -class Predictor(cog.Predictor): def setup(self): self.SF2_FILE = "general_soundfont.sf2" if not os.path.exists(self.SF2_FILE): @@ -34,19 +40,12 @@ def setup(self): self.app = {"music": mapp, "chord": capp, "drum": dapp, "vocal": vapp, "vocal-contour": vcapp, "beat": bapp} self.model_path = {"piano": "Piano", "piano-v2": "PianoV2", "assemble": "Stream", "pop-song": "Pop", "": None} - @cog.input( - "audio", - type=Path, - help="Path to the input music. Supports mp3 and wav format.", - ) - @cog.input( - "mode", - type=str, - default="music-piano-v2", - options=["music-piano", "music-piano-v2", "music-assemble", "chord", "drum", "vocal", "vocal-contour", "beat"], - help="Transcription mode", - ) - def predict(self, audio, mode): + def predict(self, + audio: Path = Input(description="Path to the input music. Supports mp3 and wav format."), + mode: str = Input(default="music-piano-v2", description="Transcription mode", choices=["music-piano", "music-piano-v2", "music-assemble", "chord", "drum", "vocal", "vocal-contour", "beat"]), + render_audio: bool = Input(default=False, description="Option to render to mp3"), + ) -> Output: + """Run a single prediction on the model""" assert str(audio).endswith(".mp3") or str(audio).endswith(".wav"), "Please upload mp3 or wav file." temp_folder = "cog_temp" os.makedirs(temp_folder, exist_ok=True) @@ -65,22 +64,45 @@ def predict(self, audio, mode): app = self.app[mode] model_path = self.model_path[model] - midi = app.transcribe(wav_file_path, model_path=model_path) + midi_path = f"{temp_folder}/{audio_name}.mid" + midi = app.transcribe(wav_file_path, model_path=model_path, output=midi_path) + + mid_out_path = None + audio_out_path = None + csv_out_path = None - if mode == "vocal-contour": - out_name = f"{audio_name}_trans.wav" - else: - print("Synthesizing MIDI...") - out_name = f"{temp_folder}/{audio_name}_synth.wav" - raw_wav = midi.fluidsynth(fs=44100, sf2_path=self.SF2_FILE) - wave.write(out_name, 44100, raw_wav) + if render_audio == True: + if mode == "vocal-contour": + out_name = f"{audio_name}_trans.wav" + else: + print("Synthesizing MIDI...") + out_name = f"{temp_folder}/{audio_name}_synth.wav" + raw_wav = midi.fluidsynth(fs=44100, sf2_path=self.SF2_FILE) + wave.write(out_name, 44100, raw_wav) - out_path = Path(tempfile.mkdtemp()) / "out.mp3" # out_path is automatically cleaned up by cog - subprocess.run(["ffmpeg", "-y", "-i", out_name, str(out_path)]) + audio_out_path = Path(tempfile.mkdtemp()) / "out.mp3" # out_path is automatically cleaned up by cog + subprocess.run(["ffmpeg", "-y", "-i", out_name, str(audio_out_path)]) + + mid_out_path = Path(tempfile.mkdtemp()) / "out.mid" # out_path is automatically cleaned up by cog + shutil.copyfile(midi_path, mid_out_path) + if mode == "chord" : + csv_in_path = str(midi_path).replace(".mid", ".csv") + csv_out_path = str(mid_out_path).replace(".mid", ".csv") + shutil.copyfile(csv_in_path, csv_out_path) + csv_out_path = Path(csv_out_path) finally: shutil.rmtree(temp_folder) if os.path.exists(f"{audio_name}.mid"): os.remove(f"{audio_name}.mid") if os.path.exists(f"{audio_name}_trans.wav"): os.remove(f"{audio_name}_trans.wav") - return out_path + if os.path.exists(f"{audio_name}.csv"): + os.remove(f"{audio_name}.csv") + if mode == "chord": + if render_audio == True: + return Output(midi=mid_out_path, wav=audio_out_path, csv=csv_out_path) + else: + return Output(midi=mid_out_path, csv=csv_out_path) + if render_audio == True: + return Output(midi=mid_out_path, wav=audio_out_path) + return Output(midi=mid_out_path)