updating late release

lucashueda · Jul 5, 2022 · 8108960 · 8108960
1 parent 8ce5089
commit 8108960
Show file tree

Hide file tree

Showing 29 changed files with 139,414 additions and 0 deletions.
diff --git a/data/lmd/note_seq/audio_test/metadata_ref.json b/data/lmd/note_seq/audio_test/metadata_ref.json
diff --git a/data/lmd/note_seq/audio_test/prepare.ipynb b/data/lmd/note_seq/audio_test/prepare.ipynb
diff --git a/data/lmd/note_seq/audio_train/mandando pro active loop.ipynb b/data/lmd/note_seq/audio_train/mandando pro active loop.ipynb
diff --git a/data/lmd/note_seq/audio_train/metadata.json b/data/lmd/note_seq/audio_train/metadata.json
diff --git a/data/lmd/note_seq/audio_train/metadata_ref.json b/data/lmd/note_seq/audio_train/metadata_ref.json
diff --git a/data/lmd/note_seq/audio_train/pairs_train b/data/lmd/note_seq/audio_train/pairs_train
diff --git a/data/lmd/note_seq/audio_train/pairs_val b/data/lmd/note_seq/audio_train/pairs_val
diff --git a/data/lmd/note_seq/audio_train/prepare.ipynb b/data/lmd/note_seq/audio_train/prepare.ipynb
diff --git a/data/lmd/note_seq/prepare.ipynb b/data/lmd/note_seq/prepare.ipynb
@@ -0,0 +1,259 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Lakh MIDI Dataset pre-processing\n",
+    "\n",
+    "This notebook converts the [Lakh MIDI Dataset](https://colinraffel.com/projects/lmd/) (LMD) to Magenta `NoteSequence` protocol buffers. `INPUT_DIR` is expected to point to the [`lmd_full`](http://hog.ee.columbia.edu/craffel/lmd/lmd_full.tar.gz) directory.\n",
+    "\n",
+    "Copyright 2020 InterDigital R&D and Télécom Paris.  \n",
+    "Author: Ondřej Cífka"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import concurrent.futures as cf\n",
+    "import os\n",
+    "import pickle\n",
+    "import sys\n",
+    "import datetime\n",
+    "import warnings\n",
+    "\n",
+    "import note_seq\n",
+    "import pretty_midi\n",
+    "from tqdm.auto import tqdm"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "INPUT_DIR = '../../../../lmd_full/'\n",
+    "OUTPUT_DIR = 'data'\n",
+    "TOTAL_FILES = 178561"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Decrease MAX_TICK value to avoid running out of RAM. Long files will be skipped\n",
+    "pretty_midi.pretty_midi.MAX_TICK = 1e6"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_paths():\n",
+    "    for dirpath, _, filenames in os.walk(INPUT_DIR):\n",
+    "        for filename in filenames:\n",
+    "            yield os.path.join(dirpath, filename)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def process_file(path):\n",
+    "    if os.stat(path).st_size > 100000:\n",
+    "        return None, 0\n",
+    "\n",
+    "    try:\n",
+    "        with warnings.catch_warnings():\n",
+    "            warnings.filterwarnings('ignore', r'Tempo, Key or Time signature change events found on non-zero tracks')\n",
+    "            ns = note_seq.midi_io.midi_file_to_note_sequence(path)\n",
+    "    except note_seq.midi_io.MIDIConversionError:\n",
+    "        return None, 0\n",
+    "    out_path = os.path.splitext(path)[0] + f'.pickle'\n",
+    "    out_path = OUTPUT_DIR + '/' + os.path.relpath(out_path, INPUT_DIR)\n",
+    "    os.makedirs(os.path.dirname(out_path), exist_ok=True)\n",
+    "    with open(out_path, 'wb') as f:\n",
+    "        pickle.dump(ns, f)\n",
+    "    return out_path, ns.total_time"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'../../../../lmd_full/0\\\\00000ec8a66b6bd2ef809b0443eeae41.mid'"
+      ]
+     },
+     "execution_count": 10,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Test get paths\n",
+    "i = next(iter(get_paths()))\n",
+    "i"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "True"
+      ]
+     },
+     "execution_count": 11,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "os.path.isfile(i)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "('data/0\\\\00000ec8a66b6bd2ef809b0443eeae41.pickle', 12.97296)"
+      ]
+     },
+     "execution_count": 12,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "process_file(i)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(178561, '../../../../lmd_full/f\\\\ffffa20c63fb782ebb6b68180e430c8b.mid')"
+      ]
+     },
+     "execution_count": 15,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "j = 0\n",
+    "for i in get_paths():\n",
+    "    j+= 1\n",
+    "j, i"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "169556 / 178561 files converted successfully\n",
+      "Total time: 362 days, 3:02:38.526111\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Running assyncronus\n",
+    "os.makedirs(OUTPUT_DIR)\n",
+    "i = 0\n",
+    "out_paths = []\n",
+    "nss = []\n",
+    "for mid_file in get_paths():\n",
+    "    out_path , ns = process_file(mid_file)\n",
+    "    out_paths.append(out_path)\n",
+    "    nss.append(ns)\n",
+    "    i+= 1\n",
+    "    \n",
+    "print(sum(1 for p in out_paths if p is not None), '/', i, 'files converted successfully')\n",
+    "print('Total time:', datetime.timedelta(seconds=sum(t for t in nss)))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "os.makedirs(OUTPUT_DIR)\n",
+    "with cf.ProcessPoolExecutor(20) as pool:\n",
+    "    results = list(tqdm(\n",
+    "        pool.map(process_file, tqdm(get_paths(), desc='collect', total=TOTAL_FILES), chunksize=100),\n",
+    "        desc='convert', total=TOTAL_FILES))\n",
+    "\n",
+    "print(sum(1 for p, _ in results if p is not None), '/', len(results), 'files converted successfully')\n",
+    "print('Total time:', datetime.timedelta(seconds=sum(t for _, t in results)))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "project",
+   "language": "python",
+   "name": "project"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}