-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
8ce5089
commit 8108960
Showing
29 changed files
with
139,414 additions
and
0 deletions.
There are no files selected for viewing
Large diffs are not rendered by default.
Oops, something went wrong.
Large diffs are not rendered by default.
Oops, something went wrong.
15,451 changes: 15,451 additions & 0 deletions
15,451
data/lmd/note_seq/audio_train/mandando pro active loop.ipynb
Large diffs are not rendered by default.
Oops, something went wrong.
Large diffs are not rendered by default.
Oops, something went wrong.
Large diffs are not rendered by default.
Oops, something went wrong.
Large diffs are not rendered by default.
Oops, something went wrong.
Large diffs are not rendered by default.
Oops, something went wrong.
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,259 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"# Lakh MIDI Dataset pre-processing\n", | ||
"\n", | ||
"This notebook converts the [Lakh MIDI Dataset](https://colinraffel.com/projects/lmd/) (LMD) to Magenta `NoteSequence` protocol buffers. `INPUT_DIR` is expected to point to the [`lmd_full`](http://hog.ee.columbia.edu/craffel/lmd/lmd_full.tar.gz) directory.\n", | ||
"\n", | ||
"Copyright 2020 InterDigital R&D and Télécom Paris. \n", | ||
"Author: Ondřej Cífka" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 5, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"import concurrent.futures as cf\n", | ||
"import os\n", | ||
"import pickle\n", | ||
"import sys\n", | ||
"import datetime\n", | ||
"import warnings\n", | ||
"\n", | ||
"import note_seq\n", | ||
"import pretty_midi\n", | ||
"from tqdm.auto import tqdm" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 6, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"INPUT_DIR = '../../../../lmd_full/'\n", | ||
"OUTPUT_DIR = 'data'\n", | ||
"TOTAL_FILES = 178561" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 7, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Decrease MAX_TICK value to avoid running out of RAM. Long files will be skipped\n", | ||
"pretty_midi.pretty_midi.MAX_TICK = 1e6" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 8, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"def get_paths():\n", | ||
" for dirpath, _, filenames in os.walk(INPUT_DIR):\n", | ||
" for filename in filenames:\n", | ||
" yield os.path.join(dirpath, filename)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 9, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"def process_file(path):\n", | ||
" if os.stat(path).st_size > 100000:\n", | ||
" return None, 0\n", | ||
"\n", | ||
" try:\n", | ||
" with warnings.catch_warnings():\n", | ||
" warnings.filterwarnings('ignore', r'Tempo, Key or Time signature change events found on non-zero tracks')\n", | ||
" ns = note_seq.midi_io.midi_file_to_note_sequence(path)\n", | ||
" except note_seq.midi_io.MIDIConversionError:\n", | ||
" return None, 0\n", | ||
" out_path = os.path.splitext(path)[0] + f'.pickle'\n", | ||
" out_path = OUTPUT_DIR + '/' + os.path.relpath(out_path, INPUT_DIR)\n", | ||
" os.makedirs(os.path.dirname(out_path), exist_ok=True)\n", | ||
" with open(out_path, 'wb') as f:\n", | ||
" pickle.dump(ns, f)\n", | ||
" return out_path, ns.total_time" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 10, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"'../../../../lmd_full/0\\\\00000ec8a66b6bd2ef809b0443eeae41.mid'" | ||
] | ||
}, | ||
"execution_count": 10, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"# Test get paths\n", | ||
"i = next(iter(get_paths()))\n", | ||
"i" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 11, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"True" | ||
] | ||
}, | ||
"execution_count": 11, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"os.path.isfile(i)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 12, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"('data/0\\\\00000ec8a66b6bd2ef809b0443eeae41.pickle', 12.97296)" | ||
] | ||
}, | ||
"execution_count": 12, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"process_file(i)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 15, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"(178561, '../../../../lmd_full/f\\\\ffffa20c63fb782ebb6b68180e430c8b.mid')" | ||
] | ||
}, | ||
"execution_count": 15, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"j = 0\n", | ||
"for i in get_paths():\n", | ||
" j+= 1\n", | ||
"j, i" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 16, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"169556 / 178561 files converted successfully\n", | ||
"Total time: 362 days, 3:02:38.526111\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"# Running assyncronus\n", | ||
"os.makedirs(OUTPUT_DIR)\n", | ||
"i = 0\n", | ||
"out_paths = []\n", | ||
"nss = []\n", | ||
"for mid_file in get_paths():\n", | ||
" out_path , ns = process_file(mid_file)\n", | ||
" out_paths.append(out_path)\n", | ||
" nss.append(ns)\n", | ||
" i+= 1\n", | ||
" \n", | ||
"print(sum(1 for p in out_paths if p is not None), '/', i, 'files converted successfully')\n", | ||
"print('Total time:', datetime.timedelta(seconds=sum(t for t in nss)))" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": { | ||
"scrolled": true | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"os.makedirs(OUTPUT_DIR)\n", | ||
"with cf.ProcessPoolExecutor(20) as pool:\n", | ||
" results = list(tqdm(\n", | ||
" pool.map(process_file, tqdm(get_paths(), desc='collect', total=TOTAL_FILES), chunksize=100),\n", | ||
" desc='convert', total=TOTAL_FILES))\n", | ||
"\n", | ||
"print(sum(1 for p, _ in results if p is not None), '/', len(results), 'files converted successfully')\n", | ||
"print('Total time:', datetime.timedelta(seconds=sum(t for _, t in results)))" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "project", | ||
"language": "python", | ||
"name": "project" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.10.4" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 4 | ||
} |
Oops, something went wrong.