Skip to content

Commit

Permalink
updating late release
Browse files Browse the repository at this point in the history
  • Loading branch information
lucashueda committed Jul 5, 2022
1 parent 8ce5089 commit 8108960
Show file tree
Hide file tree
Showing 29 changed files with 139,414 additions and 0 deletions.
1 change: 1 addition & 0 deletions data/lmd/note_seq/audio_test/metadata_ref.json

Large diffs are not rendered by default.

712 changes: 712 additions & 0 deletions data/lmd/note_seq/audio_test/prepare.ipynb

Large diffs are not rendered by default.

15,451 changes: 15,451 additions & 0 deletions data/lmd/note_seq/audio_train/mandando pro active loop.ipynb

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions data/lmd/note_seq/audio_train/metadata.json

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions data/lmd/note_seq/audio_train/metadata_ref.json

Large diffs are not rendered by default.

118,948 changes: 118,948 additions & 0 deletions data/lmd/note_seq/audio_train/pairs_train

Large diffs are not rendered by default.

572 changes: 572 additions & 0 deletions data/lmd/note_seq/audio_train/pairs_val

Large diffs are not rendered by default.

540 changes: 540 additions & 0 deletions data/lmd/note_seq/audio_train/prepare.ipynb

Large diffs are not rendered by default.

259 changes: 259 additions & 0 deletions data/lmd/note_seq/prepare.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,259 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Lakh MIDI Dataset pre-processing\n",
"\n",
"This notebook converts the [Lakh MIDI Dataset](https://colinraffel.com/projects/lmd/) (LMD) to Magenta `NoteSequence` protocol buffers. `INPUT_DIR` is expected to point to the [`lmd_full`](http://hog.ee.columbia.edu/craffel/lmd/lmd_full.tar.gz) directory.\n",
"\n",
"Copyright 2020 InterDigital R&D and Télécom Paris. \n",
"Author: Ondřej Cífka"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"import concurrent.futures as cf\n",
"import os\n",
"import pickle\n",
"import sys\n",
"import datetime\n",
"import warnings\n",
"\n",
"import note_seq\n",
"import pretty_midi\n",
"from tqdm.auto import tqdm"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"INPUT_DIR = '../../../../lmd_full/'\n",
"OUTPUT_DIR = 'data'\n",
"TOTAL_FILES = 178561"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"# Decrease MAX_TICK value to avoid running out of RAM. Long files will be skipped\n",
"pretty_midi.pretty_midi.MAX_TICK = 1e6"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"def get_paths():\n",
" for dirpath, _, filenames in os.walk(INPUT_DIR):\n",
" for filename in filenames:\n",
" yield os.path.join(dirpath, filename)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"def process_file(path):\n",
" if os.stat(path).st_size > 100000:\n",
" return None, 0\n",
"\n",
" try:\n",
" with warnings.catch_warnings():\n",
" warnings.filterwarnings('ignore', r'Tempo, Key or Time signature change events found on non-zero tracks')\n",
" ns = note_seq.midi_io.midi_file_to_note_sequence(path)\n",
" except note_seq.midi_io.MIDIConversionError:\n",
" return None, 0\n",
" out_path = os.path.splitext(path)[0] + f'.pickle'\n",
" out_path = OUTPUT_DIR + '/' + os.path.relpath(out_path, INPUT_DIR)\n",
" os.makedirs(os.path.dirname(out_path), exist_ok=True)\n",
" with open(out_path, 'wb') as f:\n",
" pickle.dump(ns, f)\n",
" return out_path, ns.total_time"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'../../../../lmd_full/0\\\\00000ec8a66b6bd2ef809b0443eeae41.mid'"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Test get paths\n",
"i = next(iter(get_paths()))\n",
"i"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"os.path.isfile(i)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"('data/0\\\\00000ec8a66b6bd2ef809b0443eeae41.pickle', 12.97296)"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"process_file(i)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(178561, '../../../../lmd_full/f\\\\ffffa20c63fb782ebb6b68180e430c8b.mid')"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"j = 0\n",
"for i in get_paths():\n",
" j+= 1\n",
"j, i"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"169556 / 178561 files converted successfully\n",
"Total time: 362 days, 3:02:38.526111\n"
]
}
],
"source": [
"# Running assyncronus\n",
"os.makedirs(OUTPUT_DIR)\n",
"i = 0\n",
"out_paths = []\n",
"nss = []\n",
"for mid_file in get_paths():\n",
" out_path , ns = process_file(mid_file)\n",
" out_paths.append(out_path)\n",
" nss.append(ns)\n",
" i+= 1\n",
" \n",
"print(sum(1 for p in out_paths if p is not None), '/', i, 'files converted successfully')\n",
"print('Total time:', datetime.timedelta(seconds=sum(t for t in nss)))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"os.makedirs(OUTPUT_DIR)\n",
"with cf.ProcessPoolExecutor(20) as pool:\n",
" results = list(tqdm(\n",
" pool.map(process_file, tqdm(get_paths(), desc='collect', total=TOTAL_FILES), chunksize=100),\n",
" desc='convert', total=TOTAL_FILES))\n",
"\n",
"print(sum(1 for p, _ in results if p is not None), '/', len(results), 'files converted successfully')\n",
"print('Total time:', datetime.timedelta(seconds=sum(t for _, t in results)))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "project",
"language": "python",
"name": "project"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.4"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Loading

0 comments on commit 8108960

Please sign in to comment.