diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..d0ae591
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2017 Paul van Gent
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..cd82150
--- /dev/null
+++ b/README.md
@@ -0,0 +1,13 @@
+# Bytehoven - Sheet music recognition with ResNet50
+
+![Bytehoven](images/Bytehoven.jpeg)
+
+This repository contains the resources used in the development of Bytehoven, a deep learning sheet music recognition model currently in development. [Please find the full tutorial here.](http://www.paulvangent.com/2017/12/07/deep-learning-music/) The current version recognises piano music from Bach, Beethoven, Brahms, Chopin, Grieg, Liszt, and Mozart.
+
+# Included files
+
+- datasets/Musicdata_Small.rar -- Dataset of small sized images (200*35px)
+- datasets/Musicdata_Medium.rar -- Dataset of medium sized images (400*70px)
+- model-weights/bytehoven-7-weights.hdf5 -- Model weights trained on medium sized set (full training log included)
+- ResNet50.py -- ResNet50 architecture implemented in Keras
+- run_ResNet50.py -- Example to initiate training run
\ No newline at end of file
diff --git a/ResNet50.py b/ResNet50.py
new file mode 100644
index 0000000..681abe7
--- /dev/null
+++ b/ResNet50.py
@@ -0,0 +1,146 @@
+'''
+Python file defining the ResNet50 architecture used in the project.
+
+2017 - Paul van Gent
+Adapted from https://github.com/fchollet/keras/blob/master/keras/applications/resnet50.py
+
+Licensed under the MIT License. Permission is hereby granted, free of charge,
+to any person obtaining a copy of this software and associated documentation files (the "Software"),
+to deal in the Software without restriction, including without limitation the rights to use, copy, modify,
+merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom
+the Software is furnished to do so, subject to the following conditions:
+
+- The above copyright notice and this permission notice shall be included
+  in all copies or substantial portions of the Software.
+'''
+
+import os
+import random
+
+#Disable GPU (out of memory errors)
+#os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
+
+import numpy as np
+from keras import layers
+from keras.layers import Input, Add, Dense, Activation, ZeroPadding2D, BatchNormalization, Flatten, Conv2D, AveragePooling2D, MaxPooling2D, GlobalMaxPooling2D
+from keras.models import Model
+import keras.backend as K
+
+def identity_block(input_tensor, kernel_size, filters, stage, block):
+    """The identity block is the block that has no conv layer at shortcut.
+    # Arguments
+        input_tensor: input tensor
+        kernel_size: default 3, the kernel size of middle conv layer at main path
+        filters: list of integers, the filters of 3 conv layer at main path
+        stage: integer, current stage label, used for generating layer names
+        block: 'a','b'..., current block label, used for generating layer names
+    # Returns
+        Output tensor for the block.
+    """
+    filters1, filters2, filters3 = filters
+    if K.image_data_format() == 'channels_last':
+        bn_axis = 3
+    else:
+        bn_axis = 1
+    conv_name_base = 'res' + str(stage) + block + '_branch'
+    bn_name_base = 'bn' + str(stage) + block + '_branch'
+
+    x = Conv2D(filters1, (1, 1), name=conv_name_base + '2a')(input_tensor)
+    x = BatchNormalization(axis=bn_axis, name=bn_name_base + '2a')(x)
+    x = Activation('relu')(x)
+
+    x = Conv2D(filters2, kernel_size,
+               padding='same', name=conv_name_base + '2b')(x)
+    x = BatchNormalization(axis=bn_axis, name=bn_name_base + '2b')(x)
+    x = Activation('relu')(x)
+
+    x = Conv2D(filters3, (1, 1), name=conv_name_base + '2c')(x)
+    x = BatchNormalization(axis=bn_axis, name=bn_name_base + '2c')(x)
+
+    x = layers.add([x, input_tensor])
+    x = Activation('relu')(x)
+    return x
+
+def conv_block(input_tensor, kernel_size, filters, stage, block, strides=(2, 2)):
+    """A block that has a conv layer at shortcut.
+    # Arguments
+        input_tensor: input tensor
+        kernel_size: default 3, the kernel size of middle conv layer at main path
+        filters: list of integers, the filters of 3 conv layer at main path
+        stage: integer, current stage label, used for generating layer names
+        block: 'a','b'..., current block label, used for generating layer names
+    # Returns
+        Output tensor for the block.
+    Note that from stage 3, the first conv layer at main path is with strides=(2,2)
+    And the shortcut should have strides=(2,2) as well
+    """
+    filters1, filters2, filters3 = filters
+    if K.image_data_format() == 'channels_last':
+        bn_axis = 3
+    else:
+        bn_axis = 1
+    conv_name_base = 'res' + str(stage) + block + '_branch'
+    bn_name_base = 'bn' + str(stage) + block + '_branch'
+
+    x = Conv2D(filters1, (1, 1), strides=strides,
+               name=conv_name_base + '2a')(input_tensor)
+    x = BatchNormalization(axis=bn_axis, name=bn_name_base + '2a')(x)
+    x = Activation('relu')(x)
+
+    x = Conv2D(filters2, kernel_size, padding='same',
+               name=conv_name_base + '2b')(x)
+    x = BatchNormalization(axis=bn_axis, name=bn_name_base + '2b')(x)
+    x = Activation('relu')(x)
+
+    x = Conv2D(filters3, (1, 1), name=conv_name_base + '2c')(x)
+    x = BatchNormalization(axis=bn_axis, name=bn_name_base + '2c')(x)
+
+    shortcut = Conv2D(filters3, (1, 1), strides=strides,
+                      name=conv_name_base + '1')(input_tensor)
+    shortcut = BatchNormalization(axis=bn_axis, name=bn_name_base + '1')(shortcut)
+
+    x = layers.add([x, shortcut])
+    x = Activation('relu')(x)
+    return x
+
+
+def ResNet50(input_shape=None, classes=1000):
+
+    x_input = Input(input_shape)
+    x = ZeroPadding2D((3, 3))(x_input)
+
+    x = Conv2D(
+        64, (7, 7), strides=(2, 2), padding='same', name='conv1')(x)
+    x = BatchNormalization(axis = 3, name = 'bn_conv1')(x) # fixed: was assigned to 'X', so the BN output was discarded
+    x = Activation('relu')(x)
+    x = MaxPooling2D((3, 3), strides=(2, 2))(x)
+
+    x = conv_block(x, 3, [64, 64, 256], stage=2, block='a', strides=(1, 1))
+    x = identity_block(x, 3, [64, 64, 256], stage=2, block='b')
+    x = identity_block(x, 3, [64, 64, 256], stage=2, block='c')
+
+    x = conv_block(x, 3, [128, 128, 512], stage=3, block='a')
+    x = identity_block(x, 3, [128, 128, 512], stage=3, block='b')
+    x = identity_block(x, 3, [128, 128, 512], stage=3, block='c')
+    x = identity_block(x, 3, [128, 128, 512], stage=3, block='d')
+
+    x = conv_block(x, 3, [256, 256, 1024], stage=4, block='a')
+    x = identity_block(x, 3, [256, 256, 1024], stage=4, block='b')
+    x = identity_block(x, 3, [256, 256, 1024], stage=4, block='c')
+    x = identity_block(x, 3, [256, 256, 1024], stage=4, block='d')
+    x = identity_block(x, 3, [256, 256, 1024], stage=4, block='e')
+    x = identity_block(x, 3, [256, 256, 1024], stage=4, block='f')
+
+    x = conv_block(x, 3, [512, 512, 2048], stage=5, block='a')
+    x = identity_block(x, 3, [512, 512, 2048], stage=5, block='b')
+    x = identity_block(x, 3, [512, 512, 2048], stage=5, block='c')
+
+    #x = AveragePooling2D((7, 7), name='avg_pool')(x)
+
+    x = Flatten()(x)
+    x = Dense(classes, activation='softmax', name='fc')(x)
+
+    # Create model.
+    model = Model(inputs = x_input, outputs = x, name='resnet50')
+
+    return model
diff --git a/datasets/Musicdata_Medium.rar b/datasets/Musicdata_Medium.rar
new file mode 100644
index 0000000..bca3175
Binary files /dev/null and b/datasets/Musicdata_Medium.rar differ
diff --git a/datasets/Musicdata_Small.rar b/datasets/Musicdata_Small.rar
new file mode 100644
index 0000000..d0fb468
Binary files /dev/null and b/datasets/Musicdata_Small.rar differ
diff --git a/images/Beethoven_Visualisation.jpg b/images/Beethoven_Visualisation.jpg
new file mode 100644
index 0000000..d216bd6
Binary files /dev/null and b/images/Beethoven_Visualisation.jpg differ
diff --git a/images/Bytehoven.jpeg b/images/Bytehoven.jpeg
new file mode 100644
index 0000000..8e3e9a3
Binary files /dev/null and b/images/Bytehoven.jpeg differ
diff --git a/images/Chopin_Visualisation.jpg b/images/Chopin_Visualisation.jpg
new file mode 100644
index 0000000..ea31586
Binary files /dev/null and b/images/Chopin_Visualisation.jpg differ
diff --git a/images/Filters.jpg b/images/Filters.jpg
new file mode 100644
index 0000000..6731205
Binary files /dev/null and b/images/Filters.jpg differ
diff --git a/run_ResNet50.py b/run_ResNet50.py
new file mode 100644
index 0000000..7e11777
--- /dev/null
+++ b/run_ResNet50.py
@@ -0,0 +1,83 @@
+import numpy as np
+from glob import glob
+from scipy import ndimage
+from keras import callbacks
+from keras.optimizers import Adamax, SGD, RMSprop
+
+import ResNet50
+
+def convert_to_one_hot(Y, C):
+    '''Converts array with labels to one-hot encoding
+
+    Keyword Arguments:
+    Y -- 1-dimensional numpy array containing labeled values
+    C -- total number of labels in Y
+    '''
+
+    Y = np.eye(C)[Y.reshape(-1)].T
+    return Y
+
+def load_dataset(datapath, composers):
+    '''Loads dataset into memory
+
+    Keyword Arguments:
+    datapath -- absolute or relative path to dataset location
+    composers -- list of composer names included in the dataset
+    '''
+
+    folders = glob('%s/*' %datapath)
+    X_train = []
+    Y_train = []
+
+    for folder in folders:
+        files = glob('%s\\*.jpg' %folder)
+        print('working on composer: %s' %(folder.split('\\')[-1]))
+        for f in files:
+            im = ndimage.imread(f, mode='L')
+            im = im/255
+            im = im.reshape(im.shape[0], im.shape[1], 1)
+            X_train.append(im)
+            Y_train.append(composers.index(folder.split('\\')[-1]))
+
+    return np.asarray(X_train), np.asarray(Y_train)
+
+if __name__ == '__main__':
+    print('setting model')
+    model = ResNet50.ResNet50(input_shape = (70, 400, 1), classes = 7)
+
+    epochs = 100
+    learning_rate = 0.001
+    lr_decay = 0.001/100
+
+    print('compiling model...')
+    #optimizer_instance = Adam(lr=learning_rate, decay=lr_decay)#lr=0.0005, beta_1=0.9, beta_2=0.999, epsilon=0.001)
+    #optimizer_instance = Adamax(lr=learning_rate, decay=lr_decay)
+    optimizer_instance = SGD(lr=learning_rate, decay=lr_decay)
+    #optimizer_instance = RMSprop(lr=learning_rate, decay=lr_decay)
+
+    model.compile(optimizer=optimizer_instance, loss='categorical_crossentropy', metrics=['acc'])
+
+    print('loading dataset......')
+    composers = ['Bach', 'Beethoven', 'Brahms', 'Chopin', 'Grieg', 'Liszt', 'Mozart']
+    datapath = 'Dataset_Train_Medium/'
+    X_train, Y_train = load_dataset(datapath, composers)
+
+    datapath_val = 'Dataset_Dev_Medium/'
+    X_test, Y_test = load_dataset(datapath_val, composers)
+
+    print('applying one-hot-encoding')
+    Y_train = convert_to_one_hot(Y_train, 7).T
+    Y_test = convert_to_one_hot(Y_test, 7).T
+
+    print('setting up callbacks...')
+    nancheck = callbacks.TerminateOnNaN()
+    filepath = 'Models/weights-improvement-{epoch:02d}-{acc:.2f}.hdf5'
+    saver = callbacks.ModelCheckpoint(filepath, monitor='acc', verbose=1, save_best_only=False, mode='max', period=1)
+    logger = callbacks.CSVLogger('model-weights/trainingresults.log')
+    callbacklist = [nancheck, saver, logger]
+
+    print('starting model fitting')
+    model.fit(X_train, Y_train, validation_data = (X_test, Y_test), epochs=epochs, batch_size=72, callbacks=callbacklist)
+
+    print('Saving model.........')
+    model.save('second_run.h5')
\ No newline at end of file