diff --git a/.gitignore b/.gitignore index 977860d6..44740a36 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ #dropbox stuff *.dropbox* +.idea/* # Byte-compiled / optimized / DLL files __pycache__/ @@ -58,3 +59,10 @@ docs/_build/ # PyBuilder target/ + +# Pycharm +.idea/* + + +#Notebook stuff +notebooks/.ipynb_checkpoints/ diff --git a/MLP2022_23_CW2_Spec.pdf b/MLP2022_23_CW2_Spec.pdf new file mode 100644 index 00000000..0fc690e7 Binary files /dev/null and b/MLP2022_23_CW2_Spec.pdf differ diff --git a/README.md b/README.md index e25cd6c9..8870d148 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # Machine Learning Practical -This repository contains the code for the University of Edinburgh [School of Informatics](http://www.inf.ed.ac.uk) course [Machine Learning Practical](http://www.inf.ed.ac.uk/teaching/courses/mlp/). +This repository contains the code for the University of Edinburgh [School of Informatics](http://www.inf.ed.ac.uk) course Machine Learning Practical. This assignment-based course is focused on the implementation and evaluation of machine learning systems. Students who do this course will have experience in the design, implementation, training, and evaluation of machine learning systems. @@ -16,3 +16,4 @@ If you are working remotely, follow this [guide](notes/remote-working-guide.md). ## Getting set up Detailed instructions for setting up a development environment for the course are given in [this file](notes/environment-set-up.md). Students doing the course will spend part of the first lab getting their own environment set up. 
+ diff --git a/VGG_08/result_outputs/summary.csv b/VGG_08/result_outputs/summary.csv new file mode 100644 index 00000000..d4c46384 --- /dev/null +++ b/VGG_08/result_outputs/summary.csv @@ -0,0 +1,102 @@ +train_acc,train_loss,val_acc,val_loss +0.010694736842105264,4.827323,0.024800000000000003,4.5659676 +0.03562105263157895,4.3888855,0.0604,4.136276 +0.0757684210526316,3.998175,0.09480000000000001,3.8678854 +0.10734736842105265,3.784943,0.12159999999999999,3.6687074 +0.13741052631578948,3.6023798,0.15439999999999998,3.4829779 +0.16888421052631578,3.4196754,0.1864,3.3093607 +0.1941263157894737,3.2674048,0.20720000000000002,3.2223148 +0.21861052631578948,3.139925,0.22880000000000003,3.1171055 +0.24134736842105264,3.0145736,0.24760000000000001,3.0554724 +0.26399999999999996,2.9004965,0.2552,2.9390912 +0.27898947368421056,2.815607,0.2764,2.9205213 +0.29532631578947366,2.7256868,0.2968,2.7410471 +0.31138947368421044,2.6567938,0.3016,2.7083752 +0.3236842105263158,2.595405,0.322,2.665904 +0.33486315789473686,2.5434496,0.3176,2.688214 +0.3462526315789474,2.5021079,0.33159999999999995,2.648656 +0.35381052631578946,2.4609485,0.342,2.5658453 +0.36157894736842106,2.4152951,0.34119999999999995,2.5403407 +0.36774736842105266,2.382958,0.3332,2.6936982 +0.37753684210526317,2.3510027,0.36160000000000003,2.4663532 +0.38597894736842114,2.319616,0.3608,2.4559999 +0.3912421052631579,2.294115,0.3732,2.3644555 +0.39840000000000003,2.2598042,0.3716,2.4516551 +0.4036,2.2318766,0.37439999999999996,2.4189563 +0.4105263157894737,2.2035582,0.3772,2.3899698 +0.41501052631578944,2.1830406,0.3876,2.3215945 +0.4193263157894737,2.158597,0.37800000000000006,2.3831298 +0.4211578947368421,2.148888,0.38160000000000005,2.3436418 +0.4260842105263159,2.1250536,0.39840000000000003,2.3471045 +0.4313684210526315,2.107519,0.4044,2.2744477 +0.4370526315789474,2.0837262,0.398,2.245617 +0.439642105263158,2.0691078,0.41200000000000003,2.216309 +0.4440842105263158,2.046351,0.4096,2.2329648 
+0.44696842105263157,2.0330904,0.4104,2.1841388 +0.4518105263157895,2.0200553,0.4244,2.1780539 +0.45298947368421055,2.0069249,0.42719999999999997,2.1625984 +0.4602105263157895,1.9896894,0.4204,2.2195568 +0.46023157894736844,1.9788533,0.4244,2.1803434 +0.46101052631578954,1.9693571,0.4128,2.1858895 +0.46774736842105263,1.9547894,0.4204,2.1908271 +0.4671157894736842,1.9390026,0.4244,2.1841395 +0.4698105263157895,1.924038,0.424,2.1843896 +0.4738736842105264,1.9161719,0.43,2.154806 +0.47541052631578945,1.9033127,0.4463999999999999,2.1130056 +0.48,1.8961077,0.44439999999999996,2.113019 +0.48456842105263154,1.8838875,0.43079999999999996,2.1191697 +0.4857263157894737,1.8711865,0.44920000000000004,2.1213412 +0.4887578947368421,1.8590263,0.44799999999999995,2.1077166 +0.49035789473684216,1.8479114,0.4428,2.0737479 +0.4908421052631579,1.845268,0.4436,2.07655 +0.4939368421052632,1.8336699,0.4548,2.0769904 +0.49924210526315793,1.8237538,0.4548,2.061769 +0.49677894736842104,1.8111013,0.44240000000000007,2.0676718 +0.5008842105263157,1.8031327,0.4548,2.0859065 +0.5,1.8026625,0.458,2.0704215 +0.5030736842105263,1.792004,0.4596,2.1113508 +0.505578947368421,1.7810374,0.45679999999999993,2.0382714 +0.5090315789473684,1.7691813,0.4444000000000001,2.0911386 +0.512042105263158,1.7633294,0.4616,2.0458508 +0.5142736842105263,1.7549652,0.4464,2.0786576 +0.5128421052631579,1.7518128,0.4656,2.026332 +0.518042105263158,1.7420768,0.46,2.0141299 +0.5182315789473684,1.7321203,0.45960000000000006,2.0226884 +0.5192842105263158,1.7264535,0.46279999999999993,2.0182638 +0.5217894736842105,1.7245325,0.46399999999999997,2.0110855 +0.5229684210526316,1.7184331,0.46679999999999994,2.0191038 +0.5227578947368421,1.7116771,0.4604,2.0334535 +0.5245894736842105,1.7009526,0.4692,2.0072439 +0.5262315789473684,1.6991171,0.4700000000000001,2.0296187 +0.5278526315789474,1.6958193,0.4708,1.9912667 +0.527157894736842,1.6907407,0.4736,2.006095 +0.5299578947368421,1.6808176,0.4715999999999999,2.012164 
+0.5313052631578947,1.676356,0.47239999999999993,1.9955354 +0.5338315789473685,1.6731659,0.47839999999999994,2.005768 +0.5336000000000001,1.662152,0.4672,2.015392 +0.5354736842105263,1.6638054,0.4692,1.9890119 +0.5397894736842105,1.6575475,0.4768,2.0090258 +0.5386526315789474,1.6595734,0.4824,1.9728817 +0.5376631578947368,1.6536722,0.4816,1.9769167 +0.5384842105263159,1.6495628,0.47600000000000003,1.9980135 +0.5380842105263157,1.6488388,0.478,1.9884782 +0.5393473684210528,1.6408547,0.48,1.9772192 +0.5415157894736843,1.632917,0.4828,1.9732709 +0.5394947368421052,1.6340653,0.4776,1.9623082 +0.5429052631578948,1.6340532,0.47759999999999997,1.9812362 +0.5452421052631579,1.6246406,0.48119999999999996,1.9846246 +0.5436210526315789,1.6288266,0.4864,1.9822198 +0.5437684210526316,1.6240481,0.48279999999999995,1.9768158 +0.546357894736842,1.6208181,0.4804,1.9625885 +0.5485052631578946,1.6164333,0.47839999999999994,1.9738724 +0.5466736842105263,1.6169226,0.47800000000000004,1.9842362 +0.547621052631579,1.6159856,0.4828,1.9709526 +0.5480421052631579,1.6175526,0.48560000000000003,1.967775 +0.5468421052631579,1.6149833,0.48119999999999996,1.9626708 +0.5493894736842105,1.6063902,0.4835999999999999,1.96621 +0.5490736842105263,1.6096952,0.48120000000000007,1.9742922 +0.5514736842105264,1.6084315,0.4867999999999999,1.9604725 +0.5489263157894737,1.6069487,0.4831999999999999,1.9733659 +0.5494947368421053,1.6030664,0.49079999999999996,1.9693874 +0.5516842105263158,1.6043342,0.486,1.9647765 +0.552442105263158,1.6039867,0.48480000000000006,1.9649359 diff --git a/VGG_08/result_outputs/test_summary.csv b/VGG_08/result_outputs/test_summary.csv new file mode 100644 index 00000000..f19765f0 --- /dev/null +++ b/VGG_08/result_outputs/test_summary.csv @@ -0,0 +1,2 @@ +test_acc,test_loss +0.49950000000000006,1.9105633 diff --git a/VGG_38/result_outputs/summary.csv b/VGG_38/result_outputs/summary.csv new file mode 100644 index 00000000..e5a38853 --- /dev/null +++ 
b/VGG_38/result_outputs/summary.csv @@ -0,0 +1,101 @@ +train_acc,train_loss,val_acc,val_loss +0.009263157894736843,4.8649125,0.0104,4.630689 +0.009810526315789474,4.6264124,0.009600000000000001,4.618983 +0.009705263157894738,4.621914,0.011200000000000002,4.6184525 +0.008989473684210525,4.619472,0.0064,4.6164784 +0.009747368421052633,4.6168556,0.0076,4.6138463 +0.00951578947368421,4.6156826,0.0108,4.6139345 +0.009789473684210525,4.614809,0.008400000000000001,4.6116896 +0.009936842105263159,4.613147,0.0104,4.6148276 +0.009810526315789474,4.612325,0.0076,4.6123877 +0.009094736842105263,4.6117926,0.007200000000000001,4.6149993 +0.008421052631578947,4.611283,0.011600000000000001,4.6114736 +0.009010526315789472,4.6105323,0.009600000000000001,4.607559 +0.009894736842105263,4.6103206,0.008400000000000001,4.6086206 +0.00934736842105263,4.6095214,0.011200000000000002,4.6091933 +0.009473684210526316,4.6095295,0.008,4.6095695 +0.010252631578947369,4.609189,0.0104,4.610459 +0.009536842105263158,4.6087623,0.0092,4.6091356 +0.00848421052631579,4.6086617,0.009600000000000001,4.609126 +0.008421052631578947,4.6083455,0.011200000000000002,4.6088147 +0.009410526315789473,4.608145,0.0068000000000000005,4.608519 +0.009263157894736843,4.6078997,0.0092,4.6085033 +0.009389473684210526,4.607453,0.01,4.6083508 +0.008989473684210528,4.6075597,0.008400000000000001,4.6073136 +0.009326315789473686,4.607266,0.008,4.6069093 +0.01,4.607154,0.0076,4.6069508 +0.008778947368421053,4.607089,0.011200000000000002,4.60659 +0.009326315789473684,4.606807,0.0068,4.6072598 +0.009031578947368422,4.6068263,0.011200000000000002,4.607257 +0.008842105263157896,4.6066294,0.008,4.606883 +0.008968421052631579,4.606647,0.006400000000000001,4.607275 +0.008947368421052631,4.6065364,0.0092,4.606976 +0.008842105263157896,4.6064167,0.0076,4.607016 +0.008799999999999999,4.606425,0.0096,4.607184 +0.009326315789473686,4.606305,0.0072,4.6068683 +0.00905263157894737,4.606274,0.0072,4.606982 
+0.00934736842105263,4.6062336,0.007200000000000001,4.607209 +0.009221052631578948,4.606221,0.0076,4.607369 +0.009557894736842105,4.60607,0.0076,4.6074376 +0.009073684210526317,4.6061006,0.0072,4.607068 +0.009242105263157895,4.606005,0.0064,4.6067224 +0.009957894736842107,4.605986,0.0072,4.6068263 +0.009052631578947368,4.605935,0.0072,4.6067867 +0.008694736842105264,4.6059127,0.0064,4.6070905 +0.009536842105263158,4.605874,0.006400000000000001,4.606976 +0.009663157894736842,4.605872,0.0072,4.6068897 +0.008821052631578948,4.6057997,0.0064,4.607028 +0.009768421052631579,4.605778,0.0072,4.6069264 +0.0092,4.6057644,0.007200000000000001,4.607018 +0.008926315789473685,4.6057386,0.0072,4.60698 +0.008989473684210525,4.6057277,0.0064,4.6070237 +0.009242105263157895,4.6057053,0.0064,4.6069183 +0.009094736842105263,4.605692,0.006400000000000001,4.6068764 +0.009473684210526316,4.60566,0.0064,4.606909 +0.009494736842105262,4.605613,0.0064,4.606978 +0.009747368421052631,4.6056285,0.0064,4.606753 +0.009789473684210527,4.605578,0.006400000000000001,4.6068797 +0.009199999999999998,4.6055675,0.0064,4.606888 +0.009073684210526317,4.6055593,0.0064,4.606874 +0.008821052631578948,4.6055293,0.006400000000000001,4.606851 +0.009326315789473684,4.6055255,0.0064,4.606871 +0.009557894736842105,4.6055083,0.006400000000000001,4.606851 +0.009600000000000001,4.605491,0.0064,4.6068635 +0.00856842105263158,4.605466,0.0064,4.606862 +0.009894736842105263,4.605463,0.006400000000000001,4.6068873 +0.009494736842105262,4.605441,0.0064,4.6068926 +0.008673684210526314,4.6054277,0.0064,4.6068554 +0.009221052631578948,4.6054296,0.0063999999999999994,4.6068907 +0.008989473684210528,4.605404,0.0064,4.6068807 +0.00928421052631579,4.6053905,0.006400000000000001,4.6068707 +0.0092,4.6053743,0.0064,4.606894 +0.008989473684210525,4.605368,0.0064,4.606845 +0.009515789473684212,4.605355,0.0064,4.6068635 +0.009073684210526317,4.605352,0.0064,4.6068773 +0.009642105263157895,4.6053243,0.0064,4.606883 
+0.009747368421052633,4.6053176,0.0064,4.6069 +0.009873684210526316,4.6053023,0.0064,4.6068873 +0.009536842105263156,4.605297,0.0064,4.6068654 +0.009515789473684212,4.6052866,0.0064,4.6068883 +0.009978947368421053,4.605265,0.006400000000000001,4.606894 +0.009957894736842107,4.605259,0.0064,4.6068826 +0.009410526315789475,4.6052504,0.0064,4.6068697 +0.01002105263157895,4.6052403,0.006400000000000001,4.6068807 +0.01002105263157895,4.6052313,0.0064,4.606872 +0.00951578947368421,4.605224,0.0064,4.6068883 +0.009852631578947368,4.605219,0.006400000000000001,4.606871 +0.009894736842105265,4.605209,0.0064,4.606871 +0.00922105263157895,4.605204,0.0064,4.6068654 +0.010042105263157896,4.605193,0.0064,4.6068764 +0.009978947368421053,4.6051874,0.006400000000000001,4.6068697 +0.009747368421052633,4.605183,0.0064,4.6068673 +0.010189473684210526,4.605178,0.0064,4.606873 +0.009789473684210527,4.605173,0.0064,4.6068773 +0.009936842105263159,4.605169,0.0064,4.606874 +0.010042105263157894,4.605166,0.0064,4.606877 +0.009494736842105262,4.6051593,0.0064,4.606874 +0.009536842105263158,4.6051593,0.0063999999999999994,4.606874 +0.010021052631578946,4.6051564,0.006400000000000001,4.6068716 +0.009747368421052631,4.605154,0.0064,4.6068726 +0.009642105263157895,4.605153,0.0064,4.606872 +0.009305263157894737,4.6051517,0.0064,4.6068726 diff --git a/VGG_38/result_outputs/test_summary.csv b/VGG_38/result_outputs/test_summary.csv new file mode 100644 index 00000000..bf44c98a --- /dev/null +++ b/VGG_38/result_outputs/test_summary.csv @@ -0,0 +1,2 @@ +test_acc,test_loss +0.01,4.608619 diff --git a/data/ccpp_data.npz b/data/ccpp_data.npz new file mode 100644 index 00000000..a507ba23 Binary files /dev/null and b/data/ccpp_data.npz differ diff --git a/data/emnist-test.npz b/data/emnist-test.npz new file mode 100644 index 00000000..05df1d80 Binary files /dev/null and b/data/emnist-test.npz differ diff --git a/data/emnist-train.npz b/data/emnist-train.npz new file mode 100644 index 00000000..177a30cd 
Binary files /dev/null and b/data/emnist-train.npz differ diff --git a/data/emnist-valid.npz b/data/emnist-valid.npz new file mode 100644 index 00000000..87183dd9 Binary files /dev/null and b/data/emnist-valid.npz differ diff --git a/install.sh b/install.sh new file mode 100644 index 00000000..c43e5e95 --- /dev/null +++ b/install.sh @@ -0,0 +1,2 @@ +conda install tqdm +conda install pytorch torchvision cudatoolkit=10.1 -c pytorch \ No newline at end of file diff --git a/mlp/__init__.py b/mlp/__init__.py index b41e6673..73c9478f 100644 --- a/mlp/__init__.py +++ b/mlp/__init__.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- """Machine Learning Practical package.""" -__authors__ = ['Pawel Swietojanski', 'Steve Renals', 'Matt Graham'] +__authors__ = ['Pawel Swietojanski', 'Steve Renals', 'Matt Graham', 'Antreas Antoniou'] DEFAULT_SEED = 123456 # Default random number generator seed if none provided. diff --git a/mlp/data_providers.py b/mlp/data_providers.py index cd486a52..a4c72a00 100644 --- a/mlp/data_providers.py +++ b/mlp/data_providers.py @@ -7,8 +7,17 @@ import pickle import gzip +import sys + import numpy as np import os + +from PIL import Image +from torch.utils import data +from torch.utils.data import Dataset +from torchvision import transforms +from torchvision.datasets.utils import download_url, check_integrity + from mlp import DEFAULT_SEED @@ -35,23 +44,54 @@ def __init__(self, inputs, targets, batch_size, max_num_batches=-1, """ self.inputs = inputs self.targets = targets - self.batch_size = batch_size - assert max_num_batches != 0 and not max_num_batches < -1, ( - 'max_num_batches should be -1 or > 0') - self.max_num_batches = max_num_batches + if batch_size < 1: + raise ValueError('batch_size must be >= 1') + self._batch_size = batch_size + if max_num_batches == 0 or max_num_batches < -1: + raise ValueError('max_num_batches must be -1 or > 0') + self._max_num_batches = max_num_batches + self._update_num_batches() + self.shuffle_order = shuffle_order + 
self._current_order = np.arange(inputs.shape[0]) + if rng is None: + rng = np.random.RandomState(DEFAULT_SEED) + self.rng = rng + self.new_epoch() + + @property + def batch_size(self): + """Number of data points to include in each batch.""" + return self._batch_size + + @batch_size.setter + def batch_size(self, value): + if value < 1: + raise ValueError('batch_size must be >= 1') + self._batch_size = value + self._update_num_batches() + + @property + def max_num_batches(self): + """Maximum number of batches to iterate over in an epoch.""" + return self._max_num_batches + + @max_num_batches.setter + def max_num_batches(self, value): + if value == 0 or value < -1: + raise ValueError('max_num_batches must be -1 or > 0') + self._max_num_batches = value + self._update_num_batches() + + def _update_num_batches(self): + """Updates number of batches to iterate over.""" # maximum possible number of batches is equal to number of whole times # batch_size divides in to the number of data points which can be # found using integer division - possible_num_batches = self.inputs.shape[0] // batch_size + possible_num_batches = self.inputs.shape[0] // self.batch_size if self.max_num_batches == -1: self.num_batches = possible_num_batches else: self.num_batches = min(self.max_num_batches, possible_num_batches) - self.shuffle_order = shuffle_order - if rng is None: - rng = np.random.RandomState(DEFAULT_SEED) - self.rng = rng - self.reset() def __iter__(self): """Implements Python iterator interface. 
@@ -63,24 +103,36 @@ def __iter__(self): """ return self - def reset(self): - """Resets the provider to the initial state to use in a new epoch.""" + def new_epoch(self): + """Starts a new epoch (pass through data), possibly shuffling first.""" self._curr_batch = 0 if self.shuffle_order: self.shuffle() + def __next__(self): + return self.next() + + def reset(self): + """Resets the provider to the initial state.""" + inv_perm = np.argsort(self._current_order) + self._current_order = self._current_order[inv_perm] + self.inputs = self.inputs[inv_perm] + self.targets = self.targets[inv_perm] + self.new_epoch() + def shuffle(self): """Randomly shuffles order of data.""" - new_order = self.rng.permutation(self.inputs.shape[0]) - self.inputs = self.inputs[new_order] - self.targets = self.targets[new_order] + perm = self.rng.permutation(self.inputs.shape[0]) + self._current_order = self._current_order[perm] + self.inputs = self.inputs[perm] + self.targets = self.targets[perm] def next(self): """Returns next data batch or raises `StopIteration` if at end.""" if self._curr_batch + 1 > self.num_batches: - # no more batches in current iteration through data set so reset - # the dataset for another pass and indicate iteration is at end - self.reset() + # no more batches in current iteration through data set so start + # new epoch ready for another pass and indicate iteration is at end + self.new_epoch() raise StopIteration() # create an index slice corresponding to current batch number batch_slice = slice(self._curr_batch * self.batch_size, @@ -90,7 +142,6 @@ def next(self): self._curr_batch += 1 return inputs_batch, targets_batch - class MNISTDataProvider(DataProvider): """Data provider for MNIST handwritten digit images.""" @@ -111,7 +162,7 @@ def __init__(self, which_set='train', batch_size=100, max_num_batches=-1, rng (RandomState): A seeded random number generator. 
""" # check a valid which_set was provided - assert which_set in ['train', 'valid', 'eval'], ( + assert which_set in ['train', 'valid', 'test'], ( 'Expected which_set to be either train, valid or eval. ' 'Got {0}'.format(which_set) ) @@ -133,13 +184,10 @@ def __init__(self, which_set='train', batch_size=100, max_num_batches=-1, super(MNISTDataProvider, self).__init__( inputs, targets, batch_size, max_num_batches, shuffle_order, rng) - # def next(self): - # """Returns next data batch or raises `StopIteration` if at end.""" - # inputs_batch, targets_batch = super(MNISTDataProvider, self).next() - # return inputs_batch, self.to_one_of_k(targets_batch) - # - def __next__(self): - return self.next() + def next(self): + """Returns next data batch or raises `StopIteration` if at end.""" + inputs_batch, targets_batch = super(MNISTDataProvider, self).next() + return inputs_batch, self.to_one_of_k(targets_batch) def to_one_of_k(self, int_targets): """Converts integer coded class target to 1 of K coded targets. @@ -156,15 +204,89 @@ def to_one_of_k(self, int_targets): to zero except for the column corresponding to the correct class which is equal to one. """ - raise NotImplementedError() + one_of_k_targets = np.zeros((int_targets.shape[0], self.num_classes)) + one_of_k_targets[range(int_targets.shape[0]), int_targets] = 1 + return one_of_k_targets +class EMNISTDataProvider(DataProvider): + """Data provider for EMNIST handwritten digit images.""" + + def __init__(self, which_set='train', batch_size=100, max_num_batches=-1, + shuffle_order=True, rng=None, flatten=False): + """Create a new EMNIST data provider object. + + Args: + which_set: One of 'train', 'valid' or 'eval'. Determines which + portion of the EMNIST data this object should provide. + batch_size (int): Number of data points to include in each batch. + max_num_batches (int): Maximum number of batches to iterate over + in an epoch. 
If `max_num_batches * batch_size > num_data` then + only as many batches as the data can be split into will be + used. If set to -1 all of the data will be used. + shuffle_order (bool): Whether to randomly permute the order of + the data before each epoch. + rng (RandomState): A seeded random number generator. + """ + # check a valid which_set was provided + assert which_set in ['train', 'valid', 'test'], ( + 'Expected which_set to be either train, valid or eval. ' + 'Got {0}'.format(which_set) + ) + self.which_set = which_set + self.num_classes = 47 + # construct path to data using os.path.join to ensure the correct path + # separator for the current platform / OS is used + # MLP_DATA_DIR environment variable should point to the data directory + data_path = os.path.join( + os.environ['MLP_DATA_DIR'], 'emnist-{0}.npz'.format(which_set)) + assert os.path.isfile(data_path), ( + 'Data file does not exist at expected path: ' + data_path + ) + # load data from compressed numpy file + loaded = np.load(data_path) + print(loaded.keys()) + inputs, targets = loaded['inputs'], loaded['targets'] + inputs = inputs.astype(np.float32) + targets = targets.astype(np.int) + if flatten: + inputs = np.reshape(inputs, newshape=(-1, 28*28)) + else: + inputs = np.reshape(inputs, newshape=(-1, 28, 28, 1)) + inputs = inputs / 255.0 + # pass the loaded data to the parent class __init__ + super(EMNISTDataProvider, self).__init__( + inputs, targets, batch_size, max_num_batches, shuffle_order, rng) + + def next(self): + """Returns next data batch or raises `StopIteration` if at end.""" + inputs_batch, targets_batch = super(EMNISTDataProvider, self).next() + return inputs_batch, self.to_one_of_k(targets_batch) + + def to_one_of_k(self, int_targets): + """Converts integer coded class target to 1 of K coded targets. + + Args: + int_targets (ndarray): Array of integer coded class targets (i.e. + where an integer from 0 to `num_classes` - 1 is used to + indicate which is the correct class). 
This should be of shape + (num_data,). + + Returns: + Array of 1 of K coded targets i.e. an array of shape + (num_data, num_classes) where for each row all elements are equal + to zero except for the column corresponding to the correct class + which is equal to one. + """ + one_of_k_targets = np.zeros((int_targets.shape[0], self.num_classes)) + one_of_k_targets[range(int_targets.shape[0]), int_targets] = 1 + return one_of_k_targets class MetOfficeDataProvider(DataProvider): """South Scotland Met Office weather data provider.""" def __init__(self, window_size, batch_size=10, max_num_batches=-1, shuffle_order=True, rng=None): - """Create a new Met Offfice data provider object. + """Create a new Met Office data provider object. Args: window_size (int): Size of windows to split weather time series @@ -180,27 +302,445 @@ def __init__(self, window_size, batch_size=10, max_num_batches=-1, the data before each epoch. rng (RandomState): A seeded random number generator. """ - self.window_size = window_size - assert window_size > 1, 'window_size must be at least 2.' data_path = os.path.join( os.environ['MLP_DATA_DIR'], 'HadSSP_daily_qc.txt') assert os.path.isfile(data_path), ( 'Data file does not exist at expected path: ' + data_path ) - # load raw data from text file - # ... + raw = np.loadtxt(data_path, skiprows=3, usecols=range(2, 32)) + assert window_size > 1, 'window_size must be at least 2.' + self.window_size = window_size # filter out all missing datapoints and flatten to a vector - # ... + filtered = raw[raw >= 0].flatten() # normalise data to zero mean, unit standard deviation - # ... - # convert from flat sequence to windowed data - # ... 
+ mean = np.mean(filtered) + std = np.std(filtered) + normalised = (filtered - mean) / std + # create a view on to array corresponding to a rolling window + shape = (normalised.shape[-1] - self.window_size + 1, self.window_size) + strides = normalised.strides + (normalised.strides[-1],) + windowed = np.lib.stride_tricks.as_strided( + normalised, shape=shape, strides=strides) # inputs are first (window_size - 1) entries in windows - # inputs = ... + inputs = windowed[:, :-1] # targets are last entry in windows - # targets = ... - # initialise base class with inputs and targets arrays - # super(MetOfficeDataProvider, self).__init__( - # inputs, targets, batch_size, max_num_batches, shuffle_order, rng) - def __next__(self): - return self.next() \ No newline at end of file + targets = windowed[:, -1] + super(MetOfficeDataProvider, self).__init__( + inputs, targets, batch_size, max_num_batches, shuffle_order, rng) + +class CCPPDataProvider(DataProvider): + + def __init__(self, which_set='train', input_dims=None, batch_size=10, + max_num_batches=-1, shuffle_order=True, rng=None): + """Create a new Combined Cycle Power Plant data provider object. + + Args: + which_set: One of 'train' or 'valid'. Determines which portion of + data this object should provide. + input_dims: Which of the four input dimension to use. If `None` all + are used. If an iterable of integers are provided (consisting + of a subset of {0, 1, 2, 3}) then only the corresponding + input dimensions are included. + batch_size (int): Number of data points to include in each batch. + max_num_batches (int): Maximum number of batches to iterate over + in an epoch. If `max_num_batches * batch_size > num_data` then + only as many batches as the data can be split into will be + used. If set to -1 all of the data will be used. + shuffle_order (bool): Whether to randomly permute the order of + the data before each epoch. + rng (RandomState): A seeded random number generator. 
+ """ + data_path = os.path.join( + os.environ['MLP_DATA_DIR'], 'ccpp_data.npz') + assert os.path.isfile(data_path), ( + 'Data file does not exist at expected path: ' + data_path + ) + # check a valid which_set was provided + assert which_set in ['train', 'valid'], ( + 'Expected which_set to be either train or valid ' + 'Got {0}'.format(which_set) + ) + # check input_dims are valid + if not input_dims is not None: + input_dims = set(input_dims) + assert input_dims.issubset({0, 1, 2, 3}), ( + 'input_dims should be a subset of {0, 1, 2, 3}' + ) + loaded = np.load(data_path) + inputs = loaded[which_set + '_inputs'] + if input_dims is not None: + inputs = inputs[:, input_dims] + targets = loaded[which_set + '_targets'] + super(CCPPDataProvider, self).__init__( + inputs, targets, batch_size, max_num_batches, shuffle_order, rng) + +class EMNISTPytorchDataProvider(Dataset): + def __init__(self, which_set='train', batch_size=100, max_num_batches=-1, + shuffle_order=True, rng=None, flatten=False, transforms=None): + self.numpy_data_provider = EMNISTDataProvider(which_set=which_set, batch_size=batch_size, max_num_batches=max_num_batches, + shuffle_order=shuffle_order, rng=rng, flatten=flatten) + self.transforms = transforms + + def __getitem__(self, item): + x = self.numpy_data_provider.inputs[item] + for augmentation in self.transforms: + x = augmentation(x) + return x, int(self.numpy_data_provider.targets[item]) + + def __len__(self): + return len(self.numpy_data_provider.targets) + +class AugmentedMNISTDataProvider(MNISTDataProvider): + """Data provider for MNIST dataset which randomly transforms images.""" + + def __init__(self, which_set='train', batch_size=100, max_num_batches=-1, + shuffle_order=True, rng=None, transformer=None): + """Create a new augmented MNIST data provider object. + + Args: + which_set: One of 'train', 'valid' or 'test'. Determines which + portion of the MNIST data this object should provide. 
+ batch_size (int): Number of data points to include in each batch. + max_num_batches (int): Maximum number of batches to iterate over + in an epoch. If `max_num_batches * batch_size > num_data` then + only as many batches as the data can be split into will be + used. If set to -1 all of the data will be used. + shuffle_order (bool): Whether to randomly permute the order of + the data before each epoch. + rng (RandomState): A seeded random number generator. + transformer: Function which takes an `inputs` array of shape + (batch_size, input_dim) corresponding to a batch of input + images and a `rng` random number generator object (i.e. a + call signature `transformer(inputs, rng)`) and applies a + potentiall random set of transformations to some / all of the + input images as each new batch is returned when iterating over + the data provider. + """ + super(AugmentedMNISTDataProvider, self).__init__( + which_set, batch_size, max_num_batches, shuffle_order, rng) + self.transformer = transformer + + def next(self): + """Returns next data batch or raises `StopIteration` if at end.""" + inputs_batch, targets_batch = super( + AugmentedMNISTDataProvider, self).next() + transformed_inputs_batch = self.transformer(inputs_batch, self.rng) + return transformed_inputs_batch, targets_batch + +class Omniglot(data.Dataset): + """`CIFAR10 `_ Dataset. + Args: + root (string): Root directory of dataset where directory + ``cifar-10-batches-py`` exists or will be saved to if download is set to True. + train (bool, optional): If True, creates dataset from training set, otherwise + creates from test set. + transform (callable, optional): A function/transform that takes in an PIL image + and returns a transformed version. E.g, ``transforms.RandomCrop`` + target_transform (callable, optional): A function/transform that takes in the + target and transforms it. + download (bool, optional): If true, downloads the dataset from the internet and + puts it in root directory. 
If dataset is already downloaded, it is not + downloaded again. + """ + def collect_data_paths(self, root): + data_dict = dict() + print(root) + for subdir, dir, files in os.walk(root): + for file in files: + if file.endswith('.png'): + filepath = os.path.join(subdir, file) + class_label = '_'.join(subdir.split("/")[-2:]) + if class_label in data_dict: + data_dict[class_label].append(filepath) + else: + data_dict[class_label] = [filepath] + + return data_dict + + def __init__(self, root, set_name, + transform=None, target_transform=None, + download=False): + self.root = os.path.expanduser(root) + self.root = os.path.abspath(os.path.join(self.root, 'omniglot_dataset')) + self.transform = transform + self.target_transform = target_transform + self.set_name = set_name # training set or test set + self.data_dict = self.collect_data_paths(root=self.root) + + x = [] + label_to_idx = {label: idx for idx, label in enumerate(self.data_dict.keys())} + y = [] + + for key, value in self.data_dict.items(): + x.extend(value) + y.extend(len(value) * [label_to_idx[key]]) + + y = np.array(y) + + + rng = np.random.RandomState(seed=0) + + idx = np.arange(len(x)) + rng.shuffle(idx) + + x = [x[current_idx] for current_idx in idx] + y = y[idx] + + train_sample_idx = rng.choice(a=[i for i in range(len(x))], size=int(len(x) * 0.80), replace=False) + evaluation_sample_idx = [i for i in range(len(x)) if i not in train_sample_idx] + validation_sample_idx = rng.choice(a=[i for i in range(len(evaluation_sample_idx))], size=int(len(evaluation_sample_idx) * 0.40), replace=False) + test_sample_idx = [i for i in range(len(evaluation_sample_idx)) if i not in evaluation_sample_idx] + + if self.set_name is 'train': + self.data = [item for idx, item in enumerate(x) if idx in train_sample_idx] + self.labels = y[train_sample_idx] + + elif self.set_name is 'val': + self.data = [item for idx, item in enumerate(x) if idx in validation_sample_idx] + self.labels = y[validation_sample_idx] + + else: + 
self.data = [item for idx, item in enumerate(x) if idx in test_sample_idx] + self.labels = y[test_sample_idx] + + def __getitem__(self, index): + """ + Args: + index (int): Index + Returns: + tuple: (image, target) where target is index of the target class. + """ + img, target = self.data[index], self.labels[index] + + img = Image.open(img) + img.show() + + if self.transform is not None: + img = self.transform(img) + + if self.target_transform is not None: + target = self.target_transform(target) + + return img, target + + def __len__(self): + return len(self.data) + + + def __repr__(self): + fmt_str = 'Dataset ' + self.__class__.__name__ + '\n' + fmt_str += ' Number of datapoints: {}\n'.format(self.__len__()) + tmp = self.set_name + fmt_str += ' Split: {}\n'.format(tmp) + fmt_str += ' Root Location: {}\n'.format(self.root) + tmp = ' Transforms (if any): ' + fmt_str += '{0}{1}\n'.format(tmp, self.transform.__repr__().replace('\n', '\n' + ' ' * len(tmp))) + tmp = ' Target Transforms (if any): ' + fmt_str += '{0}{1}'.format(tmp, self.target_transform.__repr__().replace('\n', '\n' + ' ' * len(tmp))) + return fmt_str + +class CIFAR10(data.Dataset): + """`CIFAR10 `_ Dataset. + Args: + root (string): Root directory of dataset where directory + ``cifar-10-batches-py`` exists or will be saved to if download is set to True. + train (bool, optional): If True, creates dataset from training set, otherwise + creates from test set. + transform (callable, optional): A function/transform that takes in an PIL image + and returns a transformed version. E.g, ``transforms.RandomCrop`` + target_transform (callable, optional): A function/transform that takes in the + target and transforms it. + download (bool, optional): If true, downloads the dataset from the internet and + puts it in root directory. If dataset is already downloaded, it is not + downloaded again. 
+ """ + base_folder = 'cifar-10-batches-py' + url = "https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz" + filename = "cifar-10-python.tar.gz" + tgz_md5 = 'c58f30108f718f92721af3b95e74349a' + train_list = [ + ['data_batch_1', 'c99cafc152244af753f735de768cd75f'], + ['data_batch_2', 'd4bba439e000b95fd0a9bffe97cbabec'], + ['data_batch_3', '54ebc095f3ab1f0389bbae665268c751'], + ['data_batch_4', '634d18415352ddfa80567beed471001a'], + ['data_batch_5', '482c414d41f54cd18b22e5b47cb7c3cb'], + ] + + test_list = [ + ['test_batch', '40351d587109b95175f43aff81a1287e'], + ] + + def __init__(self, root, set_name, + transform=None, target_transform=None, + download=False): + self.root = os.path.expanduser(root) + self.transform = transform + self.target_transform = target_transform + self.set_name = set_name # training set or test set + + if download: + self.download() + + if not self._check_integrity(): + raise RuntimeError('Dataset not found or corrupted.' + + ' You can use download=True to download it') + + # now load the picked numpy arrays + rng = np.random.RandomState(seed=0) + + train_sample_idx = rng.choice(a=[i for i in range(50000)], size=47500, replace=False) + val_sample_idx = [i for i in range(50000) if i not in train_sample_idx] + + if self.set_name is 'train': + self.data = [] + self.labels = [] + for fentry in self.train_list: + f = fentry[0] + file = os.path.join(self.root, self.base_folder, f) + fo = open(file, 'rb') + if sys.version_info[0] == 2: + entry = pickle.load(fo) + else: + entry = pickle.load(fo, encoding='latin1') + self.data.append(entry['data']) + if 'labels' in entry: + self.labels += entry['labels'] + else: + self.labels += entry['fine_labels'] + fo.close() + + self.data = np.concatenate(self.data) + + self.data = self.data.reshape((50000, 3, 32, 32)) + self.data = self.data.transpose((0, 2, 3, 1)) # convert to HWC + self.data = self.data[train_sample_idx] + self.labels = np.array(self.labels)[train_sample_idx] + print(set_name, 
self.data.shape) + print(set_name, self.labels.shape) + + elif self.set_name is 'val': + self.data = [] + self.labels = [] + for fentry in self.train_list: + f = fentry[0] + file = os.path.join(self.root, self.base_folder, f) + fo = open(file, 'rb') + if sys.version_info[0] == 2: + entry = pickle.load(fo) + else: + entry = pickle.load(fo, encoding='latin1') + self.data.append(entry['data']) + if 'labels' in entry: + self.labels += entry['labels'] + else: + self.labels += entry['fine_labels'] + fo.close() + + self.data = np.concatenate(self.data) + self.data = self.data.reshape((50000, 3, 32, 32)) + self.data = self.data.transpose((0, 2, 3, 1)) # convert to HWC + self.data = self.data[val_sample_idx] + self.labels = np.array(self.labels)[val_sample_idx] + print(set_name, self.data.shape) + print(set_name, self.labels.shape) + + else: + f = self.test_list[0][0] + file = os.path.join(self.root, self.base_folder, f) + fo = open(file, 'rb') + if sys.version_info[0] == 2: + entry = pickle.load(fo) + else: + entry = pickle.load(fo, encoding='latin1') + self.data = entry['data'] + if 'labels' in entry: + self.labels = entry['labels'] + else: + self.labels = entry['fine_labels'] + fo.close() + self.data = self.data.reshape((10000, 3, 32, 32)) + self.data = self.data.transpose((0, 2, 3, 1)) # convert to HWC + self.labels = np.array(self.labels) + print(set_name, self.data.shape) + print(set_name, self.labels.shape) + + def __getitem__(self, index): + """ + Args: + index (int): Index + Returns: + tuple: (image, target) where target is index of the target class. 
+ """ + img, target = self.data[index], self.labels[index] + + # doing this so that it is consistent with all other datasets + # to return a PIL Image + + img = Image.fromarray(img) + + if self.transform is not None: + img = self.transform(img) + + if self.target_transform is not None: + target = self.target_transform(target) + + return img, target + + def __len__(self): + return len(self.data) + + def _check_integrity(self): + root = self.root + for fentry in (self.train_list + self.test_list): + filename, md5 = fentry[0], fentry[1] + fpath = os.path.join(root, self.base_folder, filename) + if not check_integrity(fpath, md5): + return False + return True + + def download(self): + import tarfile + + if self._check_integrity(): + print('Files already downloaded and verified') + return + + root = self.root + download_url(self.url, root, self.filename, self.tgz_md5) + + # extract file + cwd = os.getcwd() + tar = tarfile.open(os.path.join(root, self.filename), "r:gz") + os.chdir(root) + tar.extractall() + tar.close() + os.chdir(cwd) + + def __repr__(self): + fmt_str = 'Dataset ' + self.__class__.__name__ + '\n' + fmt_str += ' Number of datapoints: {}\n'.format(self.__len__()) + tmp = self.set_name + fmt_str += ' Split: {}\n'.format(tmp) + fmt_str += ' Root Location: {}\n'.format(self.root) + tmp = ' Transforms (if any): ' + fmt_str += '{0}{1}\n'.format(tmp, self.transform.__repr__().replace('\n', '\n' + ' ' * len(tmp))) + tmp = ' Target Transforms (if any): ' + fmt_str += '{0}{1}'.format(tmp, self.target_transform.__repr__().replace('\n', '\n' + ' ' * len(tmp))) + return fmt_str + + +class CIFAR100(CIFAR10): + """`CIFAR100 `_ Dataset. + This is a subclass of the `CIFAR10` Dataset. 
+ """ + base_folder = 'cifar-100-python' + url = "https://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz" + filename = "cifar-100-python.tar.gz" + tgz_md5 = 'eb9058c3a382ffc7106e4002c42a8d85' + train_list = [ + ['train', '16019d7e3df5f24257cddd939b257f8d'], + ] + + test_list = [ + ['test', 'f0ef6b0ae62326f3e7ffdfab6717acfc'], + ] \ No newline at end of file diff --git a/mlp/errors.py b/mlp/errors.py new file mode 100644 index 00000000..3f0ae4f7 --- /dev/null +++ b/mlp/errors.py @@ -0,0 +1,176 @@ +# -*- coding: utf-8 -*- +"""Error functions. + +This module defines error functions, with the aim of model training being to +minimise the error function given a set of inputs and target outputs. + +The error functions will typically measure some concept of distance between the +model outputs and target outputs, averaged over all data points in the data set +or batch. +""" + +import numpy as np + + +class SumOfSquaredDiffsError(object): + """Sum of squared differences (squared Euclidean distance) error.""" + + def __call__(self, outputs, targets): + """Calculates error function given a batch of outputs and targets. + + Args: + outputs: Array of model outputs of shape (batch_size, output_dim). + targets: Array of target outputs of shape (batch_size, output_dim). + + Returns: + Scalar cost function value. + """ + return 0.5 * np.mean(np.sum((outputs - targets)**2, axis=1)) + + def grad(self, outputs, targets): + """Calculates gradient of error function with respect to outputs. + + Args: + outputs: Array of model outputs of shape (batch_size, output_dim). + targets: Array of target outputs of shape (batch_size, output_dim). + + Returns: + Gradient of error function with respect to outputs. 
+ """ + return (outputs - targets) / outputs.shape[0] + + def __repr__(self): + return 'MeanSquaredErrorCost' + + +class BinaryCrossEntropyError(object): + """Binary cross entropy error.""" + + def __call__(self, outputs, targets): + """Calculates error function given a batch of outputs and targets. + + Args: + outputs: Array of model outputs of shape (batch_size, output_dim). + targets: Array of target outputs of shape (batch_size, output_dim). + + Returns: + Scalar error function value. + """ + return -np.mean( + targets * np.log(outputs) + (1. - targets) * np.log(1. - outputs)) + + def grad(self, outputs, targets): + """Calculates gradient of error function with respect to outputs. + + Args: + outputs: Array of model outputs of shape (batch_size, output_dim). + targets: Array of target outputs of shape (batch_size, output_dim). + + Returns: + Gradient of error function with respect to outputs. + """ + return ((1. - targets) / (1. - outputs) - + (targets / outputs)) / outputs.shape[0] + + def __repr__(self): + return 'BinaryCrossEntropyError' + + +class BinaryCrossEntropySigmoidError(object): + """Binary cross entropy error with logistic sigmoid applied to outputs.""" + + def __call__(self, outputs, targets): + """Calculates error function given a batch of outputs and targets. + + Args: + outputs: Array of model outputs of shape (batch_size, output_dim). + targets: Array of target outputs of shape (batch_size, output_dim). + + Returns: + Scalar error function value. + """ + probs = 1. / (1. + np.exp(-outputs)) + return -np.mean( + targets * np.log(probs) + (1. - targets) * np.log(1. - probs)) + + def grad(self, outputs, targets): + """Calculates gradient of error function with respect to outputs. + + Args: + outputs: Array of model outputs of shape (batch_size, output_dim). + targets: Array of target outputs of shape (batch_size, output_dim). + + Returns: + Gradient of error function with respect to outputs. + """ + probs = 1. / (1. 
+ np.exp(-outputs)) + return (probs - targets) / outputs.shape[0] + + def __repr__(self): + return 'BinaryCrossEntropySigmoidError' + + +class CrossEntropyError(object): + """Multi-class cross entropy error.""" + + def __call__(self, outputs, targets): + """Calculates error function given a batch of outputs and targets. + + Args: + outputs: Array of model outputs of shape (batch_size, output_dim). + targets: Array of target outputs of shape (batch_size, output_dim). + + Returns: + Scalar error function value. + """ + return -np.mean(np.sum(targets * np.log(outputs), axis=1)) + + def grad(self, outputs, targets): + """Calculates gradient of error function with respect to outputs. + + Args: + outputs: Array of model outputs of shape (batch_size, output_dim). + targets: Array of target outputs of shape (batch_size, output_dim). + + Returns: + Gradient of error function with respect to outputs. + """ + return -(targets / outputs) / outputs.shape[0] + + def __repr__(self): + return 'CrossEntropyError' + + +class CrossEntropySoftmaxError(object): + """Multi-class cross entropy error with Softmax applied to outputs.""" + + def __call__(self, outputs, targets): + """Calculates error function given a batch of outputs and targets. + + Args: + outputs: Array of model outputs of shape (batch_size, output_dim). + targets: Array of target outputs of shape (batch_size, output_dim). + + Returns: + Scalar error function value. + """ + normOutputs = outputs - outputs.max(-1)[:, None] + logProb = normOutputs - np.log(np.sum(np.exp(normOutputs), axis=-1)[:, None]) + return -np.mean(np.sum(targets * logProb, axis=1)) + + def grad(self, outputs, targets): + """Calculates gradient of error function with respect to outputs. + + Args: + outputs: Array of model outputs of shape (batch_size, output_dim). + targets: Array of target outputs of shape (batch_size, output_dim). + + Returns: + Gradient of error function with respect to outputs. 
+ """ + probs = np.exp(outputs - outputs.max(-1)[:, None]) + probs /= probs.sum(-1)[:, None] + return (probs - targets) / outputs.shape[0] + + def __repr__(self): + return 'CrossEntropySoftmaxError' diff --git a/mlp/initialisers.py b/mlp/initialisers.py new file mode 100644 index 00000000..8c8e2526 --- /dev/null +++ b/mlp/initialisers.py @@ -0,0 +1,143 @@ +# -*- coding: utf-8 -*- +"""Parameter initialisers. + +This module defines classes to initialise the parameters in a layer. +""" + +import numpy as np +from mlp import DEFAULT_SEED + + +class ConstantInit(object): + """Constant parameter initialiser.""" + + def __init__(self, value): + """Construct a constant parameter initialiser. + + Args: + value: Value to initialise parameter to. + """ + self.value = value + + def __call__(self, shape): + return np.ones(shape=shape) * self.value + + +class UniformInit(object): + """Random uniform parameter initialiser.""" + + def __init__(self, low, high, rng=None): + """Construct a random uniform parameter initialiser. + + Args: + low: Lower bound of interval to sample from. + high: Upper bound of interval to sample from. + rng (RandomState): Seeded random number generator. + """ + self.low = low + self.high = high + if rng is None: + rng = np.random.RandomState(DEFAULT_SEED) + self.rng = rng + + def __call__(self, shape): + return self.rng.uniform(low=self.low, high=self.high, size=shape) + + +class NormalInit(object): + """Random normal parameter initialiser.""" + + def __init__(self, mean, std, rng=None): + """Construct a random uniform parameter initialiser. + + Args: + mean: Mean of distribution to sample from. + std: Standard deviation of distribution to sample from. + rng (RandomState): Seeded random number generator. 
+ """ + self.mean = mean + self.std = std + if rng is None: + rng = np.random.RandomState(DEFAULT_SEED) + self.rng = rng + + def __call__(self, shape): + return self.rng.normal(loc=self.mean, scale=self.std, size=shape) + +class GlorotUniformInit(object): + """Glorot and Bengio (2010) random uniform weights initialiser. + + Initialises a two-dimensional parameter array using the 'normalized + initialisation' scheme suggested in [1] which attempts to maintain a + roughly constant variance in the activations and backpropagated gradients + of a multi-layer model consisting of interleaved affine and logistic + sigmoidal transformation layers. + + Weights are sampled from a zero-mean uniform distribution with standard + deviation `gain * sqrt(2 / (input_dim + output_dim))` where `input_dim` and + `output_dim` are the input and output dimensions of the weight matrix + respectively. + + References: + [1]: Understanding the difficulty of training deep feedforward neural + networks, Glorot and Bengio (2010) + """ + + def __init__(self, gain=1., rng=None): + """Construct a normalised initialisation random initialiser object. + + Args: + gain: Multiplicative factor to scale initialised weights by. + Recommended value is 1 for affine layers followed by + logistic sigmoid layers (or another affine layer). + rng (RandomState): Seeded random number generator. + """ + self.gain = gain + if rng is None: + rng = np.random.RandomState(DEFAULT_SEED) + self.rng = rng + + def __call__(self, shape): + assert len(shape) == 2, ( + 'Initialiser should only be used for two dimensional arrays.') + std = self.gain * (2. / (shape[0] + shape[1]))**0.5 + half_width = 3.**0.5 * std + return self.rng.uniform(low=-half_width, high=half_width, size=shape) + + +class GlorotNormalInit(object): + """Glorot and Bengio (2010) random normal weights initialiser. 
+ + Initialises a two-dimensional parameter array using the 'normalized + initialisation' scheme suggested in [1] which attempts to maintain a + roughly constant variance in the activations and backpropagated gradients + of a multi-layer model consisting of interleaved affine and logistic + sigmoidal transformation layers. + + Weights are sampled from a zero-mean normal distribution with standard + deviation `gain * sqrt(2 / (input_dim + output_dim))` where `input_dim` and + `output_dim` are the input and output dimensions of the weight matrix + respectively. + + References: + [1]: Understanding the difficulty of training deep feedforward neural + networks, Glorot and Bengio (2010) + """ + + def __init__(self, gain=1., rng=None): + """Construct a normalised initialisation random initialiser object. + + Args: + gain: Multiplicative factor to scale initialised weights by. + Recommended value is 1 for affine layers followed by + logistic sigmoid layers (or another affine layer). + rng (RandomState): Seeded random number generator. + """ + self.gain = gain + if rng is None: + rng = np.random.RandomState(DEFAULT_SEED) + self.rng = rng + + def __call__(self, shape): + std = self.gain * (2. / (shape[0] + shape[1]))**0.5 + return self.rng.normal(loc=0., scale=std, size=shape) diff --git a/mlp/layers.py b/mlp/layers.py new file mode 100644 index 00000000..c6641c9a --- /dev/null +++ b/mlp/layers.py @@ -0,0 +1,824 @@ +# -*- coding: utf-8 -*- +"""Layer definitions. + +This module defines classes which encapsulate a single layer. + +These layers map input activations to output activation with the `fprop` +method and map gradients with respect to outputs to gradients with respect to +their inputs with the `bprop` method. + +Some layers will have learnable parameters and so will additionally define +methods for getting and setting parameters and calculating gradients with +respect to the layer parameters. 
+""" + +import numpy as np +import mlp.initialisers as init +from mlp import DEFAULT_SEED + + +class Layer(object): + """Abstract class defining the interface for a layer.""" + + def fprop(self, inputs): + """Forward propagates activations through the layer transformation. + + Args: + inputs: Array of layer inputs of shape (batch_size, input_dim). + + Returns: + outputs: Array of layer outputs of shape (batch_size, output_dim). + """ + raise NotImplementedError() + + def bprop(self, inputs, outputs, grads_wrt_outputs): + """Back propagates gradients through a layer. + + Given gradients with respect to the outputs of the layer calculates the + gradients with respect to the layer inputs. + + Args: + inputs: Array of layer inputs of shape (batch_size, input_dim). + outputs: Array of layer outputs calculated in forward pass of + shape (batch_size, output_dim). + grads_wrt_outputs: Array of gradients with respect to the layer + outputs of shape (batch_size, output_dim). + + Returns: + Array of gradients with respect to the layer inputs of shape + (batch_size, input_dim). + """ + raise NotImplementedError() + + +class LayerWithParameters(Layer): + """Abstract class defining the interface for a layer with parameters.""" + + def grads_wrt_params(self, inputs, grads_wrt_outputs): + """Calculates gradients with respect to layer parameters. + + Args: + inputs: Array of inputs to layer of shape (batch_size, input_dim). + grads_wrt_to_outputs: Array of gradients with respect to the layer + outputs of shape (batch_size, output_dim). + + Returns: + List of arrays of gradients with respect to the layer parameters + with parameter gradients appearing in same order in tuple as + returned from `get_params` method. + """ + raise NotImplementedError() + + def params_penalty(self): + """Returns the parameter dependent penalty term for this layer. + + If no parameter-dependent penalty terms are set this returns zero. 
+ """ + raise NotImplementedError() + + @property + def params(self): + """Returns a list of parameters of layer. + + Returns: + List of current parameter values. This list should be in the + corresponding order to the `values` argument to `set_params`. + """ + raise NotImplementedError() + + @params.setter + def params(self, values): + """Sets layer parameters from a list of values. + + Args: + values: List of values to set parameters to. This list should be + in the corresponding order to what is returned by `get_params`. + """ + raise NotImplementedError() + + +class StochasticLayerWithParameters(Layer): + """Specialised layer which uses a stochastic forward propagation.""" + + def __init__(self, rng=None): + """Constructs a new StochasticLayer object. + + Args: + rng (RandomState): Seeded random number generator object. + """ + if rng is None: + rng = np.random.RandomState(DEFAULT_SEED) + self.rng = rng + + def fprop(self, inputs, stochastic=True): + """Forward propagates activations through the layer transformation. + + Args: + inputs: Array of layer inputs of shape (batch_size, input_dim). + stochastic: Flag allowing different deterministic + forward-propagation mode in addition to default stochastic + forward-propagation e.g. for use at test time. If False + a deterministic forward-propagation transformation + corresponding to the expected output of the stochastic + forward-propagation is applied. + + Returns: + outputs: Array of layer outputs of shape (batch_size, output_dim). + """ + raise NotImplementedError() + + def grads_wrt_params(self, inputs, grads_wrt_outputs): + """Calculates gradients with respect to layer parameters. + + Args: + inputs: Array of inputs to layer of shape (batch_size, input_dim). + grads_wrt_to_outputs: Array of gradients with respect to the layer + outputs of shape (batch_size, output_dim). 
+ + Returns: + List of arrays of gradients with respect to the layer parameters + with parameter gradients appearing in same order in tuple as + returned from `get_params` method. + """ + raise NotImplementedError() + + def params_penalty(self): + """Returns the parameter dependent penalty term for this layer. + + If no parameter-dependent penalty terms are set this returns zero. + """ + raise NotImplementedError() + + @property + def params(self): + """Returns a list of parameters of layer. + + Returns: + List of current parameter values. This list should be in the + corresponding order to the `values` argument to `set_params`. + """ + raise NotImplementedError() + + @params.setter + def params(self, values): + """Sets layer parameters from a list of values. + + Args: + values: List of values to set parameters to. This list should be + in the corresponding order to what is returned by `get_params`. + """ + raise NotImplementedError() + + +class StochasticLayer(Layer): + """Specialised layer which uses a stochastic forward propagation.""" + + def __init__(self, rng=None): + """Constructs a new StochasticLayer object. + + Args: + rng (RandomState): Seeded random number generator object. + """ + if rng is None: + rng = np.random.RandomState(DEFAULT_SEED) + self.rng = rng + + def fprop(self, inputs, stochastic=True): + """Forward propagates activations through the layer transformation. + + Args: + inputs: Array of layer inputs of shape (batch_size, input_dim). + stochastic: Flag allowing different deterministic + forward-propagation mode in addition to default stochastic + forward-propagation e.g. for use at test time. If False + a deterministic forward-propagation transformation + corresponding to the expected output of the stochastic + forward-propagation is applied. + + Returns: + outputs: Array of layer outputs of shape (batch_size, output_dim). 
+ """ + raise NotImplementedError() + + def bprop(self, inputs, outputs, grads_wrt_outputs): + """Back propagates gradients through a layer. + + Given gradients with respect to the outputs of the layer calculates the + gradients with respect to the layer inputs. This should correspond to + default stochastic forward-propagation. + + Args: + inputs: Array of layer inputs of shape (batch_size, input_dim). + outputs: Array of layer outputs calculated in forward pass of + shape (batch_size, output_dim). + grads_wrt_outputs: Array of gradients with respect to the layer + outputs of shape (batch_size, output_dim). + + Returns: + Array of gradients with respect to the layer inputs of shape + (batch_size, input_dim). + """ + raise NotImplementedError() + + +class AffineLayer(LayerWithParameters): + """Layer implementing an affine tranformation of its inputs. + + This layer is parameterised by a weight matrix and bias vector. + """ + + def __init__(self, input_dim, output_dim, + weights_initialiser=init.UniformInit(-0.1, 0.1), + biases_initialiser=init.ConstantInit(0.), + weights_penalty=None, biases_penalty=None): + """Initialises a parameterised affine layer. + + Args: + input_dim (int): Dimension of inputs to the layer. + output_dim (int): Dimension of the layer outputs. + weights_initialiser: Initialiser for the weight parameters. + biases_initialiser: Initialiser for the bias parameters. + weights_penalty: Weights-dependent penalty term (regulariser) or + None if no regularisation is to be applied to the weights. + biases_penalty: Biases-dependent penalty term (regulariser) or + None if no regularisation is to be applied to the biases. 
+ """ + self.input_dim = input_dim + self.output_dim = output_dim + self.weights = weights_initialiser((self.output_dim, self.input_dim)) + self.biases = biases_initialiser(self.output_dim) + self.weights_penalty = weights_penalty + self.biases_penalty = biases_penalty + + def fprop(self, inputs): + """Forward propagates activations through the layer transformation. + + For inputs `x`, outputs `y`, weights `W` and biases `b` the layer + corresponds to `y = W.dot(x) + b`. + + Args: + inputs: Array of layer inputs of shape (batch_size, input_dim). + + Returns: + outputs: Array of layer outputs of shape (batch_size, output_dim). + """ + return self.weights.dot(inputs.T).T + self.biases + + def bprop(self, inputs, outputs, grads_wrt_outputs): + """Back propagates gradients through a layer. + + Given gradients with respect to the outputs of the layer calculates the + gradients with respect to the layer inputs. + + Args: + inputs: Array of layer inputs of shape (batch_size, input_dim). + outputs: Array of layer outputs calculated in forward pass of + shape (batch_size, output_dim). + grads_wrt_outputs: Array of gradients with respect to the layer + outputs of shape (batch_size, output_dim). + + Returns: + Array of gradients with respect to the layer inputs of shape + (batch_size, input_dim). + """ + return grads_wrt_outputs.dot(self.weights) + + def grads_wrt_params(self, inputs, grads_wrt_outputs): + """Calculates gradients with respect to layer parameters. + + Args: + inputs: array of inputs to layer of shape (batch_size, input_dim) + grads_wrt_to_outputs: array of gradients with respect to the layer + outputs of shape (batch_size, output_dim) + + Returns: + list of arrays of gradients with respect to the layer parameters + `[grads_wrt_weights, grads_wrt_biases]`. 
+ """ + + grads_wrt_weights = np.dot(grads_wrt_outputs.T, inputs) + grads_wrt_biases = np.sum(grads_wrt_outputs, axis=0) + + if self.weights_penalty is not None: + grads_wrt_weights += self.weights_penalty.grad(parameter=self.weights) + + if self.biases_penalty is not None: + grads_wrt_biases += self.biases_penalty.grad(parameter=self.biases) + + return [grads_wrt_weights, grads_wrt_biases] + + def params_penalty(self): + """Returns the parameter dependent penalty term for this layer. + + If no parameter-dependent penalty terms are set this returns zero. + """ + params_penalty = 0 + if self.weights_penalty is not None: + params_penalty += self.weights_penalty(self.weights) + if self.biases_penalty is not None: + params_penalty += self.biases_penalty(self.biases) + return params_penalty + + @property + def params(self): + """A list of layer parameter values: `[weights, biases]`.""" + return [self.weights, self.biases] + + @params.setter + def params(self, values): + self.weights = values[0] + self.biases = values[1] + + def __repr__(self): + return 'AffineLayer(input_dim={0}, output_dim={1})'.format( + self.input_dim, self.output_dim) + + +class SigmoidLayer(Layer): + """Layer implementing an element-wise logistic sigmoid transformation.""" + + def fprop(self, inputs): + """Forward propagates activations through the layer transformation. + + For inputs `x` and outputs `y` this corresponds to + `y = 1 / (1 + exp(-x))`. + + Args: + inputs: Array of layer inputs of shape (batch_size, input_dim). + + Returns: + outputs: Array of layer outputs of shape (batch_size, output_dim). + """ + return 1. / (1. + np.exp(-inputs)) + + def bprop(self, inputs, outputs, grads_wrt_outputs): + """Back propagates gradients through a layer. + + Given gradients with respect to the outputs of the layer calculates the + gradients with respect to the layer inputs. + + Args: + inputs: Array of layer inputs of shape (batch_size, input_dim). 
+ outputs: Array of layer outputs calculated in forward pass of + shape (batch_size, output_dim). + grads_wrt_outputs: Array of gradients with respect to the layer + outputs of shape (batch_size, output_dim). + + Returns: + Array of gradients with respect to the layer inputs of shape + (batch_size, input_dim). + """ + return grads_wrt_outputs * outputs * (1. - outputs) + + def __repr__(self): + return 'SigmoidLayer' + + +class ConvolutionalLayer(LayerWithParameters): + """Layer implementing a 2D convolution-based transformation of its inputs. + The layer is parameterised by a set of 2D convolutional kernels, a four + dimensional array of shape + (num_output_channels, num_input_channels, kernel_height, kernel_dim_2) + and a bias vector, a one dimensional array of shape + (num_output_channels,) + i.e. one shared bias per output channel. + Assuming no-padding is applied to the inputs so that outputs are only + calculated for positions where the kernel filters fully overlap with the + inputs, and that unit strides are used the outputs will have spatial extent + output_height = input_height - kernel_height + 1 + output_width = input_width - kernel_width + 1 + """ + + def __init__(self, num_input_channels, num_output_channels, + input_height, input_width, + kernel_height, kernel_width, + kernels_init=init.UniformInit(-0.01, 0.01), + biases_init=init.ConstantInit(0.), + kernels_penalty=None, biases_penalty=None): + """Initialises a parameterised convolutional layer. + Args: + num_input_channels (int): Number of channels in inputs to + layer (this may be number of colour channels in the input + images if used as the first layer in a model, or the + number of output channels, a.k.a. feature maps, from a + a previous convolutional layer). + num_output_channels (int): Number of channels in outputs + from the layer, a.k.a. number of feature maps. + input_height (int): Size of first input dimension of each 2D + channel of inputs. 
+ input_width (int): Size of second input dimension of each 2D + channel of inputs. + kernel_height (int): Size of first dimension of each 2D channel of + kernels. + kernel_width (int): Size of second dimension of each 2D channel of + kernels. + kernels_intialiser: Initialiser for the kernel parameters. + biases_initialiser: Initialiser for the bias parameters. + kernels_penalty: Kernel-dependent penalty term (regulariser) or + None if no regularisation is to be applied to the kernels. + biases_penalty: Biases-dependent penalty term (regulariser) or + None if no regularisation is to be applied to the biases. + """ + self.num_input_channels = num_input_channels + self.num_output_channels = num_output_channels + self.input_height = input_height + self.input_width = input_width + self.kernel_height = kernel_height + self.kernel_width = kernel_width + self.kernels_init = kernels_init + self.biases_init = biases_init + self.kernels_shape = ( + num_output_channels, num_input_channels, kernel_height, kernel_width + ) + self.inputs_shape = ( + None, num_input_channels, input_height, input_width + ) + self.kernels = self.kernels_init(self.kernels_shape) + self.biases = self.biases_init(num_output_channels) + self.kernels_penalty = kernels_penalty + self.biases_penalty = biases_penalty + + self.cache = None + + def fprop(self, inputs): + """Forward propagates activations through the layer transformation. + For inputs `x`, outputs `y`, kernels `K` and biases `b` the layer + corresponds to `y = conv2d(x, K) + b`. + Args: + inputs: Array of layer inputs of shape (batch_size, num_input_channels, image_height, image_width). + Returns: + outputs: Array of layer outputs of shape (batch_size, num_output_channels, output_height, output_width). + """ + raise NotImplementedError + + def bprop(self, inputs, outputs, grads_wrt_outputs): + """Back propagates gradients through a layer. 
+ Given gradients with respect to the outputs of the layer calculates the + gradients with respect to the layer inputs. + Args: + inputs: Array of layer inputs of shape + (batch_size, num_input_channels, input_height, input_width). + outputs: Array of layer outputs calculated in forward pass of + shape + (batch_size, num_output_channels, output_height, output_width). + grads_wrt_outputs: Array of gradients with respect to the layer + outputs of shape + (batch_size, num_output_channels, output_height, output_width). + Returns: + Array of gradients with respect to the layer inputs of shape + (batch_size, num_input_channels, input_height, input_width). + """ + # TODO: not implemented yet; the intended first step is to pad grads_wrt_outputs. + raise NotImplementedError + + def grads_wrt_params(self, inputs, grads_wrt_outputs): + """Calculates gradients with respect to layer parameters. + Args: + inputs: Array of inputs to layer of shape + (batch_size, num_input_channels, input_height, input_width). + grads_wrt_outputs: Array of gradients with respect to the layer + outputs of shape + (batch_size, num_output_channels, output_height, output_width). + Returns: + List of arrays of gradients with respect to the layer parameters + `[grads_wrt_kernels, grads_wrt_biases]`. + Raises: + NotImplementedError: Always; this method is a placeholder. + """ + # TODO: not implemented yet; the intended first step is to get inputs_col from the previous fprop. + raise NotImplementedError + + def params_penalty(self): + """Returns the parameter dependent penalty term for this layer. + If no parameter-dependent penalty terms are set this returns zero.
+ """ + params_penalty = 0 + if self.kernels_penalty is not None: + params_penalty += self.kernels_penalty(self.kernels) + if self.biases_penalty is not None: + params_penalty += self.biases_penalty(self.biases) + return params_penalty + + @property + def params(self): + """A list of layer parameter values: `[kernels, biases]`.""" + return [self.kernels, self.biases] + + @params.setter + def params(self, values): + self.kernels = values[0] + self.biases = values[1] + + def __repr__(self): + return ( + 'ConvolutionalLayer(\n' + ' num_input_channels={0}, num_output_channels={1},\n' + ' input_height={2}, input_width={3},\n' + ' kernel_height={4}, kernel_width={5}\n' + ')' + .format(self.num_input_channels, self.num_output_channels, + self.input_height, self.input_width, self.kernel_height, + self.kernel_width) + ) + + +class ReluLayer(Layer): + """Layer implementing an element-wise rectified linear transformation.""" + + def fprop(self, inputs): + """Forward propagates activations through the layer transformation. + + For inputs `x` and outputs `y` this corresponds to `y = max(0, x)`. + + Args: + inputs: Array of layer inputs of shape (batch_size, input_dim). + + Returns: + outputs: Array of layer outputs of shape (batch_size, output_dim). + """ + return np.maximum(inputs, 0.) + + def bprop(self, inputs, outputs, grads_wrt_outputs): + """Back propagates gradients through a layer. + + Given gradients with respect to the outputs of the layer calculates the + gradients with respect to the layer inputs. + + Args: + inputs: Array of layer inputs of shape (batch_size, input_dim). + outputs: Array of layer outputs calculated in forward pass of + shape (batch_size, output_dim). + grads_wrt_outputs: Array of gradients with respect to the layer + outputs of shape (batch_size, output_dim). + + Returns: + Array of gradients with respect to the layer inputs of shape + (batch_size, input_dim). 
+ """ + return (outputs > 0) * grads_wrt_outputs + + def __repr__(self): + return 'ReluLayer' + + +class TanhLayer(Layer): + """Layer implementing an element-wise hyperbolic tangent transformation.""" + + def fprop(self, inputs): + """Forward propagates activations through the layer transformation. + + For inputs `x` and outputs `y` this corresponds to `y = tanh(x)`. + + Args: + inputs: Array of layer inputs of shape (batch_size, input_dim). + + Returns: + outputs: Array of layer outputs of shape (batch_size, output_dim). + """ + return np.tanh(inputs) + + def bprop(self, inputs, outputs, grads_wrt_outputs): + """Back propagates gradients through a layer. + + Given gradients with respect to the outputs of the layer calculates the + gradients with respect to the layer inputs. + + Args: + inputs: Array of layer inputs of shape (batch_size, input_dim). + outputs: Array of layer outputs calculated in forward pass of + shape (batch_size, output_dim). + grads_wrt_outputs: Array of gradients with respect to the layer + outputs of shape (batch_size, output_dim). + + Returns: + Array of gradients with respect to the layer inputs of shape + (batch_size, input_dim). + """ + return (1. - outputs ** 2) * grads_wrt_outputs + + def __repr__(self): + return 'TanhLayer' + + +class SoftmaxLayer(Layer): + """Layer implementing a softmax transformation.""" + + def fprop(self, inputs): + """Forward propagates activations through the layer transformation. + + For inputs `x` and outputs `y` this corresponds to + + `y = exp(x) / sum(exp(x))`. + + Args: + inputs: Array of layer inputs of shape (batch_size, input_dim). + + Returns: + outputs: Array of layer outputs of shape (batch_size, output_dim). 
+ """ + # subtract max inside exponential to improve numerical stability - + # when we divide through by sum this term cancels + exp_inputs = np.exp(inputs - inputs.max(-1)[:, None]) + return exp_inputs / exp_inputs.sum(-1)[:, None] + + def bprop(self, inputs, outputs, grads_wrt_outputs): + """Back propagates gradients through a layer. + + Given gradients with respect to the outputs of the layer calculates the + gradients with respect to the layer inputs. + + Args: + inputs: Array of layer inputs of shape (batch_size, input_dim). + outputs: Array of layer outputs calculated in forward pass of + shape (batch_size, output_dim). + grads_wrt_outputs: Array of gradients with respect to the layer + outputs of shape (batch_size, output_dim). + + Returns: + Array of gradients with respect to the layer inputs of shape + (batch_size, input_dim). + """ + return (outputs * (grads_wrt_outputs - + (grads_wrt_outputs * outputs).sum(-1)[:, None])) + + def __repr__(self): + return 'SoftmaxLayer' + + +class RadialBasisFunctionLayer(Layer): + """Layer implementing projection to a grid of radial basis functions.""" + + def __init__(self, grid_dim, intervals=[[0., 1.]]): + """Creates a radial basis function layer object. + + Args: + grid_dim: Integer specifying how many basis functions to use in + grid across input space per dimension (so total number of + basis functions will be grid_dim**input_dim) + intervals: List of intervals (two element lists or tuples) + specifying extents of axis-aligned region in input-space to + tile basis functions in grid across. For example for a 2D input + space spanning [0, 1] x [0, 1] use intervals=[[0, 1], [0, 1]]. + """ + # Stored because __repr__ formats self.grid_dim. + self.grid_dim = grid_dim + self.centres = np.array(np.meshgrid(*[ + np.linspace(low, high, grid_dim) for (low, high) in intervals]) + ).reshape((len(intervals), -1)) + self.scales = np.array([ + [(high - low) * 1.
/ grid_dim] for (low, high) in intervals]) + + def fprop(self, inputs): + """Forward propagates activations through the layer transformation. + + Args: + inputs: Array of layer inputs of shape (batch_size, input_dim). + + Returns: + outputs: Array of layer outputs of shape (batch_size, output_dim). + """ + return np.exp(-(inputs[..., None] - self.centres[None, ...]) ** 2 / + self.scales ** 2).reshape((inputs.shape[0], -1)) + + def bprop(self, inputs, outputs, grads_wrt_outputs): + """Back propagates gradients through a layer. + + Given gradients with respect to the outputs of the layer calculates the + gradients with respect to the layer inputs. + + Args: + inputs: Array of layer inputs of shape (batch_size, input_dim). + outputs: Array of layer outputs calculated in forward pass of + shape (batch_size, output_dim). + grads_wrt_outputs: Array of gradients with respect to the layer + outputs of shape (batch_size, output_dim). + + Returns: + Array of gradients with respect to the layer inputs of shape + (batch_size, input_dim). + """ + num_basis = self.centres.shape[1] + return -2 * ( + ((inputs[..., None] - self.centres[None, ...]) / self.scales ** 2) * + grads_wrt_outputs.reshape((inputs.shape[0], -1, num_basis)) + ).sum(-1) + + def __repr__(self): + return 'RadialBasisFunctionLayer(grid_dim={0})'.format(self.grid_dim) + + +class DropoutLayer(StochasticLayer): + """Layer which stochastically drops input dimensions in its output.""" + + def __init__(self, rng=None, incl_prob=0.5, share_across_batch=True): + """Construct a new dropout layer. + + Args: + rng (RandomState): Seeded random number generator. + incl_prob: Scalar value in (0, 1] specifying the probability of + each input dimension being included in the output. + share_across_batch: Whether to use same dropout mask across + all inputs in a batch or use per input masks. + """ + super(DropoutLayer, self).__init__(rng) + assert incl_prob > 0. and incl_prob <= 1. 
+ self.incl_prob = incl_prob + self.share_across_batch = share_across_batch + # NOTE(review): presumably StochasticLayer.__init__ already stores the rng + # (with a fallback when it is None) - confirm. Only override it for an + # explicit rng so that a None argument cannot clobber that attribute. + if rng is not None: + self.rng = rng + + def fprop(self, inputs, stochastic=True): + """Forward propagates activations through the layer transformation. + + Args: + inputs: Array of layer inputs of shape (batch_size, input_dim). + stochastic: Flag allowing different deterministic + forward-propagation mode in addition to default stochastic + forward-propagation e.g. for use at test time. If False + a deterministic forward-propagation transformation + corresponding to the expected output of the stochastic + forward-propagation is applied. + + Returns: + outputs: Array of layer outputs of shape (batch_size, output_dim). + """ + if stochastic: + mask_shape = (1,) + inputs.shape[1:] if self.share_across_batch else inputs.shape + self._mask = (self.rng.uniform(size=mask_shape) < self.incl_prob) + return inputs * self._mask + else: + return inputs * self.incl_prob + + def bprop(self, inputs, outputs, grads_wrt_outputs): + """Back propagates gradients through a layer. + + Given gradients with respect to the outputs of the layer calculates the + gradients with respect to the layer inputs. This should correspond to + default stochastic forward-propagation. + + Args: + inputs: Array of layer inputs of shape (batch_size, input_dim). + outputs: Array of layer outputs calculated in forward pass of + shape (batch_size, output_dim). + grads_wrt_outputs: Array of gradients with respect to the layer + outputs of shape (batch_size, output_dim). + + Returns: + Array of gradients with respect to the layer inputs of shape + (batch_size, input_dim). + """ + return grads_wrt_outputs * self._mask + + def __repr__(self): + return 'DropoutLayer(incl_prob={0:.1f})'.format(self.incl_prob) + + +class ReshapeLayer(Layer): + """Layer which reshapes dimensions of inputs.""" + + def __init__(self, output_shape=None): + """Create a new reshape layer object.
+ + Args: + output_shape: Tuple specifying shape each input in batch should + be reshaped to in outputs. This **excludes** the batch size + so the shape of the final output array will be + (batch_size, ) + output_shape + Similarly to numpy.reshape, one shape dimension can be -1. In + this case, the value is inferred from the size of the input + array and remaining dimensions. The shape specified must be + compatible with the input array shape - i.e. the total number + of values in the array cannot be changed. If set to `None` the + output shape will be set to + (batch_size, -1) + which will flatten all the inputs to vectors. + """ + self.output_shape = (-1,) if output_shape is None else output_shape + + def fprop(self, inputs): + """Forward propagates activations through the layer transformation. + + Args: + inputs: Array of layer inputs of shape (batch_size, input_dim). + + Returns: + outputs: Array of layer outputs of shape (batch_size, output_dim). + """ + return inputs.reshape((inputs.shape[0],) + self.output_shape) + + def bprop(self, inputs, outputs, grads_wrt_outputs): + """Back propagates gradients through a layer. + + Given gradients with respect to the outputs of the layer calculates the + gradients with respect to the layer inputs. + + Args: + inputs: Array of layer inputs of shape (batch_size, input_dim). + outputs: Array of layer outputs calculated in forward pass of + shape (batch_size, output_dim). + grads_wrt_outputs: Array of gradients with respect to the layer + outputs of shape (batch_size, output_dim). + + Returns: + Array of gradients with respect to the layer inputs of shape + (batch_size, input_dim). 
+ """ + return grads_wrt_outputs.reshape(inputs.shape) + + def __repr__(self): + return 'ReshapeLayer(output_shape={0})'.format(self.output_shape) \ No newline at end of file diff --git a/mlp/learning_rules.py b/mlp/learning_rules.py new file mode 100644 index 00000000..52f34ccd --- /dev/null +++ b/mlp/learning_rules.py @@ -0,0 +1,388 @@ +# -*- coding: utf-8 -*- +"""Learning rules. + +This module contains classes implementing gradient based learning rules. +""" + +import numpy as np + + +class GradientDescentLearningRule(object): + """Simple (stochastic) gradient descent learning rule. + + For a scalar error function `E(p[0], p_[1] ... )` of some set of + potentially multidimensional parameters this attempts to find a local + minimum of the loss function by applying updates to each parameter of the + form + + p[i] := p[i] - learning_rate * dE/dp[i] + + With `learning_rate` a positive scaling parameter. + + The error function used in successive applications of these updates may be + a stochastic estimator of the true error function (e.g. when the error with + respect to only a subset of data-points is calculated) in which case this + will correspond to a stochastic gradient descent learning rule. + """ + + def __init__(self, learning_rate=1e-3): + """Creates a new learning rule object. + + Args: + learning_rate: A postive scalar to scale gradient updates to the + parameters by. This needs to be carefully set - if too large + the learning dynamic will be unstable and may diverge, while + if set too small learning will proceed very slowly. + + """ + assert learning_rate > 0., 'learning_rate should be positive.' + self.learning_rate = learning_rate + + def initialise(self, params): + """Initialises the state of the learning rule for a set or parameters. + + This must be called before `update_params` is first called. + + Args: + params: A list of the parameters to be optimised. Note these will + be updated *in-place* to avoid reallocating arrays on each + update. 
+ """ + self.params = params + + def reset(self): + """Resets any additional state variables to their intial values. + + For this learning rule there are no additional state variables so we + do nothing here. + """ + pass + + def update_params(self, grads_wrt_params): + """Applies a single gradient descent update to all parameters. + + All parameter updates are performed using in-place operations and so + nothing is returned. + + Args: + grads_wrt_params: A list of gradients of the scalar loss function + with respect to each of the parameters passed to `initialise` + previously, with this list expected to be in the same order. + """ + for param, grad in zip(self.params, grads_wrt_params): + param -= self.learning_rate * grad + + +class MomentumLearningRule(GradientDescentLearningRule): + """Gradient descent with momentum learning rule. + + This extends the basic gradient learning rule by introducing extra + momentum state variables for each parameter. These can help the learning + dynamic help overcome shallow local minima and speed convergence when + making multiple successive steps in a similar direction in parameter space. + + For parameter p[i] and corresponding momentum m[i] the updates for a + scalar loss function `L` are of the form + + m[i] := mom_coeff * m[i] - learning_rate * dL/dp[i] + p[i] := p[i] + m[i] + + with `learning_rate` a positive scaling parameter for the gradient updates + and `mom_coeff` a value in [0, 1] that determines how much 'friction' there + is the system and so how quickly previous momentum contributions decay. + """ + + def __init__(self, learning_rate=1e-3, mom_coeff=0.9): + """Creates a new learning rule object. + + Args: + learning_rate: A postive scalar to scale gradient updates to the + parameters by. This needs to be carefully set - if too large + the learning dynamic will be unstable and may diverge, while + if set too small learning will proceed very slowly. + mom_coeff: A scalar in the range [0, 1] inclusive. 
This determines + the contribution of the previous momentum value to the value + after each update. If equal to 0 the momentum is set to exactly + the negative scaled gradient each update and so this rule + collapses to standard gradient descent. If equal to 1 the + momentum will just be decremented by the scaled gradient at + each update. This is equivalent to simulating the dynamic in + a frictionless system. Due to energy conservation the loss + of 'potential energy' as the dynamics moves down the loss + function surface will lead to an increasingly large 'kinetic + energy' and so speed, meaning the updates will become + increasingly large, potentially unstably so. Typically a value + less than but close to 1 will avoid these issues and cause the + dynamic to converge to a local minimum where the gradients are + by definition zero. + """ + super(MomentumLearningRule, self).__init__(learning_rate) + assert mom_coeff >= 0. and mom_coeff <= 1., ( + 'mom_coeff should be in the range [0, 1].' + ) + self.mom_coeff = mom_coeff + + def initialise(self, params): + """Initialises the state of the learning rule for a set of parameters. + + This must be called before `update_params` is first called. + + Args: + params: A list of the parameters to be optimised. Note these will + be updated *in-place* to avoid reallocating arrays on each + update. + """ + super(MomentumLearningRule, self).initialise(params) + self.moms = [] + for param in self.params: + self.moms.append(np.zeros_like(param)) + + def reset(self): + """Resets any additional state variables to their initial values. + + For this learning rule this corresponds to zeroing all the momenta. + """ + # Iterate the arrays directly: zip(self.moms) would yield 1-tuples, and + # multiplying a tuple by a float raises TypeError instead of zeroing. + for mom in self.moms: + mom *= 0. + + def update_params(self, grads_wrt_params): + """Applies a single update to all parameters. + + All parameter updates are performed using in-place operations and so + nothing is returned.
+ + Args: + grads_wrt_params: A list of gradients of the scalar loss function + with respect to each of the parameters passed to `initialise` + previously, with this list expected to be in the same order. + """ + for param, mom, grad in zip(self.params, self.moms, grads_wrt_params): + mom *= self.mom_coeff + mom -= self.learning_rate * grad + param += mom + + +class AdamLearningRule(GradientDescentLearningRule): + """Adaptive moments (Adam) learning rule. + First-order gradient-descent based learning rule which uses adaptive + estimates of first and second moments of the parameter gradients to + calculate the parameter updates. + References: + [1]: Adam: a method for stochastic optimisation + Kingma and Ba, 2015 + """ + + def __init__(self, learning_rate=1e-3, beta_1=0.9, beta_2=0.999, + epsilon=1e-8): + """Creates a new learning rule object. + Args: + learning_rate: A postive scalar to scale gradient updates to the + parameters by. This needs to be carefully set - if too large + the learning dynamic will be unstable and may diverge, while + if set too small learning will proceed very slowly. + beta_1: Exponential decay rate for gradient first moment estimates. + This should be a scalar value in [0, 1]. The running gradient + first moment estimate is calculated using + `m_1 = beta_1 * m_1_prev + (1 - beta_1) * g` + where `m_1_prev` is the previous estimate and `g` the current + parameter gradients. + beta_2: Exponential decay rate for gradient second moment + estimates. This should be a scalar value in [0, 1]. The run + gradient second moment estimate is calculated using + `m_2 = beta_2 * m_2_prev + (1 - beta_2) * g**2` + where `m_2_prev` is the previous estimate and `g` the current + parameter gradients. + epsilon: 'Softening' parameter to stop updates diverging when + second moment estimates are close to zero. Should be set to + a small positive value. + """ + super(AdamLearningRule, self).__init__(learning_rate) + assert beta_1 >= 0. 
and beta_1 <= 1., 'beta_1 should be in [0, 1].' + assert beta_2 >= 0. and beta_2 <= 1., 'beta_2 should be in [0, 1].' + assert epsilon > 0., 'epsilon should be > 0.' + self.beta_1 = beta_1 + self.beta_2 = beta_2 + self.epsilon = epsilon + + def initialise(self, params): + """Initialises the state of the learning rule for a set of parameters. + This must be called before `update_params` is first called. + Args: + params: A list of the parameters to be optimised. Note these will + be updated *in-place* to avoid reallocating arrays on each + update. + """ + super(AdamLearningRule, self).initialise(params) + self.moms_1 = [] + for param in self.params: + self.moms_1.append(np.zeros_like(param)) + self.moms_2 = [] + for param in self.params: + self.moms_2.append(np.zeros_like(param)) + self.step_count = 0 + + def reset(self): + """Resets any additional state variables to their initial values. + For this learning rule this corresponds to zeroing the estimates of + the first and second moments of the gradients. + """ + for mom_1, mom_2 in zip(self.moms_1, self.moms_2): + mom_1 *= 0. + mom_2 *= 0. + self.step_count = 0 + + def update_params(self, grads_wrt_params): + """Applies a single update to all parameters. + All parameter updates are performed using in-place operations and so + nothing is returned. + Args: + grads_wrt_params: A list of gradients of the scalar loss function + with respect to each of the parameters passed to `initialise` + previously, with this list expected to be in the same order. + """ + for param, mom_1, mom_2, grad in zip( + self.params, self.moms_1, self.moms_2, grads_wrt_params): + mom_1 *= self.beta_1 + mom_1 += (1. - self.beta_1) * grad + mom_2 *= self.beta_2 + mom_2 += (1. - self.beta_2) * grad ** 2 + alpha_t = ( + self.learning_rate * + (1. - self.beta_2 ** (self.step_count + 1)) ** 0.5 / + (1.
- self.beta_1 ** (self.step_count + 1)) + ) + param -= alpha_t * mom_1 / (mom_2 ** 0.5 + self.epsilon) + self.step_count += 1 + + +class AdaGradLearningRule(GradientDescentLearningRule): + """Adaptive gradients (AdaGrad) learning rule. + First-order gradient-descent based learning rule which normalises gradient + updates by a running sum of the past squared gradients. + References: + [1]: Adaptive Subgradient Methods for Online Learning and Stochastic + Optimization. Duchi, Haxan and Singer, 2011 + """ + + def __init__(self, learning_rate=1e-2, epsilon=1e-8): + """Creates a new learning rule object. + Args: + learning_rate: A postive scalar to scale gradient updates to the + parameters by. This needs to be carefully set - if too large + the learning dynamic will be unstable and may diverge, while + if set too small learning will proceed very slowly. + epsilon: 'Softening' parameter to stop updates diverging when + sums of squared gradients are close to zero. Should be set to + a small positive value. + """ + super(AdaGradLearningRule, self).__init__(learning_rate) + assert epsilon > 0., 'epsilon should be > 0.' + self.epsilon = epsilon + + def initialise(self, params): + """Initialises the state of the learning rule for a set or parameters. + This must be called before `update_params` is first called. + Args: + params: A list of the parameters to be optimised. Note these will + be updated *in-place* to avoid reallocating arrays on each + update. + """ + super(AdaGradLearningRule, self).initialise(params) + self.sum_sq_grads = [] + for param in self.params: + self.sum_sq_grads.append(np.zeros_like(param)) + + def reset(self): + """Resets any additional state variables to their initial values. + For this learning rule this corresponds to zeroing all the sum of + squared gradient states. + """ + for sum_sq_grad in self.sum_sq_grads: + sum_sq_grad *= 0. + + def update_params(self, grads_wrt_params): + """Applies a single update to all parameters. 
+ All parameter updates are performed using in-place operations and so + nothing is returned. + Args: + grads_wrt_params: A list of gradients of the scalar loss function + with respect to each of the parameters passed to `initialise` + previously, with this list expected to be in the same order. + """ + for param, sum_sq_grad, grad in zip( + self.params, self.sum_sq_grads, grads_wrt_params): + sum_sq_grad += grad ** 2 + param -= (self.learning_rate * grad / + (sum_sq_grad + self.epsilon) ** 0.5) + + +class RMSPropLearningRule(GradientDescentLearningRule): + """Root mean squared gradient normalised learning rule (RMSProp). + First-order gradient-descent based learning rule which normalises gradient + updates by a exponentially smoothed estimate of the gradient second + moments. + References: + [1]: Neural Networks for Machine Learning: Lecture 6a slides + University of Toronto,Computer Science Course CSC321 + http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf + """ + + def __init__(self, learning_rate=1e-3, beta=0.9, epsilon=1e-8): + """Creates a new learning rule object. + Args: + learning_rate: A postive scalar to scale gradient updates to the + parameters by. This needs to be carefully set - if too large + the learning dynamic will be unstable and may diverge, while + if set too small learning will proceed very slowly. + beta: Exponential decay rate for gradient second moment + estimates. This should be a scalar value in [0, 1]. The running + gradient second moment estimate is calculated using + `m_2 = beta * m_2_prev + (1 - beta) * g**2` + where `m_2_prev` is the previous estimate and `g` the current + parameter gradients. + epsilon: 'Softening' parameter to stop updates diverging when + gradient second moment estimates are close to zero. Should be + set to a small positive value. + """ + super(RMSPropLearningRule, self).__init__(learning_rate) + assert beta >= 0. and beta <= 1., 'beta should be in [0, 1].' 
+ assert epsilon > 0., 'epsilon should be > 0.' + self.beta = beta + self.epsilon = epsilon + + def initialise(self, params): + """Initialises the state of the learning rule for a set or parameters. + This must be called before `update_params` is first called. + Args: + params: A list of the parameters to be optimised. Note these will + be updated *in-place* to avoid reallocating arrays on each + update. + """ + super(RMSPropLearningRule, self).initialise(params) + self.moms_2 = [] + for param in self.params: + self.moms_2.append(np.zeros_like(param)) + + def reset(self): + """Resets any additional state variables to their initial values. + For this learning rule this corresponds to zeroing all gradient + second moment estimates. + """ + for mom_2 in self.moms_2: + mom_2 *= 0. + + def update_params(self, grads_wrt_params): + """Applies a single update to all parameters. + All parameter updates are performed using in-place operations and so + nothing is returned. + Args: + grads_wrt_params: A list of gradients of the scalar loss function + with respect to each of the parameters passed to `initialise` + previously, with this list expected to be in the same order. + """ + for param, mom_2, grad in zip( + self.params, self.moms_2, grads_wrt_params): + mom_2 *= self.beta + mom_2 += (1. - self.beta) * grad ** 2 + param -= (self.learning_rate * grad / + (mom_2 + self.epsilon) ** 0.5) diff --git a/mlp/models.py b/mlp/models.py new file mode 100644 index 00000000..b292cf4c --- /dev/null +++ b/mlp/models.py @@ -0,0 +1,145 @@ +# -*- coding: utf-8 -*- +"""Model definitions. + +This module implements objects encapsulating learnable models of input-output +relationships. The model objects implement methods for forward propagating +the inputs through the transformation(s) defined by the model to produce +outputs (and intermediate states) and for calculating gradients of scalar +functions of the outputs with respect to the model parameters. 
+""" + +from mlp.layers import LayerWithParameters, StochasticLayer, StochasticLayerWithParameters + + +class SingleLayerModel(object): + """A model consisting of a single transformation layer.""" + + def __init__(self, layer): + """Create a new single layer model instance. + + Args: + layer: The layer object defining the model architecture. + """ + self.layer = layer + + @property + def params(self): + """A list of all of the parameters of the model.""" + return self.layer.params + + def fprop(self, inputs, evaluation=False): + """Calculate the model outputs corresponding to a batch of inputs. + + Args: + inputs: Batch of inputs to the model. + evaluation: If True a stochastic layer is run in its deterministic + mode (`stochastic=False`); ignored for other layers. + + Returns: + List which is a concatenation of the model inputs and model + outputs, this being done for consistency of the interface with + multi-layer models for which `fprop` returns a list of + activations through all intermediate layers of the model and + including the inputs and outputs. + """ + # Mirror MultipleLayerModel.fprop: only stochastic layers accept the + # `stochastic` keyword, so pass it to those alone. + if isinstance(self.layer, (StochasticLayer, StochasticLayerWithParameters)): + outputs = self.layer.fprop(inputs, stochastic=not evaluation) + else: + outputs = self.layer.fprop(inputs) + return [inputs, outputs] + + def grads_wrt_params(self, activations, grads_wrt_outputs): + """Calculates gradients with respect to the model parameters. + + Args: + activations: List of all activations from forward pass through + model using `fprop`. + grads_wrt_outputs: Gradient with respect to the model outputs of + the scalar function parameter gradients are being calculated + for. + + Returns: + List of gradients of the scalar function with respect to all model + parameters. + """ + return self.layer.grads_wrt_params(activations[0], grads_wrt_outputs) + + def __repr__(self): + return 'SingleLayerModel(' + str(self.layer) + ')' + + +class MultipleLayerModel(object): + """A model consisting of multiple layers applied sequentially.""" + + def __init__(self, layers): + """Create a new multiple layer model instance. + + Args: + layers: List of the layer objects defining the model in the + order they should be applied from inputs to outputs.
+ """ + self.layers = layers + + @property + def params(self): + """A list of all of the parameters of the model.""" + params = [] + for layer in self.layers: + if isinstance(layer, LayerWithParameters) or isinstance(layer, StochasticLayerWithParameters): + params += layer.params + return params + + def fprop(self, inputs, evaluation=False): + """Forward propagates a batch of inputs through the model. + + Args: + inputs: Batch of inputs to the model. + + Returns: + List of the activations at the output of all layers of the model + plus the inputs (to the first layer) as the first element. The + last element of the list corresponds to the model outputs. + """ + activations = [inputs] + for i, layer in enumerate(self.layers): + if evaluation: + if issubclass(type(self.layers[i]), StochasticLayer) or issubclass(type(self.layers[i]), + StochasticLayerWithParameters): + current_activations = self.layers[i].fprop(activations[i], stochastic=False) + else: + current_activations = self.layers[i].fprop(activations[i]) + else: + if issubclass(type(self.layers[i]), StochasticLayer) or issubclass(type(self.layers[i]), + StochasticLayerWithParameters): + current_activations = self.layers[i].fprop(activations[i], stochastic=True) + else: + current_activations = self.layers[i].fprop(activations[i]) + activations.append(current_activations) + return activations + + def grads_wrt_params(self, activations, grads_wrt_outputs): + """Calculates gradients with respect to the model parameters. + + Args: + activations: List of all activations from forward pass through + model using `fprop`. + grads_wrt_outputs: Gradient with respect to the model outputs of + the scalar function parameter gradients are being calculated + for. + + Returns: + List of gradients of the scalar function with respect to all model + parameters. 
+ """ + grads_wrt_params = [] + for i, layer in enumerate(self.layers[::-1]): + inputs = activations[-i - 2] + outputs = activations[-i - 1] + grads_wrt_inputs = layer.bprop(inputs, outputs, grads_wrt_outputs) + if isinstance(layer, LayerWithParameters) or isinstance(layer, StochasticLayerWithParameters): + grads_wrt_params += layer.grads_wrt_params( + inputs, grads_wrt_outputs)[::-1] + grads_wrt_outputs = grads_wrt_inputs + return grads_wrt_params[::-1] + + def __repr__(self): + return ( + 'MultiLayerModel(\n ' + + '\n '.join([str(layer) for layer in self.layers]) + + '\n)' + ) diff --git a/mlp/optimisers.py b/mlp/optimisers.py new file mode 100644 index 00000000..8ab313af --- /dev/null +++ b/mlp/optimisers.py @@ -0,0 +1,148 @@ +# -*- coding: utf-8 -*- +"""Model optimisers. + +This module contains objects implementing (batched) stochastic gradient descent +based optimisation of models. +""" + +import time +import logging +from collections import OrderedDict +import numpy as np +import tqdm + +logger = logging.getLogger(__name__) + + +class Optimiser(object): + """Basic model optimiser.""" + + def __init__(self, model, error, learning_rule, train_dataset, + valid_dataset=None, data_monitors=None, notebook=False): + """Create a new optimiser instance. + + Args: + model: The model to optimise. + error: The scalar error function to minimise. + learning_rule: Gradient based learning rule to use to minimise + error. + train_dataset: Data provider for training set data batches. + valid_dataset: Data provider for validation set data batches. + data_monitors: Dictionary of functions evaluated on targets and + model outputs (averaged across both full training and + validation data sets) to monitor during training in addition + to the error. Keys should correspond to a string label for + the statistic being evaluated. 
+ """ + self.model = model + self.error = error + self.learning_rule = learning_rule + self.learning_rule.initialise(self.model.params) + self.train_dataset = train_dataset + self.valid_dataset = valid_dataset + self.data_monitors = OrderedDict([('error', error)]) + if data_monitors is not None: + self.data_monitors.update(data_monitors) + self.notebook = notebook + if notebook: + self.tqdm_progress = tqdm.tqdm_notebook + else: + self.tqdm_progress = tqdm.tqdm + + def do_training_epoch(self): + """Do a single training epoch. + + This iterates through all batches in training dataset, for each + calculating the gradient of the estimated error given the batch with + respect to all the model parameters and then updates the model + parameters according to the learning rule. + """ + with self.tqdm_progress(total=self.train_dataset.num_batches) as train_progress_bar: + train_progress_bar.set_description("Epoch Progress") + for inputs_batch, targets_batch in self.train_dataset: + activations = self.model.fprop(inputs_batch) + grads_wrt_outputs = self.error.grad(activations[-1], targets_batch) + grads_wrt_params = self.model.grads_wrt_params( + activations, grads_wrt_outputs) + self.learning_rule.update_params(grads_wrt_params) + train_progress_bar.update(1) + + def eval_monitors(self, dataset, label): + """Evaluates the monitors for the given dataset. + + Args: + dataset: Dataset to perform evaluation with. + label: Tag to add to end of monitor keys to identify dataset. + + Returns: + OrderedDict of monitor values evaluated on dataset. + """ + data_mon_vals = OrderedDict([(key + label, 0.) 
for key + in self.data_monitors.keys()]) + for inputs_batch, targets_batch in dataset: + activations = self.model.fprop(inputs_batch, evaluation=True) + for key, data_monitor in self.data_monitors.items(): + data_mon_vals[key + label] += data_monitor( + activations[-1], targets_batch) + for key, data_monitor in self.data_monitors.items(): + data_mon_vals[key + label] /= dataset.num_batches + return data_mon_vals + + def get_epoch_stats(self): + """Computes training statistics for an epoch. + + Returns: + An OrderedDict with keys corresponding to the statistic labels and + values corresponding to the value of the statistic. + """ + epoch_stats = OrderedDict() + epoch_stats.update(self.eval_monitors(self.train_dataset, '(train)')) + if self.valid_dataset is not None: + epoch_stats.update(self.eval_monitors( + self.valid_dataset, '(valid)')) + return epoch_stats + + def log_stats(self, epoch, epoch_time, stats): + """Outputs stats for a training epoch to a logger. + + Args: + epoch (int): Epoch counter. + epoch_time: Time taken in seconds for the epoch to complete. + stats: Monitored stats for the epoch. + """ + logger.info('Epoch {0}: {1:.1f}s to complete\n {2}'.format( + epoch, epoch_time, + ', '.join(['{0}={1:.2e}'.format(k, v) for (k, v) in stats.items()]) + )) + + def train(self, num_epochs, stats_interval=5): + """Trains a model for a set number of epochs. + + Args: + num_epochs: Number of epochs (complete passes through trainin + dataset) to train for. + stats_interval: Training statistics will be recorded and logged + every `stats_interval` epochs. + + Returns: + Tuple with first value being an array of training run statistics + and the second being a dict mapping the labels for the statistics + recorded to their column index in the array. 
+ """ + start_train_time = time.time() + run_stats = [list(self.get_epoch_stats().values())] + with self.tqdm_progress(total=num_epochs) as progress_bar: + progress_bar.set_description("Experiment Progress") + for epoch in range(1, num_epochs + 1): + start_time = time.time() + self.do_training_epoch() + epoch_time = time.time()- start_time + if epoch % stats_interval == 0: + stats = self.get_epoch_stats() + self.log_stats(epoch, epoch_time, stats) + run_stats.append(list(stats.values())) + progress_bar.update(1) + finish_train_time = time.time() + total_train_time = finish_train_time - start_train_time + return np.array(run_stats), {k: i for i, k in enumerate(stats.keys())}, total_train_time + diff --git a/mlp/penalties.py b/mlp/penalties.py new file mode 100644 index 00000000..28764344 --- /dev/null +++ b/mlp/penalties.py @@ -0,0 +1,90 @@ +import numpy as np + +seed = 22102017 +rng = np.random.RandomState(seed) + + +class L1Penalty(object): + """L1 parameter penalty. + + Term to add to the objective function penalising parameters + based on their L1 norm. + """ + + def __init__(self, coefficient): + """Create a new L1 penalty object. + + Args: + coefficient: Positive constant to scale penalty term by. + """ + assert coefficient > 0., 'Penalty coefficient must be positive.' + self.coefficient = coefficient + + def __call__(self, parameter): + """Calculate L1 penalty value for a parameter. + + Args: + parameter: Array corresponding to a model parameter. + + Returns: + Value of penalty term. + """ + return self.coefficient * abs(parameter).sum() + + def grad(self, parameter): + """Calculate the penalty gradient with respect to the parameter. + + Args: + parameter: Array corresponding to a model parameter. + + Returns: + Value of penalty gradient with respect to parameter. This + should be an array of the same shape as the parameter. 
+ """ + return self.coefficient * np.sign(parameter) + + def __repr__(self): + return 'L1Penalty({0})'.format(self.coefficient) + + +class L2Penalty(object): + """L1 parameter penalty. + + Term to add to the objective function penalising parameters + based on their L2 norm. + """ + + def __init__(self, coefficient): + """Create a new L2 penalty object. + + Args: + coefficient: Positive constant to scale penalty term by. + """ + assert coefficient > 0., 'Penalty coefficient must be positive.' + self.coefficient = coefficient + + def __call__(self, parameter): + """Calculate L2 penalty value for a parameter. + + Args: + parameter: Array corresponding to a model parameter. + + Returns: + Value of penalty term. + """ + return 0.5 * self.coefficient * (parameter ** 2).sum() + + def grad(self, parameter): + """Calculate the penalty gradient with respect to the parameter. + + Args: + parameter: Array corresponding to a model parameter. + + Returns: + Value of penalty gradient with respect to parameter. This + should be an array of the same shape as the parameter. + """ + return self.coefficient * parameter + + def __repr__(self): + return 'L2Penalty({0})'.format(self.coefficient) \ No newline at end of file diff --git a/mlp/schedulers.py b/mlp/schedulers.py new file mode 100644 index 00000000..4f53e7ee --- /dev/null +++ b/mlp/schedulers.py @@ -0,0 +1,34 @@ +# -*- coding: utf-8 -*- +"""Training schedulers. + +This module contains classes implementing schedulers which control the +evolution of learning rule hyperparameters (such as learning rate) over a +training run. +""" + +import numpy as np + + +class ConstantLearningRateScheduler(object): + """Example of scheduler interface which sets a constant learning rate.""" + + def __init__(self, learning_rate): + """Construct a new constant learning rate scheduler object. + + Args: + learning_rate: Learning rate to use in learning rule. 
+ """ + self.learning_rate = learning_rate + + def update_learning_rule(self, learning_rule, epoch_number): + """Update the hyperparameters of the learning rule. + + Run at the beginning of each epoch. + + Args: + learning_rule: Learning rule object being used in training run, + any scheduled hyperparameters to be altered should be + attributes of this object. + epoch_number: Integer index of training epoch about to be run. + """ + learning_rule.learning_rate = self.learning_rate diff --git a/notebooks/Coursework_2_Pytorch_Introduction.ipynb b/notebooks/Coursework_2_Pytorch_Introduction.ipynb new file mode 100644 index 00000000..3c79a18d --- /dev/null +++ b/notebooks/Coursework_2_Pytorch_Introduction.ipynb @@ -0,0 +1,665 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Introduction to PyTorch \n", + "\n", + "## Introduction\n", + "Pytorch is a modern, intuitive, Pythonic and fast framework for building differentiable graphs. Neural networks, as it happens, are a type of acyclic differentiable graph, making PyTorch a convenient framework to use, should you wish to build (potentially) complicated deep neural networks fairly easily.\n", + "\n", + "## MLP package vs Pytorch\n", + "**Student**: Why do I have to learn to use PyTorch now? I've spent all this time working on the MLP framework. Was that a waste of time?\n", + "\n", + "**TA**: Pytorch is everything the MLP package is, and more. It's faster, cleaner and far more up to date with modern deep learning advances, meaning it is easy to tailor to experiments you may wish to run. Since it is one of the main deep learning frameworks being used by industry and research alike, it conforms to the expectation of real users like researchers and engineers. The result is that PyTorch is (and continues to become) a robust and flexible package. Coming to grips with PyTorch now means that you'll be able to apply it to any future project that uses deep learning. 
\n", + "\n", + "Furthermore, the MLP framework was written in NumPy and your time developing this has taught you some fundamental implementation details of NNs: this could (and should) make future research directions more easy to think of and will also enable your debugging prowess. PyTorch was written to emulate NumPy as much as possible, so it will feel very familiar to you. The skills you have acquired are highly transferable (they generalize well, so not much overfitting there!).\n", + "\n", + "The devleopers of PyTorch try to make sure that the \"latest and greatest\" state-of-the-art research is included and implemented. If this is not the case, you will often find other people reproducing . If you can't wait, you can reproduce it yourself and open source it (a great way to showcase your skills and get github likes).\n", + "\n", + "PyTorch has Autograd! Automatic differentiation. \"What is this?\" you may ask. Remember having to write all those backprop functions? Forget about it. Automatic differentiation allows you to backprop through any PyTorch operation you have used in your graph, by simply calling backward(). This [blog-post](https://jdhao.github.io/2017/11/12/pytorch-computation-graph/) explains how Pytorch's autograd works at an intuitive level.\n", + "\n", + "**Student**: Why did we even have to use the MLP package? Why did we even bother if such awesome frameworks are available?\n", + "\n", + "**TA**: The purpose of the MLP package was not to allow you to build fast deep learning systems. Instead, it was to help teach you the low level mechanics and sensitivities of building a deep learning system. Building this enabled you to dive deep into how to go about building a deep learning framework from scratch. The intuitions you have gained from going through your assignments and courseworks allow you to see deeper in what makes or breaks a deep learning system, at a level few people actually care to explore. 
You are no longer restricted to the higher level modules provided by Pytorch/TensorFlow. \n", + "\n", + "If, for example, a new project required you to build something that does not exist in PyTorch/TensorFlow, or otherwise modify existing modules in a way that requires understanding and intuitions on backpropagation and layer/optimizer/component implementation, you would be able to do it much more easily than others who did not. You are now equipped to understand differentiable graphs, the chain rule, numerical errors, debugging at the lowest level and deep learning system architecture. \n", + "\n", + "By trying to implement your modules in an efficient way, you have also become aware of how to optimize a system for efficiency, and gave you intuitions on how one could further improve such a system (parallelization of implementations). \n", + "\n", + "Finally, the slowness of CPU training has allowed you to understand just how important modern GPU acceleration is, for deep learning research and applications. By coming across a large breadth of problems and understanding their origins, you will now be able to both anticipate and solve future problems in a more comprehensive way than someone who did not go through the trouble of implementing the basics from scratch. \n", + "\n", + "\n", + "\n", + "## Getting Started\n", + "\n", + "**Student**: So, how is the learning curve of Pytorch? How do I start?\n", + "\n", + "**TA**: You can start by using this notebook on your experiments, it should teach you quite a lot on how to properly use PyTorch for basic conv net training. You should be aware of the [official pytorch github](https://github.com/pytorch/pytorch), the [pytorch official documentation page](https://pytorch.org/docs/stable/nn.html) and the [pytorch tutorials page](https://pytorch.org/tutorials/). 
\n", + "\n", + "Over the past year, nearly all students using PyTorch and Tensorflow on MLP and on projects found it easier and faster to get up to speed with PyTorch. In fact, I was a TensorFlow user myself, and learning TensorFlow was much more challenging than PyTorch. Mainly because TensorFlow has its own way of 'thinking' about how you build a graph and execute operations - whereas PyTorch is dynamic and works like NumPy, hence is more intuitive. If you were able to work well with the MLP package, you'll be up and running in no time. \n", + "\n", + "**Student**: OK, so how fast is pytorch compared to MLP?\n", + "\n", + "**TA**: On the CPU side of things, you'll find pytorch at least 5x faster than the MLP framework (about equal for fully connected networks, but much faster for more complicated things like convolutions - unless you write extremely efficient convolutional layer code), and if you choose to use GPUs, either using MS Azure, Google Cloud or our very own MLP Cluster (available for next semester), you can expect, depending on implementation and hardware an approximate 25-70x speed ups, compared to the CPU performance of pytorch. Yes, that means an experiment that would run overnight, now would only require about 15 minutes.\n", + "\n", + "**Student**: Ahh, where should I go to ask more questions?\n", + "\n", + "**TA**: As always, start with a Google/DuckDuckGo search, then have a look at the PyTorch Github and PyTorch docs, and if you can't find the answer come to Piazza and the lab sessions. We will be there to support you.\n", + "\n", + "\n", + "#### Note: The code in this jupyter notebook is to introduce you to pytorch and allow you to play around with it in an interactive manner. However, to run your experiments, you should use the Pytorch experiment framework located in ```pytorch_mlp_framework/```. 
Instructions on how to use it can be found in ```notes/pytorch-experiment-framework.md``` along with the comments and documentation included in the code itself." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Imports and helper functions\n", + "\n", + "First, let's import the packages necessary for our tutorial" + ] + }, + { + "cell_type": "code", + "execution_count": 89, + "metadata": {}, + "outputs": [], + "source": [ + "from torch import nn\n", + "from copy import deepcopy\n", + "import torch\n", + "import torch.nn as nn\n", + "import torch.optim as optim\n", + "import torch.nn.functional as F\n", + "import torch.backends.cudnn as cudnn\n", + "import torchvision\n", + "import tqdm\n", + "import os\n", + "import mlp.data_providers as data_providers\n", + "import numpy as np" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, let's write a helper function for plotting" + ] + }, + { + "cell_type": "code", + "execution_count": 103, + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "%matplotlib inline\n", + "plt.style.use('ggplot')\n", + "\n", + "def plot_stats_in_graph(total_losses, y_axis_label, x_axis_label):\n", + " \n", + " # Plot the change in the validation and training set error over training.\n", + " fig_1 = plt.figure(figsize=(8, 4))\n", + " ax_1 = fig_1.add_subplot(111)\n", + " for k in total_losses.keys():\n", + " if \"loss\" in k:\n", + " ax_1.plot(np.arange(len(total_losses[k])), total_losses[k], label=k)\n", + " ax_1.legend(loc=0)\n", + " ax_1.set_xlabel(x_axis_label)\n", + " ax_1.set_ylabel(y_axis_label)\n", + " \n", + "\n", + " plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Basics: What is a tensor?\n", + "\n", + "In numpy we used arrays, whereas in pytorch we use tensors. 
Tensors are basically multi-dimensional arrays, that can also automatically compute backward passes, and thus gradients, as well as store data to be used at any point in our pytorch pipelines." + ] + }, + { + "cell_type": "code", + "execution_count": 104, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor([ 5., 1., 10.]) tensor(5.3333) tensor(3.6818) \n", + " [ 5. 1. 10.] 5.3333335 3.6817868\n" + ] + } + ], + "source": [ + "data_pytorch = torch.Tensor([5., 1., 10.]).float()\n", + "data_numpy = np.array([5., 1., 10]).astype(np.float32)\n", + "\n", + "print(data_pytorch, data_pytorch.mean(), data_pytorch.std(unbiased=False), '\\n',\n", + " data_numpy, data_numpy.mean(), data_numpy.std())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Tensors have a rich support for a variety of operations, for more information look at the official pytorch [documentation page](https://pytorch.org/docs/stable/torch.html#torch.std)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Basics: A simple pytorch graph of operations\n", + "\n", + "Pytorch automatically tracks the flow of data through operations without requiring explicit instruction to do so. 
\n", + "For example, we can easily compute the grads wrt to a variable **a** (which is initialized with requires grad = True to let the framework know that we'll be requiring the grads of that variable) by simple calling .backward() followed by .grad:\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 105, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor([[[[0.0016, 0.0018, 0.0013, ..., 0.0024, 0.0022, 0.0012],\n", + " [0.0017, 0.0027, 0.0025, ..., 0.0022, 0.0016, 0.0023],\n", + " [0.0017, 0.0023, 0.0020, ..., 0.0012, 0.0019, 0.0027],\n", + " ...,\n", + " [0.0020, 0.0011, 0.0018, ..., 0.0025, 0.0028, 0.0026],\n", + " [0.0022, 0.0021, 0.0016, ..., 0.0018, 0.0016, 0.0024],\n", + " [0.0017, 0.0023, 0.0022, ..., 0.0025, 0.0024, 0.0022]],\n", + "\n", + " [[0.0021, 0.0025, 0.0019, ..., 0.0017, 0.0022, 0.0026],\n", + " [0.0024, 0.0020, 0.0020, ..., 0.0025, 0.0018, 0.0017],\n", + " [0.0025, 0.0023, 0.0019, ..., 0.0017, 0.0024, 0.0013],\n", + " ...,\n", + " [0.0027, 0.0014, 0.0022, ..., 0.0015, 0.0012, 0.0021],\n", + " [0.0030, 0.0019, 0.0025, ..., 0.0029, 0.0027, 0.0032],\n", + " [0.0021, 0.0024, 0.0021, ..., 0.0019, 0.0018, 0.0021]],\n", + "\n", + " [[0.0019, 0.0026, 0.0024, ..., 0.0029, 0.0023, 0.0023],\n", + " [0.0016, 0.0017, 0.0021, ..., 0.0023, 0.0016, 0.0022],\n", + " [0.0021, 0.0026, 0.0023, ..., 0.0019, 0.0021, 0.0021],\n", + " ...,\n", + " [0.0011, 0.0026, 0.0020, ..., 0.0017, 0.0020, 0.0021],\n", + " [0.0022, 0.0025, 0.0024, ..., 0.0027, 0.0019, 0.0017],\n", + " [0.0015, 0.0020, 0.0015, ..., 0.0019, 0.0024, 0.0022]]],\n", + "\n", + "\n", + " [[[0.0019, 0.0026, 0.0030, ..., 0.0017, 0.0020, 0.0030],\n", + " [0.0012, 0.0029, 0.0026, ..., 0.0020, 0.0024, 0.0019],\n", + " [0.0021, 0.0019, 0.0024, ..., 0.0032, 0.0022, 0.0020],\n", + " ...,\n", + " [0.0021, 0.0023, 0.0020, ..., 0.0024, 0.0020, 0.0019],\n", + " [0.0021, 0.0020, 0.0026, ..., 0.0024, 0.0017, 0.0022],\n", + " [0.0019, 0.0022, 0.0021, 
..., 0.0024, 0.0023, 0.0024]],\n", + "\n", + " [[0.0019, 0.0027, 0.0015, ..., 0.0027, 0.0020, 0.0023],\n", + " [0.0020, 0.0025, 0.0021, ..., 0.0020, 0.0020, 0.0020],\n", + " [0.0019, 0.0017, 0.0019, ..., 0.0019, 0.0018, 0.0025],\n", + " ...,\n", + " [0.0024, 0.0022, 0.0026, ..., 0.0013, 0.0020, 0.0026],\n", + " [0.0023, 0.0017, 0.0021, ..., 0.0024, 0.0018, 0.0026],\n", + " [0.0017, 0.0019, 0.0023, ..., 0.0020, 0.0020, 0.0024]],\n", + "\n", + " [[0.0018, 0.0030, 0.0020, ..., 0.0024, 0.0028, 0.0019],\n", + " [0.0026, 0.0023, 0.0026, ..., 0.0023, 0.0022, 0.0022],\n", + " [0.0023, 0.0024, 0.0013, ..., 0.0025, 0.0020, 0.0027],\n", + " ...,\n", + " [0.0024, 0.0018, 0.0024, ..., 0.0012, 0.0021, 0.0023],\n", + " [0.0019, 0.0016, 0.0016, ..., 0.0024, 0.0019, 0.0021],\n", + " [0.0029, 0.0020, 0.0018, ..., 0.0022, 0.0021, 0.0021]]],\n", + "\n", + "\n", + " [[[0.0020, 0.0022, 0.0014, ..., 0.0013, 0.0019, 0.0025],\n", + " [0.0020, 0.0023, 0.0021, ..., 0.0021, 0.0017, 0.0019],\n", + " [0.0023, 0.0024, 0.0021, ..., 0.0024, 0.0024, 0.0028],\n", + " ...,\n", + " [0.0025, 0.0018, 0.0017, ..., 0.0024, 0.0014, 0.0023],\n", + " [0.0029, 0.0026, 0.0024, ..., 0.0030, 0.0025, 0.0022],\n", + " [0.0018, 0.0017, 0.0025, ..., 0.0024, 0.0024, 0.0027]],\n", + "\n", + " [[0.0021, 0.0021, 0.0020, ..., 0.0020, 0.0017, 0.0025],\n", + " [0.0021, 0.0018, 0.0014, ..., 0.0019, 0.0014, 0.0018],\n", + " [0.0027, 0.0023, 0.0023, ..., 0.0024, 0.0023, 0.0030],\n", + " ...,\n", + " [0.0025, 0.0023, 0.0016, ..., 0.0028, 0.0020, 0.0021],\n", + " [0.0032, 0.0021, 0.0018, ..., 0.0024, 0.0021, 0.0030],\n", + " [0.0025, 0.0021, 0.0011, ..., 0.0019, 0.0021, 0.0022]],\n", + "\n", + " [[0.0018, 0.0021, 0.0016, ..., 0.0022, 0.0019, 0.0018],\n", + " [0.0023, 0.0031, 0.0017, ..., 0.0026, 0.0024, 0.0023],\n", + " [0.0020, 0.0022, 0.0013, ..., 0.0021, 0.0028, 0.0024],\n", + " ...,\n", + " [0.0018, 0.0013, 0.0023, ..., 0.0021, 0.0021, 0.0019],\n", + " [0.0025, 0.0005, 0.0016, ..., 0.0021, 0.0017, 0.0015],\n", + " [0.0026, 
0.0021, 0.0012, ..., 0.0021, 0.0018, 0.0021]]],\n", + "\n", + "\n", + " ...,\n", + "\n", + "\n", + " [[[0.0014, 0.0020, 0.0025, ..., 0.0020, 0.0016, 0.0021],\n", + " [0.0025, 0.0022, 0.0020, ..., 0.0018, 0.0017, 0.0025],\n", + " [0.0021, 0.0016, 0.0020, ..., 0.0021, 0.0023, 0.0025],\n", + " ...,\n", + " [0.0025, 0.0016, 0.0029, ..., 0.0024, 0.0022, 0.0024],\n", + " [0.0015, 0.0028, 0.0024, ..., 0.0020, 0.0017, 0.0021],\n", + " [0.0027, 0.0022, 0.0018, ..., 0.0025, 0.0022, 0.0019]],\n", + "\n", + " [[0.0027, 0.0024, 0.0019, ..., 0.0026, 0.0019, 0.0013],\n", + " [0.0029, 0.0019, 0.0021, ..., 0.0027, 0.0024, 0.0023],\n", + " [0.0022, 0.0013, 0.0018, ..., 0.0022, 0.0015, 0.0025],\n", + " ...,\n", + " [0.0020, 0.0017, 0.0020, ..., 0.0023, 0.0024, 0.0024],\n", + " [0.0024, 0.0021, 0.0021, ..., 0.0026, 0.0026, 0.0027],\n", + " [0.0022, 0.0019, 0.0030, ..., 0.0022, 0.0023, 0.0022]],\n", + "\n", + " [[0.0029, 0.0017, 0.0022, ..., 0.0021, 0.0023, 0.0020],\n", + " [0.0014, 0.0021, 0.0020, ..., 0.0024, 0.0019, 0.0019],\n", + " [0.0025, 0.0024, 0.0020, ..., 0.0021, 0.0020, 0.0020],\n", + " ...,\n", + " [0.0023, 0.0020, 0.0020, ..., 0.0028, 0.0021, 0.0025],\n", + " [0.0019, 0.0021, 0.0022, ..., 0.0021, 0.0025, 0.0020],\n", + " [0.0017, 0.0023, 0.0023, ..., 0.0028, 0.0021, 0.0014]]],\n", + "\n", + "\n", + " [[[0.0020, 0.0018, 0.0020, ..., 0.0025, 0.0021, 0.0024],\n", + " [0.0019, 0.0023, 0.0023, ..., 0.0021, 0.0014, 0.0018],\n", + " [0.0022, 0.0022, 0.0022, ..., 0.0025, 0.0027, 0.0025],\n", + " ...,\n", + " [0.0016, 0.0023, 0.0016, ..., 0.0020, 0.0025, 0.0012],\n", + " [0.0022, 0.0018, 0.0020, ..., 0.0026, 0.0020, 0.0019],\n", + " [0.0025, 0.0024, 0.0021, ..., 0.0022, 0.0022, 0.0026]],\n", + "\n", + " [[0.0029, 0.0022, 0.0022, ..., 0.0029, 0.0016, 0.0018],\n", + " [0.0021, 0.0021, 0.0023, ..., 0.0027, 0.0024, 0.0026],\n", + " [0.0012, 0.0023, 0.0025, ..., 0.0028, 0.0016, 0.0022],\n", + " ...,\n", + " [0.0021, 0.0020, 0.0017, ..., 0.0023, 0.0021, 0.0020],\n", + " [0.0027, 0.0012, 
0.0012, ..., 0.0023, 0.0015, 0.0017],\n", + " [0.0024, 0.0021, 0.0020, ..., 0.0011, 0.0018, 0.0020]],\n", + "\n", + " [[0.0017, 0.0019, 0.0022, ..., 0.0026, 0.0018, 0.0009],\n", + " [0.0021, 0.0020, 0.0028, ..., 0.0018, 0.0017, 0.0026],\n", + " [0.0023, 0.0020, 0.0022, ..., 0.0023, 0.0019, 0.0016],\n", + " ...,\n", + " [0.0023, 0.0023, 0.0019, ..., 0.0026, 0.0016, 0.0024],\n", + " [0.0019, 0.0022, 0.0015, ..., 0.0021, 0.0018, 0.0024],\n", + " [0.0017, 0.0018, 0.0028, ..., 0.0020, 0.0017, 0.0031]]],\n", + "\n", + "\n", + " [[[0.0017, 0.0021, 0.0019, ..., 0.0020, 0.0026, 0.0022],\n", + " [0.0023, 0.0021, 0.0017, ..., 0.0016, 0.0018, 0.0019],\n", + " [0.0015, 0.0020, 0.0022, ..., 0.0015, 0.0028, 0.0027],\n", + " ...,\n", + " [0.0020, 0.0019, 0.0015, ..., 0.0019, 0.0018, 0.0019],\n", + " [0.0025, 0.0026, 0.0021, ..., 0.0015, 0.0023, 0.0023],\n", + " [0.0016, 0.0019, 0.0022, ..., 0.0022, 0.0011, 0.0024]],\n", + "\n", + " [[0.0019, 0.0013, 0.0020, ..., 0.0015, 0.0026, 0.0027],\n", + " [0.0022, 0.0017, 0.0022, ..., 0.0016, 0.0017, 0.0023],\n", + " [0.0028, 0.0026, 0.0013, ..., 0.0029, 0.0026, 0.0017],\n", + " ...,\n", + " [0.0028, 0.0018, 0.0021, ..., 0.0025, 0.0017, 0.0022],\n", + " [0.0026, 0.0016, 0.0019, ..., 0.0026, 0.0016, 0.0019],\n", + " [0.0020, 0.0015, 0.0021, ..., 0.0027, 0.0027, 0.0011]],\n", + "\n", + " [[0.0028, 0.0024, 0.0025, ..., 0.0020, 0.0026, 0.0020],\n", + " [0.0027, 0.0022, 0.0013, ..., 0.0021, 0.0027, 0.0026],\n", + " [0.0018, 0.0016, 0.0024, ..., 0.0020, 0.0022, 0.0024],\n", + " ...,\n", + " [0.0026, 0.0017, 0.0020, ..., 0.0024, 0.0021, 0.0012],\n", + " [0.0019, 0.0022, 0.0020, ..., 0.0025, 0.0028, 0.0019],\n", + " [0.0023, 0.0019, 0.0018, ..., 0.0017, 0.0021, 0.0020]]]])\n" + ] + } + ], + "source": [ + "a = torch.randn((32, 3, 14, 14), requires_grad=True)\n", + "b = torch.ones((32, 3, 14, 14)) * 5\n", + "\n", + "result_addition = a + b\n", + "result_double = result_addition * 2\n", + "result_square = result_double ** 2\n", + "result_mean = 
result_square.mean()\n", + "\n", + "loss = result_mean\n", + "\n", + "loss.backward()\n", + "\n", + "print(a.grad)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Student**: Ok, so we can build graphs, what about neural networks? Are there any pre-built layers? How do we train things? How do we define parameters and biases for our models? \n", + "\n", + "**TA**: Don't rush. Let's take it step by step. Let's look at nn.Parameters first.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**TA**: In Pytorch all learnable components are created using the nn.Parameter class. That class, automatically tracks all gradients, and allows quick and easy updates in a given graph.\n", + "\n", + "**Note**: np.dot for a single batch going to a single 2D weight matrix is called using F.linear in Pytorch.\n", + "\n", + "**Further Note**: There also exist ParameterDicts for dictionaries of parameters, and ParameterLists when you define a list of parameters for part of your model." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 106, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "torch.Size([16, 32])\n", + "current loss tensor(-0.0286, grad_fn=)\n", + "current loss tensor(-0.0366, grad_fn=)\n", + "current loss tensor(-0.0524, grad_fn=)\n", + "current loss tensor(-0.0762, grad_fn=)\n", + "current loss tensor(-0.1079, grad_fn=)\n", + "current loss tensor(-0.1475, grad_fn=)\n", + "current loss tensor(-0.1950, grad_fn=)\n", + "current loss tensor(-0.2505, grad_fn=)\n", + "current loss tensor(-0.3139, grad_fn=)\n", + "current loss tensor(-0.3852, grad_fn=)\n" + ] + } + ], + "source": [ + "weights = nn.Parameter(torch.randn(32, 32), requires_grad=True)\n", + "inputs = torch.randn(16, 32)\n", + "outputs = F.linear(inputs, weights)\n", + "learning_rate = 0.1\n", + "\n", + "print(outputs.shape)\n", + "\n", + "for i in range(10):\n", + " outputs = F.linear(inputs, weights)\n", + " loss = torch.mean(outputs)\n", + " loss.backward()\n", + " weights.data = weights.data - learning_rate * weights.grad\n", + " print('current loss', loss)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## nn.Modules and why they are important\n", + "\n", + "Pytorch implements a class called the nn.Module class. The nn.Module class automatically detects any nn.Parameter, nn.ParameterList or nn.ParameterDict and adds it to a collection of parameters which can be easily accessed using .parameters and/or .named_parameters().\n", + "\n", + "Let's look at an example:\n", + "\n", + "Let's build a fully connected layer followed by an activation function that can be preselected, similar to coursework 1. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 124, + "metadata": {}, + "outputs": [], + "source": [ + "class LinearLayerWithActivation(nn.Module):\n", + " def __init__(self, input_shape, num_units, bias=False, activation_type=nn.ReLU()):\n", + " super(LinearLayerWithActivation, self).__init__()\n", + " self.activation_type = activation_type\n", + " self.weights = nn.Parameter(torch.empty(size=(num_units, input_shape[1]), requires_grad=True))\n", + " \n", + " nn.init.normal_(self.weights)\n", + " \n", + " if bias:\n", + " self.bias = nn.Parameter(torch.zeros(num_units), requires_grad=True)\n", + " else:\n", + " self.bias = None\n", + " \n", + " def forward(self, x):\n", + " out = F.linear(x, self.weights, self.bias)\n", + " out = self.activation_type.forward(out)\n", + " return out\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 175, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Parameters with name weights and shape torch.Size([512, 128])\n", + "Parameters with name bias and shape torch.Size([512])\n" + ] + } + ], + "source": [ + "x = torch.arange(16*128).view(16, 128).float()\n", + "y = torch.arange((16))\n", + "\n", + "fcc_net = LinearLayerWithActivation(input_shape=x.shape, num_units=512, bias=True, activation_type=nn.Identity())\n", + "optimizer = optim.Adam(fcc_net.parameters(), amsgrad=False, weight_decay=0.0)\n", + "\n", + "\n", + "for name, params in fcc_net.named_parameters():\n", + " print('Parameters with name', name, 'and shape', params.shape)\n", + "\n", + "metric_dict = {'losses': []} \n", + " \n", + "for i in range(50):\n", + "\n", + " out = fcc_net.forward(x)\n", + " loss = F.cross_entropy(out, y)\n", + " fcc_net.zero_grad() #removes grads of previous step\n", + " optimizer.zero_grad() #removes grads of previous step\n", + " loss.backward() #compute gradients of current step\n", + " optimizer.step() #update step\n", + " 
metric_dict['losses'].append(loss.detach().cpu().numpy()) #.detach: Copies the value of the loss \n", + "# and removes it from the graph, \n", + "# .cpu() sends to cpu, and \n", + "# numpy(), converts it to numpy format." + ] + }, + { + "cell_type": "code", + "execution_count": 162, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "plot_stats_in_graph(metric_dict, y_axis_label='Loss', x_axis_label='Number of Steps')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**TA**: Does that make sense now?\n", + "\n", + "**Student**: Yeah, somewhat. What about more complicated systems? Will I have to implement everything using barebone components like F.linear etc.?\n", + "\n", + "**TA**: You can use existing nn.Modules as components of new nn.Modules therefore, you are able of modularizing your network blocks, and then combining them at the end in one big network with very few lines of code. Pytorch already provides almost every kind of layer out there in their torch.nn package. Look at the [documentation](https://pytorch.org/docs/stable/nn.html) for more information. Now, let's see how we can combine modules to build a larger module. Let's build a multi layer fully connected module.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 163, + "metadata": {}, + "outputs": [], + "source": [ + "class MultiLayerFCCNetwork(nn.Module):\n", + " def __init__(self, input_shape, num_hidden_units, num_output_units, num_hidden_layers):\n", + " super(MultiLayerFCCNetwork, self).__init__()\n", + " self.input_shape = input_shape\n", + " self.num_hidden_units = num_hidden_units\n", + " self.num_output_units = num_output_units\n", + " self.num_hidden_layers = num_hidden_layers\n", + " \n", + " x_dummy = torch.zeros(input_shape)\n", + " \n", + " self.layer_dict = nn.ModuleDict() # Allows us to initialize modules within a dictionary structure.\n", + " out = x_dummy\n", + " for i in range(self.num_hidden_layers):\n", + " self.layer_dict['layer_{}'.format(i)] = LinearLayerWithActivation(input_shape=out.shape, \n", + " num_units=self.num_hidden_units, bias=True,\n", + " activation_type=nn.PReLU())\n", + " \n", + " out = self.layer_dict['layer_{}'.format(i)].forward(out)\n", + " \n", + " 
self.layer_dict['output_layer'] = LinearLayerWithActivation(input_shape=out.shape, \n", + " num_units=self.num_output_units, \n", + " bias=True, activation_type=nn.Identity())\n", + " out = self.layer_dict['output_layer'].forward(out)\n", + " \n", + " def forward(self, x):\n", + " out = x\n", + " for i in range(self.num_hidden_layers):\n", + " out = self.layer_dict['layer_{}'.format(i)].forward(out)\n", + "\n", + " out = self.layer_dict['output_layer'].forward(out)\n", + " return out\n", + " \n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 173, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Parameters with name layer_dict.layer_0.weights and shape torch.Size([64, 128])\n", + "Parameters with name layer_dict.layer_0.bias and shape torch.Size([64])\n", + "Parameters with name layer_dict.layer_0.activation_type.weight and shape torch.Size([1])\n", + "Parameters with name layer_dict.layer_1.weights and shape torch.Size([64, 64])\n", + "Parameters with name layer_dict.layer_1.bias and shape torch.Size([64])\n", + "Parameters with name layer_dict.layer_1.activation_type.weight and shape torch.Size([1])\n", + "Parameters with name layer_dict.layer_2.weights and shape torch.Size([64, 64])\n", + "Parameters with name layer_dict.layer_2.bias and shape torch.Size([64])\n", + "Parameters with name layer_dict.layer_2.activation_type.weight and shape torch.Size([1])\n", + "Parameters with name layer_dict.layer_3.weights and shape torch.Size([64, 64])\n", + "Parameters with name layer_dict.layer_3.bias and shape torch.Size([64])\n", + "Parameters with name layer_dict.layer_3.activation_type.weight and shape torch.Size([1])\n", + "Parameters with name layer_dict.output_layer.weights and shape torch.Size([512, 64])\n", + "Parameters with name layer_dict.output_layer.bias and shape torch.Size([512])\n" + ] + } + ], + "source": [ + "fcc_net = MultiLayerFCCNetwork(input_shape=x.shape, num_hidden_units=64, 
num_output_units=512, \n", + " num_hidden_layers=4)\n", + "optimizer = optim.Adam(fcc_net.parameters(), amsgrad=False, weight_decay=0.0)\n", + "\n", + "\n", + "for name, params in fcc_net.named_parameters():\n", + " print('Parameters with name', name, 'and shape', params.shape)\n", + "\n", + "metric_dict = {'losses': []} \n", + " \n", + "for i in range(100):\n", + "\n", + " out = fcc_net.forward(x)\n", + " loss = F.cross_entropy(out, y)\n", + " fcc_net.zero_grad() #removes grads of previous step\n", + " optimizer.zero_grad() #removes grads of previous step\n", + " loss.backward() #compute gradients of current step\n", + " optimizer.step() #update step\n", + "\n", + " metric_dict['losses'].append(loss.detach().cpu().numpy()) #.detach: Copies the value of the loss \n", + "# and removes it from the graph, \n", + "# .cpu() sends to cpu, and \n", + "# numpy(), converts it to numpy format." + ] + }, + { + "cell_type": "code", + "execution_count": 174, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "plot_stats_in_graph(metric_dict, y_axis_label='Loss', x_axis_label='Number of Steps')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**TA**: There we go, the network is doing much better during training with a multi-layer neural network. :)\n", + "\n", + "**Student**: Hmm.. I am weirdly excited even though I have not digested this completely yet. Where do I go to learn more? \n", + "\n", + "**TA**: Firstly, I think you should go and have a look at the MLP Pytorch Framework, so you can learn how Pytorch can be used with more complicated architectures, as well as to learn some good coding practices for research and industry alike. When you are working on your coursework, make sure to have the [pytorch official documentation page](https://pytorch.org/docs/stable/nn.html) open in your browser, as it is extremely well written most of the times. Then, when you have some spare time, perhaps in preparation for next term, I would recommend going through some of the Pytorch tutorials at the [pytorch tutorials page](https://pytorch.org/tutorials/). Finally, the best way to learn, in my opinion, is by engaging with Pytorch through a project that interests you." 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/Plot_Results.ipynb b/notebooks/Plot_Results.ipynb new file mode 100644 index 00000000..5cb3a3a4 --- /dev/null +++ b/notebooks/Plot_Results.ipynb @@ -0,0 +1,200 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import sys\n", + "import matplotlib\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "%matplotlib inline\n", + "plt.style.use('ggplot')\n", + "experiment_dir = 'path/to/mlpractical_directory'" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "def collect_experiment_dicts(target_dir, test_flag=False):\n", + " experiment_dicts = dict()\n", + " for subdir, dir, files in os.walk(target_dir):\n", + " for file in files:\n", + " filepath = None\n", + " if not test_flag:\n", + " if file == 'summary.csv':\n", + " filepath = os.path.join(subdir, file)\n", + " \n", + " elif test_flag:\n", + " if file == 'test_summary.csv':\n", + " filepath = os.path.join(subdir, file)\n", + " \n", + " if filepath is not None:\n", + " \n", + " with open(filepath, 'r') as read_file:\n", + " lines = read_file.readlines()\n", + " \n", + " current_experiment_dict = {key: [] for key in lines[0].replace('\\n', '').split(',')}\n", + " idx_to_key = {idx: key for idx, key in enumerate(lines[0].replace('\\n', '').split(','))}\n", + " \n", + " for line in lines[1:]:\n", + " for idx, value in enumerate(line.replace('\\n', '').split(',')):\n", + " 
current_experiment_dict[idx_to_key[idx]].append(float(value))\n", + " \n", + " experiment_dicts[subdir.split('/')[-2]] = current_experiment_dict\n", + " \n", + " return experiment_dicts\n", + " \n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "VGG_08 ['train_acc', 'train_loss', 'val_acc', 'val_loss']\n", + "VGG_38 ['train_acc', 'train_loss', 'val_acc', 'val_loss']\n" + ] + } + ], + "source": [ + "result_dict = collect_experiment_dicts(target_dir=experiment_dir)\n", + "for key, value in result_dict.items():\n", + " print(key, list(value.keys()))" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "%matplotlib inline\n", + "plt.style.use('ggplot')\n", + "\n", + "def plot_result_graphs(plot_name, stats, keys_to_plot, notebook=True):\n", + " \n", + " fig_1 = plt.figure(figsize=(8, 4))\n", + " ax_1 = fig_1.add_subplot(111)\n", + " for name in keys_to_plot:\n", + " for k in ['train_loss', 'val_loss']:\n", + " item = stats[name][k]\n", + " ax_1.plot(np.arange(0, len(item)), \n", + " item, label='{}_{}'.format(name, k))\n", + " \n", + " ax_1.legend(loc=0)\n", + " ax_1.set_ylabel('Loss')\n", + " ax_1.set_xlabel('Epoch number')\n", + "\n", + " # Plot the change in the validation and training set accuracy over training.\n", + " fig_2 = plt.figure(figsize=(8, 4))\n", + " ax_2 = fig_2.add_subplot(111)\n", + " for name in keys_to_plot:\n", + " for k in ['train_acc', 'val_acc']:\n", + " item = stats[name][k]\n", + " ax_2.plot(np.arange(0, len(item)), \n", + " item, label='{}_{}'.format(name, k))\n", + " \n", + " ax_2.legend(loc=0)\n", + " ax_2.set_ylabel('Accuracy')\n", + " ax_2.set_xlabel('Epoch number')\n", + " \n", + " fig_1.savefig('../data/{}_loss_performance.pdf'.format(plot_name), dpi=None, facecolor='w', edgecolor='w',\n", + " orientation='portrait', 
papertype=None, format='pdf',\n", + " transparent=False, bbox_inches=None, pad_inches=0.1,\n", + " frameon=None, metadata=None)\n", + " \n", + " fig_2.savefig('../data/{}_accuracy_performance.pdf'.format(plot_name), dpi=None, facecolor='w', edgecolor='w',\n", + " orientation='portrait', papertype=None, format='pdf',\n", + " transparent=False, bbox_inches=None, pad_inches=0.1,\n", + " frameon=None, metadata=None)\n", + " \n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + ":32: MatplotlibDeprecationWarning: \n", + "The frameon kwarg was deprecated in Matplotlib 3.1 and will be removed in 3.3. Use facecolor instead.\n", + " fig_1.savefig('../data/{}_loss_performance.pdf'.format(plot_name), dpi=None, facecolor='w', edgecolor='w',\n", + ":37: MatplotlibDeprecationWarning: \n", + "The frameon kwarg was deprecated in Matplotlib 3.1 and will be removed in 3.3. Use facecolor instead.\n", + " fig_2.savefig('../data/{}_accuracy_performance.pdf'.format(plot_name), dpi=None, facecolor='w', edgecolor='w',\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "plot_result_graphs('problem_model', result_dict, keys_to_plot=['VGG_38', 'VGG_08'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/res/fprop-bprop-block-diagram.pdf b/notebooks/res/fprop-bprop-block-diagram.pdf new file mode 100644 index 00000000..6c5f0e08 Binary files /dev/null and b/notebooks/res/fprop-bprop-block-diagram.pdf differ diff --git a/notebooks/res/fprop-bprop-block-diagram.png b/notebooks/res/fprop-bprop-block-diagram.png new file mode 100644 index 00000000..17f6a8b2 Binary files /dev/null and b/notebooks/res/fprop-bprop-block-diagram.png differ diff --git a/notebooks/res/fprop-bprop-block-diagram.tex b/notebooks/res/fprop-bprop-block-diagram.tex new file mode 100644 index 00000000..d2c2c7b7 --- /dev/null +++ b/notebooks/res/fprop-bprop-block-diagram.tex @@ -0,0 +1,65 @@ +\documentclass[tikz]{standalone} + +\usepackage{amsmath} +\usepackage{tikz} +\usetikzlibrary{arrows} +\usetikzlibrary{calc} +\usepackage{ifthen} + +\newcommand{\vct}[1]{\boldsymbol{#1}} +\newcommand{\pd}[2]{\frac{\partial #1}{\partial #2}} + +\tikzstyle{fprop} = [draw,fill=blue!20,minimum size=2em,align=center] +\tikzstyle{bprop} = [draw,fill=red!20,minimum size=2em,align=center] + +\begin{document} + +\begin{tikzpicture}[xscale=1.75] % + % define number of layers + \def\nl{2}; + % model input + \node at (0, 0) (input) {$\vct{x}$}; + % draw fprop through model layers + \foreach \l in {0,...,\nl} 
{ + \node[fprop] at (2 * \l + 1, 0) (fprop\l) {\texttt{layers[\l]} \\ \texttt{.fprop}}; + \ifthenelse{\l > 0}{ + \node at (2 * \l, 0) (hidden\l) {$\vct{h}_\l$}; + \draw[->] (hidden\l) -- (fprop\l); + \draw[->] let \n1={\l - 1} in (fprop\n1) -- (hidden\l); + }{ + \draw[->] (input) -- (fprop\l); + } + } + % model output + \node at (2 * \nl + 2, 0) (output) {$\mathbf{y}$}; + % error function + \node[fprop] at (2 * \nl + 3, 0) (errorfunc) {\texttt{error}}; + % error value + \node at (2 * \nl + 3, -1) (error) {$\bar{E}$}; + % targets + \node at (2 * \nl + 4, -1) (tgt) {$\vct{t}$}; + % error gradient + \node[bprop] at (2 * \nl + 3, -2) (errorgrad) {\texttt{error} \\ \texttt{.grad}}; + % gradient wrt outputs + \node at (2 * \nl + 2, -2) (gradoutput) {$\pd{\bar{E}}{\vct{y}}$}; + \draw[->] (fprop\nl) -- (output); + \draw[->] (output) -- (errorfunc); + \draw[->] (errorfunc) -- (error); + \draw[->] (error) -- (errorgrad); + \draw[->] (errorgrad) -- (gradoutput); + \draw[->] (tgt) |- (errorfunc); + \draw[->] (tgt) |- (errorgrad); + \foreach \l in {0,...,\nl} { + \node[bprop] at (2 * \l + 1, -2) (bprop\l) {\texttt{layers[\l]} \\ \texttt{.bprop}}; + \ifthenelse{\l > 0}{ + \node at (2 * \l, -2) (grad\l) {$\pd{\bar{E}}{\vct{h}_\l}$}; + \draw[<-] (grad\l) -- (bprop\l); + \draw[<-] let \n1={\l - 1} in (bprop\n1) -- (grad\l); + }{} + } + \node at (0, -2) (gradinput) {$\pd{\bar{E}}{\vct{x}}$}; + \draw[->] (bprop0) -- (gradinput); + \draw[->] (gradoutput) -- (bprop\nl); +\end{tikzpicture} + +\end{document} \ No newline at end of file diff --git a/notes/environment-set-up.md b/notes/environment-set-up.md index eff30e2f..fb9ed953 100644 --- a/notes/environment-set-up.md +++ b/notes/environment-set-up.md @@ -25,7 +25,7 @@ the School of Informatics [DICE desktop](http://computing.help.inf.ed.ac.uk/dice should be able to used on other Linux distributions such as Ubuntu and Linux Mint with minimal adjustments. 
For those wishing to install on a personal Windows or OSX machine, the initial instructions for setting up Conda will -differ slightly - you should instead select the relevant installer for your system from [here](https://docs.conda.io/en/latest/miniconda.html) and following the corresponding installation instructions from [here](https://conda.io/projects/conda/en/latest/user-guide/install/index.html). After Conda is installed the [remaining instructions](#creating-the-conda-environment) should be broadly the same across different systems. +differ slightly - you should instead select the relevant installer for your system from [here](http://conda.pydata.org/miniconda.html) and following the corresponding installation instructions from [here](http://conda.pydata.org/docs/install/quick.html). After Conda is installed the [remaining instructions](#creating-the-conda-environment) should be broadly the same across different systems. *Note: Although we are happy for you to additionally set up an environment on a personal machine, you should still set up a DICE environment now as this will make sure you are able to use shared computing resources later in the course. Also although we have tried to note when the required commands will differ on non-DICE systems, these instructions have only been tested on DICE and we will not be able to offer any support in labs on getting set up on a non-DICE system.* @@ -273,7 +273,7 @@ This will change the code in the working directory to the current state of the c You should make sure you are on the first lab branch now by running: ``` -git checkout mlp2022-23/lab1 +git checkout mlp2020-21/lab1 ``` ## 6. Installing the `mlp` Python package @@ -302,11 +302,10 @@ Note that after the first time a Python module is loaded into an interpreter ins import mlp ``` -Running the `import` statement any further times will have no effect even if the underlying module code has been changed. 
To reload an already imported module we instead need to use the [`importlib.reload`](https://docs.python.org/3/library/importlib.html#importlib.reload) function, e.g. +Running the `import` statement any further times will have no effect even if the underlying module code has been changed. To reload an already imported module we instead need to use the [`reload`](https://docs.python.org/2.7/library/functions.html#reload) function, e.g. ``` -import importlib -importlib.reload(mlp) +reload(mlp) ``` **Note: To be clear as this has caused some confusion in previous labs the above `import ...` / `reload(...)` statements should NOT be run directly in a bash terminal. They are examples Python statements - you could run them in a terminal by first loading a Python interpreter using:** @@ -371,7 +370,7 @@ Below are instructions for setting up the environment without additional explana --- -Start a new bash terminal. Download the latest 64-bit Python 3.9 Miniconda install script: +Start a new bash terminal. Download the latest 64-bit Python 2.7 Miniconda install script: ``` wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh @@ -433,7 +432,7 @@ Make sure we are on the first lab branch ``` cd ~/mlpractical -git checkout mlp2022-23/lab1 +git checkout mlp2020-21/lab1 ``` Install the `mlp` package in the environment in develop mode diff --git a/notes/getting-started-in-a-lab.md b/notes/getting-started-in-a-lab.md index 4c1a6ac1..e952e876 100644 --- a/notes/getting-started-in-a-lab.md +++ b/notes/getting-started-in-a-lab.md @@ -34,15 +34,15 @@ We are now ready to fetch any updated code from the remote repository on Github. git fetch origin ``` -This should display a message indicate a new branch has been found and fetched, named `origin/mlp2022-23/lab[n]` where `[n]` is the relevant lab number e.g. `origin/mlp2022-23/lab2` for the second lab. 
+This should display a message indicating a new branch has been found and fetched, named `origin/mlp2018-9/lab[n]` where `[n]` is the relevant lab number e.g. `origin/mlp2018-9/lab2` for the second lab. We now need to create and checkout a new local branch from the remote branch fetched above. This can be done by running ``` -git checkout -b lab[n] origin/mlp2022-23/lab[n] +git checkout -b lab[n] origin/mlp2018-9/lab[n] ``` -where again `lab[n]` corresponds to the relevant lab number fetched above e.g. `lab2`. This command creates a new local branch named `lab[n]` from the fetched branch on the remote repository `origin/mlp2022-23/lab[n]`. +where again `lab[n]` corresponds to the relevant lab number fetched above e.g. `lab2`. This command creates a new local branch named `lab[n]` from the fetched branch on the remote repository `origin/mlp2018-9/lab[n]`. Inside the `notebooks` directory there should new be a new notebook for today's lab. The notebook for the previous lab will now also have proposed solutions filled in. diff --git a/notes/google_cloud_setup.md b/notes/google_cloud_setup.md new file mode 100644 index 00000000..874595f0 --- /dev/null +++ b/notes/google_cloud_setup.md @@ -0,0 +1,114 @@ +# Google Cloud Usage Tutorial +This document has been created to help you set up a google cloud instance to be used for the MLP course using the student credit the course has acquired. +This document is non-exhaustive and much more useful information is available on the [google cloud documentation page](https://cloud.google.com/docs/). +For any question you might have, that is not covered here, a quick google search should get you what you need. Anything in the official google cloud docs should be very helpful. + +| WARNING: Read these instructions carefully.
You will be given 50$ worth of credits and you will need to manage them properly | +| ---------------------------------------------------------------------------------------------------------------------------- | + + +### To create your account and start a project funded by the student credit +1. Login with your preferred gmail id to [google cloud console](https://cloud.google.com/), click on Select a Project on the left hand side of the search bar on top of the page and then click on New Project on the right hand side of the Pop-Up. +Name your project sxxxxxxx-MLPractical - replacing the sxxxxxxx with your student number. Make sure you are on this project before following the next steps. +2. Get your coupon by following the instructions in the coupon retrieval link that you received. +3. Once you receive your coupon, follow the email instructions to add your coupon to your account. +4. Once you have added your coupon, join the [MLPractical GCP Google Group](https://groups.google.com/forum/#!forum/mlpractical_gcp) using the same Google account you used to redeem your coupon. This ensures access to the shared disk images. +5. Make sure that the financial source for your project is the MLPractical credit by clicking the 3 lines icon at the top left corner and then clicking billing -> go to linked billing account. +6. If it's not set to the MLPractical credits then set it by going to billing -> manage billing accounts -> My projects. Click the 3 dots under the Actions column for the relevant project and click change billing account. Select the MLPractical credit from your coupon. +6. Start the project + +### To create an instance +1. Click the button with the three lines at the top left corner. +2. Click ```Compute Engine```. You might be asked to activate it. +3. On the left hand side, select ```VM Instances```. +4. Click the ```CREATE INSTANCE``` button at the top of the window. +5. Name the instance ```mlpractical-1``` +6. 
Select region to be ```us-west1(Oregon)``` and zone to be ```us-west1-b``` (there are other suitable regions however this one has K80s available right now so we went with this one, feel free to find something else if for some reason you need to, but it is recommended to run on K80 GPUs.) +7. In Machine Configuration, select ```GPU``` machine family. +8. Select NVIDIA Tesla K80. Those are the cheapest ones, be careful as others can cost up to 8 times more to run +9. Series and in Machine type select ```2 vCPUs``` with ```7.5Gb memory```. +10. Under ```Boot disk```, click change. +11. On the new menu that appears (under public images), select the ```Deep Learning on Linux``` operating system, with the ```Pytorch 1.10, no-XLA``` version, then click select at the bottom. +12. You should consider going into the ```Advanced Options``` drop down menu at the bottom and enable ```Spot``` under ```VM provisioning model``` in the management tab. Using this option will be helpful if you're running low on credits. +13. Click ```Create```. Your instance should be ready in a minute or two. +14. If your instance failed to create due to the following error - ```Quota 'GPUS_ALL_REGIONS' exceeded. Limit: 0.0 globally.```, type ```quota``` in the search bar then click ```All quotas``` +15. Search for 'GPUS_ALL_REGIONS' in the filters +16. Tick in the box next to Global and then Click ```Edit Quotas``` in the top bar. +17. This will open a box in the right side corner asking for your details. Fill in those and then click Next. +18. Put your New Limit as ```1``` and in the description you can mention you need GPU for machine learning coursework. And then Send Request. +19. You will receive a confirmation email with your Quota Limit increased. This may take some minutes. +20. After the confirmation email, you can recheck the GPU(All Regions) Quota Limit being set to 1. This usually shows up in 10-15 minutes after the confirmation email. +21.
Retry making the VM instance again as before and you should have your instance now. + + +#### Note +Be careful to select 1 x K80 GPU (P100s and P4s are 5x more expensive) + +You only have $50 worth of credit, which should be about 125 hours of GPU usage on a K80. + + +### To login into your instance via terminal: +1. In a DICE terminal window ```conda activate mlp``` +2. Download the `gcloud` toolkit using ```curl -O https://dl.google.com/dl/cloudsdk/channels/rapid/downloads/google-cloud-sdk-365.0.0-linux-x86_64.tar.gz``` +3. Install the `gcloud` toolkit using ```tar zxvf google-cloud-sdk-365.0.0-linux-x86_64.tar.gz; bash google-cloud-sdk/install.sh```. +**Note**: You will be asked to provide a passphrase to generate your local key, simply use a password of your choice. There might be some Yes/No style questions as well, choose yes, when that happens. + +4. Reset your terminal using ```reset; source ~/.bashrc```. Then, to authorize the current machine to access your nodes, run ```gcloud auth login```. This will authenticate your google account login. +3. Follow the prompts to get a token for your current machine. +4. Run ```gcloud config set project PROJECT_ID``` where you replace `PROJECT_ID` with your project ID, you can find that in the projects drop down menu on the top of the Google Compute Engine window; this sets the current project as the active one +5. In your compute engine window, in the line for the instance that you have started (`mlpractical-1`), click on the downward arrow next to ```ssh```. Choose ```View gcloud command```. Copy the command to your terminal and press enter. +6. Add a password for your ssh-key (and remember it!). +7. Re-enter password (which will unlock your ssh-key) when prompted. +8. On your first login, you will be asked if you want to install nvidia drivers, agree and make sure the installation runs well. +9. Run ```nvidia-smi``` to confirm that the GPU can be found. This should report 1 Tesla K80 GPU.
If not, the driver might have failed to install. Logout and retry. +10. Well done, you are now in your instance! When you login you may see an error of the form `Unable to set persistence mode for GPU 00000000:00:04.0: Insufficient Permissions` - you should be able to ignore this. The instance on the first startup should check for the gpu cuda drivers and since they are not there, it will install them. This will only happen once on your first login. Once the installation is finished you are ready to use the instance for your coursework. +11. Clone a fresh mlpractical repository, and checkout branch `coursework2`: + +``` +git clone https://github.com/VICO-UoE/mlpractical.git ~/mlpractical +cd ~/mlpractical +git checkout -b coursework2 origin/mlp2021-22/coursework2 +python setup.py develop +``` + +Then, to test PyTorch running on the GPU, run this script that trains a small convolutional network (7 conv layers + 1 linear layer, 32 filters) on CIFAR100: + +``` +python pytorch_mlp_framework/train_evaluate_image_classification_system.py --batch_size 100 --seed 0 --num_filters 32 --num_stages 3 --num_blocks_per_stage 0 --experiment_name VGG_08_experiment --use_gpu True --num_classes 100 --block_type 'conv_block' --continue_from_epoch -1 +``` + +You should be able to see an experiment running, using the GPU. It should be doing about 26-30 it/s (iterations per second). You can stop it whenever you like using `ctrl-c`. + +If all the above matches what’s stated then you should be ready to run your coursework jobs. + +### Remember to ```stop``` your instance when not using it. You pay for the time you use the machine, not for the computational cycles used. +To stop the instance go to `Compute Engine -> VM instances` on the Google Cloud Platform, select the instance and click ```Stop```. + +#### Future ssh access: +To access the instance in the future simply run the `gcloud` command you copied from the google compute engine instance page.
+ + +## Copying data to and from an instance + +Please look at the [google docs page on copying data](https://cloud.google.com/filestore/docs/copying-data). + +To copy from local machine to a google instance, have a look at this [stackoverflow post](https://stackoverflow.com/questions/27857532/rsync-to-google-compute-engine-instance-from-jenkins). + +## Running experiments over ssh: + +If ssh fails while running an experiment, then the experiment is normally killed. +To avoid this use the command ```screen```. It creates a process of the current session that keeps running whether + a user is signed in or not. + +The basics of using screen is to use ```screen``` to create a new session, then to enter an existing session you use: +```screen -ls``` +To get a list of all available sessions. Then once you find the one you want use: +```screen -d -r screen_id``` +Replacing screen_id with the id of the session you want to enter. + +While in a session, you can use +- ```ctrl+a+esc``` To pause process and be able to scroll +- ```ctrl+a+d``` to detach from session while leaving it running (once you detach you can reattach using ```screen -r```) +- ```ctrl+a+n``` to see the next session. +- ```ctrl+a+c``` to create a new session + diff --git a/notes/pytorch-experiment-framework.md b/notes/pytorch-experiment-framework.md new file mode 100644 index 00000000..b3e5b782 --- /dev/null +++ b/notes/pytorch-experiment-framework.md @@ -0,0 +1,134 @@ +# Pytorch Experiment Framework + +## What does this framework do? +The Pytorch experiment framework located in ```mlp/pytorch_mlp_framework``` includes tooling for building an array of deep neural networks, +including fully connected and convolutional networks. 
In addition, it also includes tooling for experiment running, +metric handling and storage, model weight storage, checkpointing (allowing continuation from previous saved point), as +well as taking care of keeping track of the best validation model which is then used at the end to produce test set evaluation metrics. + +## Why do we need it? +It serves two main purposes. The first is to allow you an easy, worry-free transition into using Pytorch for experiments + in your coursework. The second is to teach you good coding practices for building and running deep learning experiments + using Pytorch. The framework comes fully loaded with tooling that can keep track of relevant metrics, save models, resume from previous saved states and + even automatically choose the best validation model for test set evaluation. We include documentation and comments in almost + every single line of code in the framework, to help you maximize your learning. The code style itself can be used for + learning good programming practices in structuring your code in a modular, readable and computationally efficient manner that minimizes chances of user-error. + +## Installation + +First thing you have to do is activate your conda MLP environment. + +### GPU version on Google Compute Engine +For usage on google cloud, the disk image we provide comes pre-loaded with all the packages you need to run the Pytorch +experiment framework, including Pytorch itself. When you created an instance and set up your environment, everything you need for this framework was installed, thus removing the need for you to install Pytorch. + + + +### CPU version on DICE (or other local machine) + +If you do not have your MLP conda environment installed on your current machine +please follow the instructions in notes/environment-set-up.md.
Once your mlp conda environment is activated, please go to +[Pytorch's installation page](https://pytorch.org/get-started/locally/) and take some time to choose the right Pytorch version for your setup (taking care to choose CPU/GPU version depending on what hardware you have available). + +For example, on DICE you can install the CPU version using the command: +``` +conda install pytorch-cpu torchvision-cpu -c pytorch +``` + +Once Pytorch is installed in your mlp conda environment, you can start using the framework. The framework has been built +to allow you to control your experiment hyperparameters directly from the command line, by using command line argument parsing. + +## Using the framework + +You can get a list of all available hyperparameters and arguments by using: +``` +python pytorch_mlp_framework/train_evaluate_image_classification_system.py -h +``` + +The -h at the end is short for --help, which presents a list with all possible arguments next to a description of what they modify in the setup. +Once you execute that command, you should be able to see the following list: + +``` +Welcome to the MLP course's Pytorch training and inference helper script + +optional arguments: + -h, --help show this help message and exit + --batch_size [BATCH_SIZE] + Batch_size for experiment + --continue_from_epoch [CONTINUE_FROM_EPOCH] + Which epoch to continue from. + If -2, continues from where it left off + If -1, starts from scratch + if >=0, continues from given epoch + --seed [SEED] Seed to use for random number generator for experiment + --image_num_channels [IMAGE_NUM_CHANNELS] + The channel dimensionality of our image-data + --image_height [IMAGE_HEIGHT] + Height of image data + --image_width [IMAGE_WIDTH] + Width of image data + --num_stages [NUM_STAGES] + Number of convolutional stages in the network.
A stage + is considered a sequence of convolutional layers where + the input volume remains the same in the spacial + dimension and is always terminated by a dimensionality + reduction stage + --num_blocks_per_stage [NUM_BLOCKS_PER_STAGE] + Number of convolutional blocks in each stage, not + including the reduction stage. A convolutional block + is made up of two convolutional layers activated using + the leaky-relu non-linearity + --num_filters [NUM_FILTERS] + Number of convolutional filters per convolutional + layer in the network (excluding dimensionality + reduction layers) + --num_epochs [NUM_EPOCHS] + The experiment's epoch budget + --num_classes [NUM_CLASSES] + Number of classes in the dataset + --experiment_name [EXPERIMENT_NAME] + Experiment name - to be used for building the + experiment folder + --use_gpu [USE_GPU] A flag indicating whether we will use GPU acceleration + or not + --weight_decay_coefficient [WEIGHT_DECAY_COEFFICIENT] + Weight decay to use for Adam + --block_type BLOCK_TYPE + Type of convolutional blocks to use in our network + (This argument will be useful in running experiments + to debug your network) + +``` + +For example, to run a simple experiment using a 7-layer convolutional network on the CPU you can run: + +``` +python pytorch_mlp_framework/train_evaluate_image_classification_system.py --batch_size 100 --seed 0 --num_filters 32 --num_stages 3 --num_blocks_per_stage 0 --experiment_name VGG_07 --num_classes 100 --block_type 'conv_block' --weight_decay_coefficient 0.00000 --use_gpu False +``` + +Your experiment should begin running. + +Your experiment's statistics and model weights are saved in the directory VGG_07/ (named after --experiment_name) under VGG_07/result_outputs and +VGG_07/saved_models.
+ + +To run on a GPU on Google Compute Engine the command would be: +``` +python pytorch_mlp_framework/train_evaluate_image_classification_system.py --batch_size 100 --seed 0 --num_filters 32 --num_stages 3 --num_blocks_per_stage 0 --experiment_name VGG_07 --num_classes 100 --block_type 'conv_block' --weight_decay_coefficient 0.00000 --use_gpu True + +``` + +We have also provided the exact scripts we used to run the experiments of VGG07 and VGG37 as shown in the coursework spec inside the files: +- run_vgg_08_default.sh +- run_vgg_38_default.sh + +**However, remember, if you want to reuse those scripts for your own investigations, change the experiment name and seed. +If you do not change the name, the old folders will be overwritten.** + +## So, where can I ask more questions and find more information on Pytorch and what it can do? + +First course of action should be to search the web and then to refer to the Pytorch [documentation](https://pytorch.org/docs/stable/index.html), + [tutorials](https://pytorch.org/tutorials/) and [github](https://github.com/pytorch/pytorch) sites. + + If you still can't get an answer to your question then as always, post on Piazza and/or come to the lab sessions. + diff --git a/notes/quota-issue.md b/notes/quota-issue.md index cd1cb7c7..db09687a 100644 --- a/notes/quota-issue.md +++ b/notes/quota-issue.md @@ -17,13 +17,13 @@ this should clean out the old partially installed packages and reinstall them fr Your homespace can be accessed from any Informatics computer running DICE (e.g. any of the computers in the [Forrest Hill labs](http://web.inf.ed.ac.uk/infweb/student-services/ito/students/year2/student-support/facilities/computer-labs) which are open-access outside of booked lab sessions or for those who know how to use SSH you can [log in remotely](http://computing.help.inf.ed.ac.uk/external-login)). 
You can therefore finish your environment set up prior to the next lab if you want though it is also fine to wait till the beginning of the next lab (it will take around 5 minutes to complete the installation). -At this point assuming you ran through the rest of the instructions to clone the Git repository to your homespace and install the `mlp` package (i.e. the instructions from [here](https://github.com/VICO-UoE/mlpractical/tree/mlp2016-7/lab1/notes/environment-set-up.md#getting-the-course-code-and-a-short-introduction-to-git) on-wards), you should have a fully working environment. +At this point assuming you ran through the rest of the instructions to clone the Git repository to your homespace and install the `mlp` package (i.e. the instructions from [here](https://github.com/CSTR-Edinburgh/mlpractical/blob/mlp2016-7/lab1/environment-set-up.md#getting-the-course-code-and-a-short-introduction-to-git) on-wards), you should have a fully working environment. Once your environment is set up in all future labs you will only need to activate it to get started. So at the beginning of each subsequent lab we will ask you to do something like the following ``` source activate mlp # Activate the mlp environment cd ~/mlpractical # Change the current directory to mlpractical repository -git checkout mlp2022-23/lab[...] # Checkout the branch for this week's lab +git checkout mlp2017-8/lab[...] 
# Checkout the branch for this week's lab jupyter notebook # Launch the notebook server ``` diff --git a/pytorch_mlp_framework/arg_extractor.py b/pytorch_mlp_framework/arg_extractor.py new file mode 100644 index 00000000..039f2554 --- /dev/null +++ b/pytorch_mlp_framework/arg_extractor.py @@ -0,0 +1,53 @@ +import argparse + + +def str2bool(v): + """ + Converts a command line string to a boolean (used as the argparse type of --use_gpu). + :param v: The string to convert; compared case-insensitively against 'yes'/'true'/'t'/'y'/'1' and 'no'/'false'/'f'/'n'/'0' + :return: True or False + :raises argparse.ArgumentTypeError: if v is not one of the accepted spellings + """ + if v.lower() in ('yes', 'true', 't', 'y', '1'): + return True + elif v.lower() in ('no', 'false', 'f', 'n', '0'): + return False + else: + raise argparse.ArgumentTypeError('Boolean value expected.') + + +def get_args(): + """ + Builds the argument parser of the framework, parses the command line, prints the parsed arguments and returns them. + :return: The object returned by parser.parse_args() (an argparse.Namespace), with one attribute per argument + """ + parser = argparse.ArgumentParser( + description='Welcome to the MLP course\'s Pytorch training and inference helper script') + + parser.add_argument('--batch_size', nargs="?", type=int, default=100, help='Batch_size for experiment') + parser.add_argument('--continue_from_epoch', nargs="?", type=int, default=-1, help='Epoch you want to continue training from while restarting an experiment') + parser.add_argument('--seed', nargs="?", type=int, default=7112018, + help='Seed to use for random number generator for experiment') + parser.add_argument('--image_num_channels', nargs="?", type=int, default=3, + help='The channel dimensionality of our image-data') + parser.add_argument('--image_height', nargs="?", type=int, default=32, help='Height of image data') + parser.add_argument('--image_width', nargs="?", type=int, default=32, help='Width of image data') + parser.add_argument('--num_stages', nargs="?", type=int, default=3, + help='Number of convolutional stages in the network.
A stage is considered a sequence of ' + 'convolutional layers where the input volume remains the same in the spacial dimension and' + ' is always terminated by a dimensionality reduction stage') + parser.add_argument('--num_blocks_per_stage', nargs="?", type=int, default=5, + help='Number of convolutional blocks in each stage, not including the reduction stage.' + ' A convolutional block is made up of two convolutional layers activated using the ' + ' leaky-relu non-linearity') + parser.add_argument('--num_filters', nargs="?", type=int, default=16, + help='Number of convolutional filters per convolutional layer in the network (excluding ' + 'dimensionality reduction layers)') + parser.add_argument('--num_epochs', nargs="?", type=int, default=100, help='Total number of epochs for model training') + parser.add_argument('--num_classes', nargs="?", type=int, default=100, help='Number of classes in the dataset') + parser.add_argument('--experiment_name', nargs="?", type=str, default="exp_1", + help='Experiment name - to be used for building the experiment folder') + parser.add_argument('--use_gpu', nargs="?", type=str2bool, default=True, + help='A flag indicating whether we will use GPU acceleration or not') + parser.add_argument('--weight_decay_coefficient', nargs="?", type=float, default=0, + help='Weight decay to use for Adam') + parser.add_argument('--block_type', type=str, default='conv_block', + help='Type of convolutional blocks to use in our network ' + '(This argument will be useful in running experiments to debug your network)') + args = parser.parse_args() + print(args) + return args diff --git a/pytorch_mlp_framework/experiment_builder.py b/pytorch_mlp_framework/experiment_builder.py new file mode 100644 index 00000000..c0b5380c --- /dev/null +++ b/pytorch_mlp_framework/experiment_builder.py @@ -0,0 +1,326 @@ +import torch +import torch.nn as nn +import torch.optim as optim +import torch.nn.functional as F +import tqdm +import os +import numpy as np +import
time + +from pytorch_mlp_framework.storage_utils import save_statistics +from matplotlib import pyplot as plt +import matplotlib +matplotlib.rcParams.update({'font.size': 8}) + +class ExperimentBuilder(nn.Module): + def __init__(self, network_model, experiment_name, num_epochs, train_data, val_data, + test_data, weight_decay_coefficient, use_gpu, continue_from_epoch=-1): + """ + Initializes an ExperimentBuilder object. Such an object takes care of running training and evaluation of a deep net + on a given dataset. It also takes care of saving per epoch models and automatically inferring the best val model + to be used for evaluating the test set metrics. + :param network_model: A pytorch nn.Module which implements a network architecture. + :param experiment_name: The name of the experiment. This is used mainly for keeping track of the experiment and creating the directory structure that will be used to save logs, model parameters and other outputs. + :param num_epochs: Total number of epochs to run the experiment + :param train_data: An object of the DataProvider type. Contains the training set. + :param val_data: An object of the DataProvider type. Contains the val set. + :param test_data: An object of the DataProvider type. Contains the test set. + :param weight_decay_coefficient: A float indicating the weight decay to use with the adam optimizer. + :param use_gpu: A boolean indicating whether to use a GPU or not. + :param continue_from_epoch: An int indicating whether we'll start from scratch (-1), resume from the latest saved model (-2), or reload a previously saved model of epoch 'continue_from_epoch' (>= 0) and continue training from there.
+ """ + super(ExperimentBuilder, self).__init__() + + + self.experiment_name = experiment_name + self.model = network_model + + if torch.cuda.device_count() > 1 and use_gpu: + self.device = torch.cuda.current_device() + self.model.to(self.device) + self.model = nn.DataParallel(module=self.model) + print('Use Multi GPU', self.device) + elif torch.cuda.device_count() == 1 and use_gpu: + self.device = torch.cuda.current_device() + self.model.to(self.device) # sends the model from the cpu to the gpu + print('Use GPU', self.device) + else: + print("use CPU") + self.device = torch.device('cpu') # sets the device to be CPU + print(self.device) + + print('here') + + # nn.DataParallel does not expose the wrapped network's methods, so unwrap it before re-initializing + network = self.model.module if isinstance(self.model, nn.DataParallel) else self.model + network.reset_parameters() # re-initialize network parameters + self.train_data = train_data + self.val_data = val_data + self.test_data = test_data + + print('System learnable parameters') + num_conv_layers = 0 + num_linear_layers = 0 + total_num_parameters = 0 + for name, value in self.named_parameters(): + print(name, value.shape) + if all(item in name for item in ['conv', 'weight']): + num_conv_layers += 1 + if all(item in name for item in ['linear', 'weight']): + num_linear_layers += 1 + total_num_parameters += np.prod(value.shape) + + print('Total number of parameters', total_num_parameters) + print('Total number of conv layers', num_conv_layers) + print('Total number of linear layers', num_linear_layers) + + self.optimizer = optim.Adam(self.parameters(), amsgrad=False, + weight_decay=weight_decay_coefficient) + self.learning_rate_scheduler = optim.lr_scheduler.CosineAnnealingLR(self.optimizer, + T_max=num_epochs, + eta_min=0.00002) + # Generate the directory names + self.experiment_folder = os.path.abspath(experiment_name) + self.experiment_logs = os.path.abspath(os.path.join(self.experiment_folder, "result_outputs")) + self.experiment_saved_models = os.path.abspath(os.path.join(self.experiment_folder, "saved_models")) + + # Set best models to be at 0 since we are just starting +
self.best_val_model_idx = 0 + self.best_val_model_acc = 0. + + # exist_ok=True also repairs an experiment folder whose sub-directories are missing + os.makedirs(self.experiment_folder, exist_ok=True) # create the experiment directory + os.makedirs(self.experiment_logs, exist_ok=True) # create the experiment log directory + os.makedirs(self.experiment_saved_models, exist_ok=True) # create the experiment saved models directory + + self.num_epochs = num_epochs + self.criterion = nn.CrossEntropyLoss().to(self.device) # send the loss computation to the GPU + + if continue_from_epoch == -2: # if continue from epoch is -2 then continue from latest saved model + self.state, self.best_val_model_idx, self.best_val_model_acc = self.load_model( + model_save_dir=self.experiment_saved_models, model_save_name="train_model", + model_idx='latest') # reload existing model from epoch and return best val model index + # and the best val acc of that model + self.starting_epoch = int(self.state['model_epoch']) # NOTE(review): the epoch stored in the checkpoint is run again on resume - confirm this is intended + + elif continue_from_epoch > -1: # if continue from epoch is greater than -1 then + self.state, self.best_val_model_idx, self.best_val_model_acc = self.load_model( + model_save_dir=self.experiment_saved_models, model_save_name="train_model", + model_idx=continue_from_epoch) # reload existing model from epoch and return best val model index + # and the best val acc of that model + self.starting_epoch = continue_from_epoch + else: + self.state = dict() + self.starting_epoch = 0 + + def get_num_parameters(self): + total_num_params = 0 + for param in self.parameters(): + total_num_params += np.prod(param.shape) + + return total_num_params + + + def plot_func_def(self,all_grads, layers): + + + """ + Plot function definition to plot the average gradient with respect to the number of layers in the given model + :param all_grads: Gradients wrt weights for each layer in the model.
+ :param layers: Layer names corresponding to the model parameters + :return: the matplotlib.pyplot module, with the gradient flow plot drawn on the current figure + """ + plt.plot(all_grads, alpha=0.3, color="b") + plt.hlines(0, 0, len(all_grads)+1, linewidth=1, color="k" ) + plt.xticks(range(0,len(all_grads), 1), layers, rotation="vertical") + plt.xlim(xmin=0, xmax=len(all_grads)) + plt.xlabel("Layers") + plt.ylabel("Average Gradient") + plt.title("Gradient flow") + plt.grid(True) + plt.tight_layout() + + return plt + + + def plot_grad_flow(self, named_parameters): + """ + This function is called from run_experiment at the end of every training epoch. + Receives the named parameters of the model being trained. Returns plot of gradient flow for the given model parameters. + + """ + all_grads = [] + layers = [] + + """ + Complete the code in the block below to collect absolute mean of the gradients for each layer in all_grads with the layer names in layers. + """ + ######################################## + + + ######################################## + + + plt = self.plot_func_def(all_grads, layers) + + return plt + + + + + def run_train_iter(self, x, y): + """ + Receives the inputs and targets for the model and runs one training iteration (forward pass, backward pass, learning rate update and optimizer step). Returns loss and accuracy metrics. + :param x: The inputs to the model; converted to float and sent to self.device + :param y: The targets for the model; converted to long and sent to self.device + :return: the loss and accuracy for this batch + """ + + self.train() # sets model to training mode (in case batch normalization or other methods have different procedures for training and evaluation) + x, y = x.float().to(device=self.device), y.long().to( + device=self.device) # send data to device as torch tensors + out = self.model.forward(x) # forward the data in the model + + + loss = F.cross_entropy(input=out, target=y) # compute loss + + self.optimizer.zero_grad() # set all weight grads from previous training iters to 0 + loss.backward() # backpropagate to compute gradients for current iter loss + + self.learning_rate_scheduler.step(epoch=self.current_epoch) + self.optimizer.step() # update network parameters + _, predicted = torch.max(out.data, 1) # get argmax of predictions + accuracy = np.mean(list(predicted.eq(y.data).cpu())) # compute accuracy + return loss.cpu().data.numpy(), accuracy + + def run_evaluation_iter(self, x, y): + """ + Receives
the inputs and targets for the model and runs an evaluation iteration. Returns loss and accuracy metrics. + :param x: The inputs to the model. A torch tensor (it is converted with .float() and sent to self.device), presumably of shape batch_size, channels, height, width + :param y: The targets for the model. A torch tensor of class indices, one per sample (it is converted with .long() and compared element-wise with the argmax of the predictions) + :return: the loss and accuracy for this batch + """ + self.eval() # sets the system to validation mode + x, y = x.float().to(device=self.device), y.long().to( + device=self.device) # convert data to pytorch tensors and send to the computation device + out = self.model.forward(x) # forward the data in the model + + loss = F.cross_entropy(input=out, target=y) # compute loss + + _, predicted = torch.max(out.data, 1) # get argmax of predictions + accuracy = np.mean(list(predicted.eq(y.data).cpu())) # compute accuracy + return loss.cpu().data.numpy(), accuracy + + def save_model(self, model_save_dir, model_save_name, model_idx, best_validation_model_idx, + best_validation_model_acc): + """ + Save the network parameter state and current best val epoch idx and best val accuracy. + :param model_save_name: Name to use to save model without the epoch index + :param model_idx: The index to save the model with. + :param best_validation_model_idx: The index of the best validation model to be stored for future use. + :param best_validation_model_acc: The best validation accuracy to be stored for use at test time. + :param model_save_dir: The directory to store the state at. + The dictionary that gets saved is self.state, updated in place with the three entries above. + + """ + self.state['network'] = self.state_dict() # save network parameter and other variables.
+ self.state['best_val_model_idx'] = best_validation_model_idx # save current best val idx + self.state['best_val_model_acc'] = best_validation_model_acc # save current best val acc + torch.save(self.state, f=os.path.join(model_save_dir, "{}_{}".format(model_save_name, str( + model_idx)))) # save state at prespecified filepath + + def load_model(self, model_save_dir, model_save_name, model_idx): + """ + Load the network parameter state and the best val model idx and best val acc to be compared with the future val accuracies, in order to choose the best val model + :param model_save_dir: The directory to load the state from. + :param model_save_name: Name the model was saved with, without the epoch index + :param model_idx: The index the model was saved with (an epoch index or 'latest'). + :return: the loaded state dictionary, the best val idx and the best val model acc; the network parameters are also loaded into this module + """ + # map_location='cpu' lets a checkpoint written on a GPU machine be restored on a CPU-only one; + # load_state_dict then copies the values into the parameters on whatever device they live + state = torch.load(f=os.path.join(model_save_dir, "{}_{}".format(model_save_name, str(model_idx))), + map_location='cpu') + self.load_state_dict(state_dict=state['network']) + return state, state['best_val_model_idx'], state['best_val_model_acc'] + + def run_experiment(self): + """ + Runs experiment train and evaluation iterations, saving the model and best val model and val model accuracy after each epoch + :return: A tuple (total_losses, test_losses): the per-epoch mean train/val metrics from the starting epoch to num_epochs, and the mean test set metrics of the best validation model.
+ """ + total_losses = {"train_acc": [], "train_loss": [], "val_acc": [], + "val_loss": []} # initialize a dict to keep the per-epoch metrics + for i, epoch_idx in enumerate(range(self.starting_epoch, self.num_epochs)): + epoch_start_time = time.time() + current_epoch_losses = {"train_acc": [], "train_loss": [], "val_acc": [], "val_loss": []} + self.current_epoch = epoch_idx + with tqdm.tqdm(total=len(self.train_data)) as pbar_train: # create a progress bar for training + for idx, (x, y) in enumerate(self.train_data): # get data batches + loss, accuracy = self.run_train_iter(x=x, y=y) # take a training iter step + current_epoch_losses["train_loss"].append(loss) # add current iter loss to the train loss list + current_epoch_losses["train_acc"].append(accuracy) # add current iter acc to the train acc list + pbar_train.update(1) + pbar_train.set_description("loss: {:.4f}, accuracy: {:.4f}".format(loss, accuracy)) + + with tqdm.tqdm(total=len(self.val_data)) as pbar_val: # create a progress bar for validation + for x, y in self.val_data: # get data batches + loss, accuracy = self.run_evaluation_iter(x=x, y=y) # run a validation iter + current_epoch_losses["val_loss"].append(loss) # add current iter loss to val loss list. + current_epoch_losses["val_acc"].append(accuracy) # add current iter acc to val acc list.
+ pbar_val.update(1) # add 1 step to the progress bar + pbar_val.set_description("loss: {:.4f}, accuracy: {:.4f}".format(loss, accuracy)) + val_mean_accuracy = np.mean(current_epoch_losses['val_acc']) + if val_mean_accuracy > self.best_val_model_acc: # if current epoch's mean val acc is greater than the saved best val acc then + self.best_val_model_acc = val_mean_accuracy # set the best val model acc to be current epoch's val accuracy + self.best_val_model_idx = epoch_idx # set the experiment-wise best val idx to be the current epoch's idx + + for key, value in current_epoch_losses.items(): + total_losses[key].append(np.mean( + value)) # get mean of all metrics of current epoch metrics dict, to get them ready for storage and output on the terminal. + + save_statistics(experiment_log_dir=self.experiment_logs, filename='summary.csv', + stats_dict=total_losses, current_epoch=i, + continue_from_mode=True if (self.starting_epoch != 0 or i > 0) else False) # save statistics to stats file. + + # load_statistics(experiment_log_dir=self.experiment_logs, filename='summary.csv') # How to load a csv file if you need to + + out_string = "_".join( + ["{}_{:.4f}".format(key, np.mean(value)) for key, value in current_epoch_losses.items()]) + # create a string to use to report our epoch metrics + epoch_elapsed_time = time.time() - epoch_start_time # calculate time taken for epoch + epoch_elapsed_time = "{:.4f}".format(epoch_elapsed_time) + print("Epoch {}:".format(epoch_idx), out_string, "epoch time", epoch_elapsed_time, "seconds") + self.state['model_epoch'] = epoch_idx + self.save_model(model_save_dir=self.experiment_saved_models, + # save model and best val idx and best val acc, using the model dir, model name and model idx + model_save_name="train_model", model_idx=epoch_idx, + best_validation_model_idx=self.best_val_model_idx, + best_validation_model_acc=self.best_val_model_acc) + self.save_model(model_save_dir=self.experiment_saved_models, + # save model and best val idx and
best val acc, using the model dir, model name and model idx + model_save_name="train_model", model_idx='latest', + best_validation_model_idx=self.best_val_model_idx, + best_validation_model_acc=self.best_val_model_acc) + + ################################################################ + ##### Plot Gradient Flow at each Epoch during Training ###### + print("Generating Gradient Flow Plot at epoch {}".format(epoch_idx)) + plt = self.plot_grad_flow(self.model.named_parameters()) + if not os.path.exists(os.path.join(self.experiment_saved_models, 'gradient_flow_plots')): + os.mkdir(os.path.join(self.experiment_saved_models, 'gradient_flow_plots')) + # plt.legend(loc="best") + plt.savefig(os.path.join(self.experiment_saved_models, 'gradient_flow_plots', "epoch{}.pdf".format(str(epoch_idx)))) + ################################################################ + + print("Generating test set evaluation metrics") + self.load_model(model_save_dir=self.experiment_saved_models, model_idx=self.best_val_model_idx, + # load best validation model + model_save_name="train_model") + current_epoch_losses = {"test_acc": [], "test_loss": []} # initialize a statistics dict + with tqdm.tqdm(total=len(self.test_data)) as pbar_test: # initialize a progress bar + for x, y in self.test_data: # sample batch + loss, accuracy = self.run_evaluation_iter(x=x, + y=y) # compute loss and accuracy by running an evaluation step + current_epoch_losses["test_loss"].append(loss) # save test loss + current_epoch_losses["test_acc"].append(accuracy) # save test accuracy + pbar_test.update(1) # update progress bar status + pbar_test.set_description( + "loss: {:.4f}, accuracy: {:.4f}".format(loss, accuracy)) # update progress bar string output + + test_losses = {key: [np.mean(value)] for key, value in + current_epoch_losses.items()} # save test set metrics in dict format + save_statistics(experiment_log_dir=self.experiment_logs, filename='test_summary.csv', + # save test set metrics on disk in .csv format +
stats_dict=test_losses, current_epoch=0, continue_from_mode=False) + + return total_losses, test_losses diff --git a/pytorch_mlp_framework/model_architectures.py b/pytorch_mlp_framework/model_architectures.py new file mode 100644 index 00000000..cfa991de --- /dev/null +++ b/pytorch_mlp_framework/model_architectures.py @@ -0,0 +1,340 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class FCCNetwork(nn.Module): + def __init__(self, input_shape, num_output_classes, num_filters, num_layers, use_bias=False): + """ + Initializes a fully connected network similar to the ones implemented previously in the MLP package. + :param input_shape: The shape of the inputs going in to the network. + :param num_output_classes: The number of outputs the network should have (for classification those would be the number of classes) + :param num_filters: Number of filters used in every fcc layer. + :param num_layers: Number of fcc layers (excluding dim reduction stages) + :param use_bias: Whether our fcc layers will use a bias.
+ """ + super(FCCNetwork, self).__init__() + # set up class attributes useful in building the network and inference + self.input_shape = input_shape + self.num_filters = num_filters + self.num_output_classes = num_output_classes + self.use_bias = use_bias + self.num_layers = num_layers + # initialize a module dict, which is effectively a dictionary that can collect layers and integrate them into pytorch + self.layer_dict = nn.ModuleDict() + # build the network + self.build_module() + + def build_module(self): + print("Building basic block of FCCNetwork using input shape", self.input_shape) + x = torch.zeros((self.input_shape)) + + out = x + out = out.view(out.shape[0], -1) + # flatten inputs to shape (b, -1) where -1 is the dim resulting from multiplying the + # shapes of all dimensions after the 0th dim + + for i in range(self.num_layers): + self.layer_dict['fcc_{}'.format(i)] = nn.Linear(in_features=out.shape[1], # initialize a fcc layer + out_features=self.num_filters, + bias=self.use_bias) + + out = self.layer_dict['fcc_{}'.format(i)](out) # apply ith fcc layer to the previous layers outputs + out = F.relu(out) # apply a ReLU on the outputs + + self.logits_linear_layer = nn.Linear(in_features=out.shape[1], # initialize the prediction output linear layer + out_features=self.num_output_classes, + bias=self.use_bias) + out = self.logits_linear_layer(out) # apply the layer to the previous layer's outputs + print("Block is built, output volume is", out.shape) + return out + + def forward(self, x): + """ + Forward prop data through the network and return the preds + :param x: Input batch of shape (b, ...); every sample is flattened to a vector before the first layer.
+ :return: preds of shape (b, num_classes) + """ + out = x + out = out.view(out.shape[0], -1) + # flatten inputs to shape (b, -1) where -1 is the dim resulting from multiplying the + # shapes of all dimensions after the 0th dim + + for i in range(self.num_layers): + out = self.layer_dict['fcc_{}'.format(i)](out) # apply ith fcc layer to the previous layers outputs + out = F.relu(out) # apply a ReLU on the outputs + + out = self.logits_linear_layer(out) # apply the layer to the previous layer's outputs + return out + + def reset_parameters(self): + """ + Re-initializes the network's parameters + """ + for item in self.layer_dict.children(): + item.reset_parameters() + + self.logits_linear_layer.reset_parameters() + + +class EmptyBlock(nn.Module): + """ + Placeholder block: stores the usual block hyper-parameters but only applies nn.Identity to its input. + """ + def __init__(self, input_shape=None, num_filters=None, kernel_size=None, padding=None, bias=None, dilation=None, + reduction_factor=None): + super(EmptyBlock, self).__init__() + + self.num_filters = num_filters + self.kernel_size = kernel_size + self.input_shape = input_shape + self.padding = padding + self.bias = bias + self.dilation = dilation + + self.build_module() + + def build_module(self): + self.layer_dict = nn.ModuleDict() + # no dummy tensor is built here: the identity layer has no shape-dependent parameters, and the default input_shape=None cannot be turned into a tensor + self.layer_dict['Identity'] = nn.Identity() + + def forward(self, x): + out = x + + out = self.layer_dict['Identity'].forward(out) + + return out + + +class EntryConvolutionalBlock(nn.Module): + """ + Input block of the network: a stride-1 convolution followed by batch normalization and a leaky-relu. + """ + def __init__(self, input_shape, num_filters, kernel_size, padding, bias, dilation): + super(EntryConvolutionalBlock, self).__init__() + + self.num_filters = num_filters + self.kernel_size = kernel_size + self.input_shape = input_shape + self.padding = padding + self.bias = bias + self.dilation = dilation + + self.build_module() + + def build_module(self): + self.layer_dict = nn.ModuleDict() + x = torch.zeros(self.input_shape) + out = x + + self.layer_dict['conv_0'] = nn.Conv2d(in_channels=out.shape[1], out_channels=self.num_filters, bias=self.bias, +
kernel_size=self.kernel_size, dilation=self.dilation, + padding=self.padding, stride=1) + + out = self.layer_dict['conv_0'].forward(out) + self.layer_dict['bn_0'] = nn.BatchNorm2d(num_features=out.shape[1]) + out = F.leaky_relu(self.layer_dict['bn_0'].forward(out)) + + print(out.shape) + + def forward(self, x): + out = x + + out = self.layer_dict['conv_0'].forward(out) + out = F.leaky_relu(self.layer_dict['bn_0'].forward(out)) + + return out + + +class ConvolutionalProcessingBlock(nn.Module): + """ + Processing block: two stride-1 convolutions with num_filters output channels each, both followed by a leaky-relu. + The layers are created in build_module by pushing a zero tensor of shape input_shape through the block. + """ + def __init__(self, input_shape, num_filters, kernel_size, padding, bias, dilation): + super(ConvolutionalProcessingBlock, self).__init__() + + self.num_filters = num_filters + self.kernel_size = kernel_size + self.input_shape = input_shape + self.padding = padding + self.bias = bias + self.dilation = dilation + + self.build_module() + + def build_module(self): + self.layer_dict = nn.ModuleDict() + x = torch.zeros(self.input_shape) + out = x + + self.layer_dict['conv_0'] = nn.Conv2d(in_channels=out.shape[1], out_channels=self.num_filters, bias=self.bias, + kernel_size=self.kernel_size, dilation=self.dilation, + padding=self.padding, stride=1) + + out = self.layer_dict['conv_0'].forward(out) + out = F.leaky_relu(out) + + self.layer_dict['conv_1'] = nn.Conv2d(in_channels=out.shape[1], out_channels=self.num_filters, bias=self.bias, + kernel_size=self.kernel_size, dilation=self.dilation, + padding=self.padding, stride=1) + + out = self.layer_dict['conv_1'].forward(out) + out = F.leaky_relu(out) + + print(out.shape) + + def forward(self, x): + out = x + + out = self.layer_dict['conv_0'].forward(out) + out = F.leaky_relu(out) + + out = self.layer_dict['conv_1'].forward(out) + out = F.leaky_relu(out) + + return out + + +class ConvolutionalDimensionalityReductionBlock(nn.Module): + """ + Dimensionality reduction block: convolution + leaky-relu, average pooling with kernel size reduction_factor, then a second convolution + leaky-relu. + """ + def __init__(self, input_shape, num_filters, kernel_size, padding, bias, dilation, reduction_factor): + super(ConvolutionalDimensionalityReductionBlock, self).__init__() + + self.num_filters = num_filters
+ self.kernel_size = kernel_size + self.input_shape = input_shape + self.padding = padding + self.bias = bias + self.dilation = dilation + self.reduction_factor = reduction_factor + self.build_module() + + def build_module(self): + self.layer_dict = nn.ModuleDict() + x = torch.zeros(self.input_shape) + out = x + + self.layer_dict['conv_0'] = nn.Conv2d(in_channels=out.shape[1], out_channels=self.num_filters, bias=self.bias, + kernel_size=self.kernel_size, dilation=self.dilation, + padding=self.padding, stride=1) + + out = self.layer_dict['conv_0'].forward(out) + out = F.leaky_relu(out) + + out = F.avg_pool2d(out, self.reduction_factor) + + self.layer_dict['conv_1'] = nn.Conv2d(in_channels=out.shape[1], out_channels=self.num_filters, bias=self.bias, + kernel_size=self.kernel_size, dilation=self.dilation, + padding=self.padding, stride=1) + + out = self.layer_dict['conv_1'].forward(out) + out = F.leaky_relu(out) + + print(out.shape) + + def forward(self, x): + out = x + + out = self.layer_dict['conv_0'].forward(out) + out = F.leaky_relu(out) + + out = F.avg_pool2d(out, self.reduction_factor) + + out = self.layer_dict['conv_1'].forward(out) + out = F.leaky_relu(out) + + return out + + +class ConvolutionalNetwork(nn.Module): + def __init__(self, input_shape, num_output_classes, num_filters, + num_blocks_per_stage, num_stages, use_bias=False, processing_block_type=ConvolutionalProcessingBlock, + dimensionality_reduction_block_type=ConvolutionalDimensionalityReductionBlock): + """ + Initializes a convolutional network module + :param input_shape: The shape of the tensor to be passed into this network + :param num_output_classes: Number of output classes + :param num_filters: Number of filters per convolutional layer + :param num_blocks_per_stage: Number of blocks per "stage". Each block is composed of 2 convolutional layers. + :param num_stages: Number of stages in a network.
A stage is defined as a sequence of layers within which the + data dimensionality remains constant in the spatial axes (h, w) and can change in the channel axis. After each stage + there exists a dimensionality reduction stage, composed of two convolutional layers and an avg pooling layer. + :param use_bias: Whether to use biases in our convolutional layers + :param processing_block_type: Type of processing block to use within our stages + :param dimensionality_reduction_block_type: Type of dimensionality reduction block to use after each stage in our network + """ + super(ConvolutionalNetwork, self).__init__() + # set up class attributes useful in building the network and inference + self.input_shape = input_shape + self.num_filters = num_filters + self.num_output_classes = num_output_classes + self.use_bias = use_bias + self.num_blocks_per_stage = num_blocks_per_stage + self.num_stages = num_stages + self.processing_block_type = processing_block_type + self.dimensionality_reduction_block_type = dimensionality_reduction_block_type + + # build the network + self.build_module() + + def build_module(self): + """ + Builds the network, inferring each layer's input shape by passing a dummy zero tensor of shape input_shape through it.
+ """ + self.layer_dict = nn.ModuleDict() + # initialize a module dict, which is effectively a dictionary that can collect layers and integrate them into pytorch + print("Building basic block of ConvolutionalNetwork using input shape", self.input_shape) + x = torch.zeros((self.input_shape)) # create dummy inputs to be used to infer shapes of layers + + out = x + self.layer_dict['input_conv'] = EntryConvolutionalBlock(input_shape=out.shape, num_filters=self.num_filters, + kernel_size=3, padding=1, bias=self.use_bias, + dilation=1) + out = self.layer_dict['input_conv'].forward(out) + # torch.nn.Conv2d(in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, bias=True) + for i in range(self.num_stages): # for each stage + for j in range(self.num_blocks_per_stage): + self.layer_dict['block_{}_{}'.format(i, j)] = self.processing_block_type(input_shape=out.shape, + num_filters=self.num_filters, + bias=self.use_bias, + kernel_size=3, dilation=1, + padding=1) + out = self.layer_dict['block_{}_{}'.format(i, j)].forward(out) + self.layer_dict['reduction_block_{}'.format(i)] = self.dimensionality_reduction_block_type( + input_shape=out.shape, + num_filters=self.num_filters, bias=True, # NOTE(review): hard-coded; self.use_bias is not applied to reduction blocks - confirm intended + kernel_size=3, dilation=1, + padding=1, + reduction_factor=2) + out = self.layer_dict['reduction_block_{}'.format(i)].forward(out) + + out = F.avg_pool2d(out, out.shape[-1]) + print('shape before final linear layer', out.shape) + out = out.view(out.shape[0], -1) + self.logit_linear_layer = nn.Linear(in_features=out.shape[1], # add a linear layer + out_features=self.num_output_classes, + bias=True) + out = self.logit_linear_layer(out) # apply linear layer on flattened inputs + print("Block is built, output volume is", out.shape) + return out + + def forward(self, x): + """ + Forward propagates an input batch through the network + :param x: Inputs x (b, c, h, w) + :return: preds (b, num_classes) + """ + out = x + out = self.layer_dict['input_conv'].forward(out)
+ for i in range(self.num_stages): # for each stage + for j in range(self.num_blocks_per_stage): + out = self.layer_dict['block_{}_{}'.format(i, j)].forward(out) + out = self.layer_dict['reduction_block_{}'.format(i)].forward(out) + + out = F.avg_pool2d(out, out.shape[-1]) + out = out.view(out.shape[0], -1) # flatten outputs from (b, c, h, w) to (b, c*h*w) + out = self.logit_linear_layer(out) # pass through a linear layer to get logits/preds + return out + + def reset_parameters(self): + """ + Re-initialize the parameters of every sub-module in layer_dict that defines reset_parameters(), then the final linear layer. + """ + for item in self.layer_dict.modules(): # modules() recurses into the blocks; children() only yields the wrapper blocks, which define no reset_parameters + try: + item.reset_parameters() + except AttributeError: # containers and wrapper blocks have nothing to reset; any other error should surface + pass + + self.logit_linear_layer.reset_parameters() diff --git a/pytorch_mlp_framework/storage_utils.py b/pytorch_mlp_framework/storage_utils.py new file mode 100644 index 00000000..33fafdc3 --- /dev/null +++ b/pytorch_mlp_framework/storage_utils.py @@ -0,0 +1,70 @@ +import pickle +import os +import csv + + +def save_to_stats_pkl_file(experiment_log_filepath, filename, stats_dict): + summary_filename = os.path.join(experiment_log_filepath, filename) + with open("{}.pkl".format(summary_filename), "wb") as file_writer: + pickle.dump(stats_dict, file_writer) + + +def load_from_stats_pkl_file(experiment_log_filepath, filename): + summary_filename = os.path.join(experiment_log_filepath, filename) + with open("{}.pkl".format(summary_filename), "rb") as file_reader: + stats = pickle.load(file_reader) # NOTE: unpickling can execute arbitrary code; only load files written by save_to_stats_pkl_file + + return stats + + +def save_statistics(experiment_log_dir, filename, stats_dict, current_epoch, continue_from_mode=False, save_full_dict=False): + """ + Saves the statistics in stats dict into a csv file.
Using the keys as the header entries and the values as the + columns of a particular header entry + :param experiment_log_dir: the log folder dir filepath + :param filename: the name of the csv file + :param stats_dict: the stats dict containing the data to be saved + :param current_epoch: the number of epochs since commencement of the current training session (e.g. if the experiment continued from 100 and this is epoch 105, then pass relative distance of 5.) + :param save_full_dict: whether to save the full dict as is overriding any previous entries (might be useful if we want to overwrite a file) + :return: The filepath to the summary file + """ + summary_filename = os.path.join(experiment_log_dir, filename) + mode = 'a' if continue_from_mode else 'w' # continue_from_mode: append rows to the existing file; otherwise start a fresh file + with open(summary_filename, mode, newline='') as f: # newline='' lets the csv module control line endings (no blank rows on Windows) + writer = csv.writer(f) + if not continue_from_mode: + writer.writerow(list(stats_dict.keys())) # header row, written only for a fresh file + + if save_full_dict: + total_rows = len(list(stats_dict.values())[0]) + for idx in range(total_rows): + row_to_add = [value[idx] for value in list(stats_dict.values())] + writer.writerow(row_to_add) + else: + row_to_add = [value[current_epoch] for value in list(stats_dict.values())] + writer.writerow(row_to_add) + + return summary_filename + + +def load_statistics(experiment_log_dir, filename): + """ + Loads a statistics csv file into a dictionary + :param experiment_log_dir: the log folder dir filepath + :param filename: the name of the csv file to load + :return: A dictionary containing the stats in the csv file. Header entries are converted into keys and columns of a + particular header are converted into values of a key in a list format.
+ """ + summary_filename = os.path.join(experiment_log_dir, filename) + + with open(summary_filename, 'r') as f: # read-only: loading must not require write permission + lines = f.read().splitlines() # drop line terminators so the last key/value carries no trailing newline + + keys = lines[0].split(",") + stats = {key: [] for key in keys} + for line in lines[1:]: + values = line.split(",") + for idx, value in enumerate(values): + stats[keys[idx]].append(value) # values are kept as the raw strings read from the file + + return stats diff --git a/pytorch_mlp_framework/train_evaluate_image_classification_system.py b/pytorch_mlp_framework/train_evaluate_image_classification_system.py new file mode 100644 index 00000000..a8b49957 --- /dev/null +++ b/pytorch_mlp_framework/train_evaluate_image_classification_system.py @@ -0,0 +1,68 @@ +import numpy as np +import torch +from torch.utils.data import DataLoader +from torchvision import transforms + +import mlp.data_providers as data_providers +from pytorch_mlp_framework.arg_extractor import get_args +from pytorch_mlp_framework.experiment_builder import ExperimentBuilder +from pytorch_mlp_framework.model_architectures import * +import os +# os.environ["CUDA_VISIBLE_DEVICES"]="0" + +args = get_args() # get arguments from command line +rng = np.random.RandomState(seed=args.seed) # set the seeds for the experiment +torch.manual_seed(seed=args.seed) # sets pytorch's seed + +# set up data augmentation transforms for training and testing +transform_train = transforms.Compose([ + transforms.RandomCrop(32, padding=4), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), + ]) + +transform_test = transforms.Compose([ + transforms.ToTensor(), + transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), +]) + +train_data = data_providers.CIFAR100(root='data', set_name='train', + transform=transform_train, + download=True) # training split, with the augmenting transform +val_data = data_providers.CIFAR100(root='data', set_name='val', + transform=transform_test, + download=True) # validation split, with the non-augmenting transform
+test_data = data_providers.CIFAR100(root='data', set_name='test', + transform=transform_test, + download=True) # test split, with the non-augmenting transform + +train_data_loader = DataLoader(train_data, batch_size=args.batch_size, shuffle=True, num_workers=4) +val_data_loader = DataLoader(val_data, batch_size=args.batch_size, shuffle=False, num_workers=4) # evaluation does not need shuffling +test_data_loader = DataLoader(test_data, batch_size=args.batch_size, shuffle=False, num_workers=4) # keeps evaluation batches deterministic + +if args.block_type == 'conv_block': + processing_block_type = ConvolutionalProcessingBlock + dim_reduction_block_type = ConvolutionalDimensionalityReductionBlock +elif args.block_type == 'empty_block': + processing_block_type = EmptyBlock + dim_reduction_block_type = EmptyBlock +else: + raise ValueError("Unknown block_type: {}".format(args.block_type)) + +custom_conv_net = ConvolutionalNetwork( # initialize our network object, in this case a ConvNet + input_shape=(args.batch_size, args.image_num_channels, args.image_height, args.image_width), + num_output_classes=args.num_classes, num_filters=args.num_filters, use_bias=False, + num_blocks_per_stage=args.num_blocks_per_stage, num_stages=args.num_stages, + processing_block_type=processing_block_type, + dimensionality_reduction_block_type=dim_reduction_block_type) + +conv_experiment = ExperimentBuilder(network_model=custom_conv_net, + experiment_name=args.experiment_name, + num_epochs=args.num_epochs, + weight_decay_coefficient=args.weight_decay_coefficient, + use_gpu=args.use_gpu, + continue_from_epoch=args.continue_from_epoch, + train_data=train_data_loader, val_data=val_data_loader, + test_data=test_data_loader) # build an experiment object +experiment_metrics, test_metrics = conv_experiment.run_experiment() # run experiment and return experiment metrics diff --git a/report/README.txt b/report/README.txt new file mode 100644 index 00000000..8d6cff60 --- /dev/null +++ b/report/README.txt @@ -0,0 +1 @@ +Most reasonable LaTeX distributions should have no problem building the document from what is in the
provided LaTeX source directory. However, certain LaTeX distributions are missing certain files, and they are included in this directory. If you get an error message when you build the LaTeX document saying one of these files is missing, then move the relevant file into your LaTeX source directory. diff --git a/report/additional-latex-files/README.txt b/report/additional-latex-files/README.txt new file mode 100644 index 00000000..8d6cff60 --- /dev/null +++ b/report/additional-latex-files/README.txt @@ -0,0 +1 @@ +Most reasonable LaTeX distributions should have no problem building the document from what is in the provided LaTeX source directory. However, certain LaTeX distributions are missing certain files, and they are included in this directory. If you get an error message when you build the LaTeX document saying one of these files is missing, then move the relevant file into your LaTeX source directory. diff --git a/report/additional-latex-files/algorithm.sty b/report/additional-latex-files/algorithm.sty new file mode 100644 index 00000000..843e3d5b --- /dev/null +++ b/report/additional-latex-files/algorithm.sty @@ -0,0 +1,79 @@ +% ALGORITHM STYLE -- Released 8 April 1996 +% for LaTeX-2e +% Copyright -- 1994 Peter Williams +% E-mail Peter.Williams@dsto.defence.gov.au +\NeedsTeXFormat{LaTeX2e} +\ProvidesPackage{algorithm} +\typeout{Document Style `algorithm' - floating environment} + +\RequirePackage{float} +\RequirePackage{ifthen} +\newcommand{\ALG@within}{nothing} +\newboolean{ALG@within} +\setboolean{ALG@within}{false} +\newcommand{\ALG@floatstyle}{ruled} +\newcommand{\ALG@name}{Algorithm} +\newcommand{\listalgorithmname}{List of \ALG@name s} + +% Declare Options +% first appearance +\DeclareOption{plain}{ + \renewcommand{\ALG@floatstyle}{plain} +} +\DeclareOption{ruled}{ + \renewcommand{\ALG@floatstyle}{ruled} +} +\DeclareOption{boxed}{ + \renewcommand{\ALG@floatstyle}{boxed} +} +% then numbering convention +\DeclareOption{part}{ +
\renewcommand{\ALG@within}{part} + \setboolean{ALG@within}{true} +} +\DeclareOption{chapter}{ + \renewcommand{\ALG@within}{chapter} + \setboolean{ALG@within}{true} +} +\DeclareOption{section}{ + \renewcommand{\ALG@within}{section} + \setboolean{ALG@within}{true} +} +\DeclareOption{subsection}{ + \renewcommand{\ALG@within}{subsection} + \setboolean{ALG@within}{true} +} +\DeclareOption{subsubsection}{ + \renewcommand{\ALG@within}{subsubsection} + \setboolean{ALG@within}{true} +} +\DeclareOption{nothing}{ + \renewcommand{\ALG@within}{nothing} + \setboolean{ALG@within}{true} +} +\DeclareOption*{\edef\ALG@name{\CurrentOption}} + +% ALGORITHM +% +\ProcessOptions +\floatstyle{\ALG@floatstyle} +\ifthenelse{\boolean{ALG@within}}{ + \ifthenelse{\equal{\ALG@within}{part}} + {\newfloat{algorithm}{htbp}{loa}[part]}{} + \ifthenelse{\equal{\ALG@within}{chapter}} + {\newfloat{algorithm}{htbp}{loa}[chapter]}{} + \ifthenelse{\equal{\ALG@within}{section}} + {\newfloat{algorithm}{htbp}{loa}[section]}{} + \ifthenelse{\equal{\ALG@within}{subsection}} + {\newfloat{algorithm}{htbp}{loa}[subsection]}{} + \ifthenelse{\equal{\ALG@within}{subsubsection}} + {\newfloat{algorithm}{htbp}{loa}[subsubsection]}{} + \ifthenelse{\equal{\ALG@within}{nothing}} + {\newfloat{algorithm}{htbp}{loa}}{} +}{ + \newfloat{algorithm}{htbp}{loa} +} +\floatname{algorithm}{\ALG@name} + +\newcommand{\listofalgorithms}{\listof{algorithm}{\listalgorithmname}} + diff --git a/report/additional-latex-files/algorithmic.sty b/report/additional-latex-files/algorithmic.sty new file mode 100644 index 00000000..ad614783 --- /dev/null +++ b/report/additional-latex-files/algorithmic.sty @@ -0,0 +1,201 @@ +% ALGORITHMIC STYLE -- Released 8 APRIL 1996 +% for LaTeX version 2e +% Copyright -- 1994 Peter Williams +% E-mail PeterWilliams@dsto.defence.gov.au +% +% Modified by Alex Smola (08/2000) +% E-mail Alex.Smola@anu.edu.au +% +\NeedsTeXFormat{LaTeX2e} +\ProvidesPackage{algorithmic} +\typeout{Document Style `algorithmic' - 
environment} +% +\RequirePackage{ifthen} +\RequirePackage{calc} +\newboolean{ALC@noend} +\setboolean{ALC@noend}{false} +\newcounter{ALC@line} +\newcounter{ALC@rem} +\newlength{\ALC@tlm} +% +\DeclareOption{noend}{\setboolean{ALC@noend}{true}} +% +\ProcessOptions +% +% ALGORITHMIC +\newcommand{\algorithmicrequire}{\textbf{Require:}} +\newcommand{\algorithmicensure}{\textbf{Ensure:}} +\newcommand{\algorithmiccomment}[1]{\{#1\}} +\newcommand{\algorithmicend}{\textbf{end}} +\newcommand{\algorithmicif}{\textbf{if}} +\newcommand{\algorithmicthen}{\textbf{then}} +\newcommand{\algorithmicelse}{\textbf{else}} +\newcommand{\algorithmicelsif}{\algorithmicelse\ \algorithmicif} +\newcommand{\algorithmicendif}{\algorithmicend\ \algorithmicif} +\newcommand{\algorithmicfor}{\textbf{for}} +\newcommand{\algorithmicforall}{\textbf{for all}} +\newcommand{\algorithmicdo}{\textbf{do}} +\newcommand{\algorithmicendfor}{\algorithmicend\ \algorithmicfor} +\newcommand{\algorithmicwhile}{\textbf{while}} +\newcommand{\algorithmicendwhile}{\algorithmicend\ \algorithmicwhile} +\newcommand{\algorithmicloop}{\textbf{loop}} +\newcommand{\algorithmicendloop}{\algorithmicend\ \algorithmicloop} +\newcommand{\algorithmicrepeat}{\textbf{repeat}} +\newcommand{\algorithmicuntil}{\textbf{until}} + +%changed by alex smola +\newcommand{\algorithmicinput}{\textbf{input}} +\newcommand{\algorithmicoutput}{\textbf{output}} +\newcommand{\algorithmicset}{\textbf{set}} +\newcommand{\algorithmictrue}{\textbf{true}} +\newcommand{\algorithmicfalse}{\textbf{false}} +\newcommand{\algorithmicand}{\textbf{and\ }} +\newcommand{\algorithmicor}{\textbf{or\ }} +\newcommand{\algorithmicfunction}{\textbf{function}} +\newcommand{\algorithmicendfunction}{\algorithmicend\ \algorithmicfunction} +\newcommand{\algorithmicmain}{\textbf{main}} +\newcommand{\algorithmicendmain}{\algorithmicend\ \algorithmicmain} +%end changed by alex smola + +\def\ALC@item[#1]{% +\if@noparitem \@donoparitem + \else \if@inlabel \indent \par \fi + \ifhmode 
\unskip\unskip \par \fi + \if@newlist \if@nobreak \@nbitem \else + \addpenalty\@beginparpenalty + \addvspace\@topsep \addvspace{-\parskip}\fi + \else \addpenalty\@itempenalty \addvspace\itemsep + \fi + \global\@inlabeltrue +\fi +\everypar{\global\@minipagefalse\global\@newlistfalse + \if@inlabel\global\@inlabelfalse \hskip -\parindent \box\@labels + \penalty\z@ \fi + \everypar{}}\global\@nobreakfalse +\if@noitemarg \@noitemargfalse \if@nmbrlist \refstepcounter{\@listctr}\fi \fi +\sbox\@tempboxa{\makelabel{#1}}% +\global\setbox\@labels + \hbox{\unhbox\@labels \hskip \itemindent + \hskip -\labelwidth \hskip -\ALC@tlm + \ifdim \wd\@tempboxa >\labelwidth + \box\@tempboxa + \else \hbox to\labelwidth {\unhbox\@tempboxa}\fi + \hskip \ALC@tlm}\ignorespaces} +% +\newenvironment{algorithmic}[1][0]{ +\let\@item\ALC@item + \newcommand{\ALC@lno}{% +\ifthenelse{\equal{\arabic{ALC@rem}}{0}} +{{\footnotesize \arabic{ALC@line}:}}{}% +} +\let\@listii\@listi +\let\@listiii\@listi +\let\@listiv\@listi +\let\@listv\@listi +\let\@listvi\@listi +\let\@listvii\@listi + \newenvironment{ALC@g}{ + \begin{list}{\ALC@lno}{ \itemsep\z@ \itemindent\z@ + \listparindent\z@ \rightmargin\z@ + \topsep\z@ \partopsep\z@ \parskip\z@\parsep\z@ + \leftmargin 1em + \addtolength{\ALC@tlm}{\leftmargin} + } + } + {\end{list}} + \newcommand{\ALC@it}{\addtocounter{ALC@line}{1}\addtocounter{ALC@rem}{1}\ifthenelse{\equal{\arabic{ALC@rem}}{#1}}{\setcounter{ALC@rem}{0}}{}\item} + \newcommand{\ALC@com}[1]{\ifthenelse{\equal{##1}{default}}% +{}{\ \algorithmiccomment{##1}}} + \newcommand{\REQUIRE}{\item[\algorithmicrequire]} + \newcommand{\ENSURE}{\item[\algorithmicensure]} + \newcommand{\STATE}{\ALC@it} + \newcommand{\COMMENT}[1]{\algorithmiccomment{##1}} +%changes by alex smola + \newcommand{\INPUT}{\item[\algorithmicinput]} + \newcommand{\OUTPUT}{\item[\algorithmicoutput]} + \newcommand{\SET}{\item[\algorithmicset]} +% \newcommand{\TRUE}{\algorithmictrue} +% \newcommand{\FALSE}{\algorithmicfalse} + 
\newcommand{\AND}{\algorithmicand} + \newcommand{\OR}{\algorithmicor} + \newenvironment{ALC@func}{\begin{ALC@g}}{\end{ALC@g}} + \newenvironment{ALC@main}{\begin{ALC@g}}{\end{ALC@g}} +%end changes by alex smola + \newenvironment{ALC@if}{\begin{ALC@g}}{\end{ALC@g}} + \newenvironment{ALC@for}{\begin{ALC@g}}{\end{ALC@g}} + \newenvironment{ALC@whl}{\begin{ALC@g}}{\end{ALC@g}} + \newenvironment{ALC@loop}{\begin{ALC@g}}{\end{ALC@g}} + \newenvironment{ALC@rpt}{\begin{ALC@g}}{\end{ALC@g}} + \renewcommand{\\}{\@centercr} + \newcommand{\IF}[2][default]{\ALC@it\algorithmicif\ ##2\ \algorithmicthen% +\ALC@com{##1}\begin{ALC@if}} + \newcommand{\SHORTIF}[2]{\ALC@it\algorithmicif\ ##1\ + \algorithmicthen\ {##2}} + \newcommand{\ELSE}[1][default]{\end{ALC@if}\ALC@it\algorithmicelse% +\ALC@com{##1}\begin{ALC@if}} + \newcommand{\ELSIF}[2][default]% +{\end{ALC@if}\ALC@it\algorithmicelsif\ ##2\ \algorithmicthen% +\ALC@com{##1}\begin{ALC@if}} + \newcommand{\FOR}[2][default]{\ALC@it\algorithmicfor\ ##2\ \algorithmicdo% +\ALC@com{##1}\begin{ALC@for}} + \newcommand{\FORALL}[2][default]{\ALC@it\algorithmicforall\ ##2\ % +\algorithmicdo% +\ALC@com{##1}\begin{ALC@for}} + \newcommand{\SHORTFORALL}[2]{\ALC@it\algorithmicforall\ ##1\ % + \algorithmicdo\ {##2}} + \newcommand{\WHILE}[2][default]{\ALC@it\algorithmicwhile\ ##2\ % +\algorithmicdo% +\ALC@com{##1}\begin{ALC@whl}} + \newcommand{\LOOP}[1][default]{\ALC@it\algorithmicloop% +\ALC@com{##1}\begin{ALC@loop}} +%changed by alex smola + \newcommand{\FUNCTION}[2][default]{\ALC@it\algorithmicfunction\ ##2\ % + \ALC@com{##1}\begin{ALC@func}} + \newcommand{\MAIN}[2][default]{\ALC@it\algorithmicmain\ ##2\ % + \ALC@com{##1}\begin{ALC@main}} +%end changed by alex smola + \newcommand{\REPEAT}[1][default]{\ALC@it\algorithmicrepeat% + \ALC@com{##1}\begin{ALC@rpt}} + \newcommand{\UNTIL}[1]{\end{ALC@rpt}\ALC@it\algorithmicuntil\ ##1} + \ifthenelse{\boolean{ALC@noend}}{ + \newcommand{\ENDIF}{\end{ALC@if}} + \newcommand{\ENDFOR}{\end{ALC@for}} + 
\newcommand{\ENDWHILE}{\end{ALC@whl}} + \newcommand{\ENDLOOP}{\end{ALC@loop}} + \newcommand{\ENDFUNCTION}{\end{ALC@func}} + \newcommand{\ENDMAIN}{\end{ALC@main}} + }{ + \newcommand{\ENDIF}{\end{ALC@if}\ALC@it\algorithmicendif} + \newcommand{\ENDFOR}{\end{ALC@for}\ALC@it\algorithmicendfor} + \newcommand{\ENDWHILE}{\end{ALC@whl}\ALC@it\algorithmicendwhile} + \newcommand{\ENDLOOP}{\end{ALC@loop}\ALC@it\algorithmicendloop} + \newcommand{\ENDFUNCTION}{\end{ALC@func}\ALC@it\algorithmicendfunction} + \newcommand{\ENDMAIN}{\end{ALC@main}\ALC@it\algorithmicendmain} + } + \renewcommand{\@toodeep}{} + \begin{list}{\ALC@lno}{\setcounter{ALC@line}{0}\setcounter{ALC@rem}{0}% + \itemsep\z@ \itemindent\z@ \listparindent\z@% + \partopsep\z@ \parskip\z@ \parsep\z@% + \labelsep 0.5em \topsep 0.2em% + \ifthenelse{\equal{#1}{0}} + {\labelwidth 0.5em } + {\labelwidth 1.2em } + \leftmargin\labelwidth \addtolength{\leftmargin}{\labelsep} + \ALC@tlm\labelsep + } + } + {\end{list}} + + + + + + + + + + + + + + diff --git a/report/additional-latex-files/fancyhdr.sty b/report/additional-latex-files/fancyhdr.sty new file mode 100644 index 00000000..77ed4e30 --- /dev/null +++ b/report/additional-latex-files/fancyhdr.sty @@ -0,0 +1,485 @@ +% fancyhdr.sty version 3.2 +% Fancy headers and footers for LaTeX. +% Piet van Oostrum, +% Dept of Computer and Information Sciences, University of Utrecht, +% Padualaan 14, P.O. Box 80.089, 3508 TB Utrecht, The Netherlands +% Telephone: +31 30 2532180. Email: piet@cs.uu.nl +% ======================================================================== +% LICENCE: +% This file may be distributed under the terms of the LaTeX Project Public +% License, as described in lppl.txt in the base LaTeX distribution. +% Either version 1 or, at your option, any later version. 
+% ======================================================================== +% MODIFICATION HISTORY: +% Sep 16, 1994 +% version 1.4: Correction for use with \reversemargin +% Sep 29, 1994: +% version 1.5: Added the \iftopfloat, \ifbotfloat and \iffloatpage commands +% Oct 4, 1994: +% version 1.6: Reset single spacing in headers/footers for use with +% setspace.sty or doublespace.sty +% Oct 4, 1994: +% version 1.7: changed \let\@mkboth\markboth to +% \def\@mkboth{\protect\markboth} to make it more robust +% Dec 5, 1994: +% version 1.8: corrections for amsbook/amsart: define \@chapapp and (more +% importantly) use the \chapter/sectionmark definitions from ps@headings if +% they exist (which should be true for all standard classes). +% May 31, 1995: +% version 1.9: The proposed \renewcommand{\headrulewidth}{\iffloatpage... +% construction in the doc did not work properly with the fancyplain style. +% June 1, 1995: +% version 1.91: The definition of \@mkboth wasn't restored on subsequent +% \pagestyle{fancy}'s. +% June 1, 1995: +% version 1.92: The sequence \pagestyle{fancyplain} \pagestyle{plain} +% \pagestyle{fancy} would erroneously select the plain version. +% June 1, 1995: +% version 1.93: \fancypagestyle command added. +% Dec 11, 1995: +% version 1.94: suggested by Conrad Hughes +% CJCH, Dec 11, 1995: added \footruleskip to allow control over footrule +% position (old hardcoded value of .3\normalbaselineskip is far too high +% when used with very small footer fonts). +% Jan 31, 1996: +% version 1.95: call \@normalsize in the reset code if that is defined, +% otherwise \normalsize. +% this is to solve a problem with ucthesis.cls, as this doesn't +% define \@currsize. Unfortunately for latex209 calling \normalsize doesn't +% work as this is optimized to do very little, so there \@normalsize should +% be called. Hopefully this code works for all versions of LaTeX known to +% mankind. 
+% April 25, 1996: +% version 1.96: initialize \headwidth to a magic (negative) value to catch +% most common cases that people change it before calling \pagestyle{fancy}. +% Note it can't be initialized when reading in this file, because +% \textwidth could be changed afterwards. This is quite probable. +% We also switch to \MakeUppercase rather than \uppercase and introduce a +% \nouppercase command for use in headers. and footers. +% May 3, 1996: +% version 1.97: Two changes: +% 1. Undo the change in version 1.8 (using the pagestyle{headings} defaults +% for the chapter and section marks. The current version of amsbook and +% amsart classes don't seem to need them anymore. Moreover the standard +% latex classes don't use \markboth if twoside isn't selected, and this is +% confusing as \leftmark doesn't work as expected. +% 2. include a call to \ps@empty in ps@@fancy. This is to solve a problem +% in the amsbook and amsart classes, that make global changes to \topskip, +% which are reset in \ps@empty. Hopefully this doesn't break other things. +% May 7, 1996: +% version 1.98: +% Added % after the line \def\nouppercase +% May 7, 1996: +% version 1.99: This is the alpha version of fancyhdr 2.0 +% Introduced the new commands \fancyhead, \fancyfoot, and \fancyhf. +% Changed \headrulewidth, \footrulewidth, \footruleskip to +% macros rather than length parameters, In this way they can be +% conditionalized and they don't consume length registers. There is no need +% to have them as length registers unless you want to do calculations with +% them, which is unlikely. Note that this may make some uses of them +% incompatible (i.e. if you have a file that uses \setlength or \xxxx=) +% May 10, 1996: +% version 1.99a: +% Added a few more % signs +% May 10, 1996: +% version 1.99b: +% Changed the syntax of \f@nfor to be resistent to catcode changes of := +% Removed the [1] from the defs of \lhead etc. because the parameter is +% consumed by the \@[xy]lhead etc. macros. 
+% June 24, 1997: +% version 1.99c: +% corrected \nouppercase to also include the protected form of \MakeUppercase +% \global added to manipulation of \headwidth. +% \iffootnote command added. +% Some comments added about \@fancyhead and \@fancyfoot. +% Aug 24, 1998 +% version 1.99d +% Changed the default \ps@empty to \ps@@empty in order to allow +% \fancypagestyle{empty} redefinition. +% Oct 11, 2000 +% version 2.0 +% Added LPPL license clause. +% +% A check for \headheight is added. An errormessage is given (once) if the +% header is too large. Empty headers don't generate the error even if +% \headheight is very small or even 0pt. +% Warning added for the use of 'E' option when twoside option is not used. +% In this case the 'E' fields will never be used. +% +% Mar 10, 2002 +% version 2.1beta +% New command: \fancyhfoffset[place]{length} +% defines offsets to be applied to the header/footer to let it stick into +% the margins (if length > 0). +% place is like in fancyhead, except that only E,O,L,R can be used. +% This replaces the old calculation based on \headwidth and the marginpar +% area. +% \headwidth will be dynamically calculated in the headers/footers when +% this is used. +% +% Mar 26, 2002 +% version 2.1beta2 +% \fancyhfoffset now also takes h,f as possible letters in the argument to +% allow the header and footer widths to be different. +% New commands \fancyheadoffset and \fancyfootoffset added comparable to +% \fancyhead and \fancyfoot. +% Errormessages and warnings have been made more informative. +% +% Dec 9, 2002 +% version 2.1 +% The defaults for \footrulewidth, \plainheadrulewidth and +% \plainfootrulewidth are changed from \z@skip to 0pt. In this way when +% someone inadvertantly uses \setlength to change any of these, the value +% of \z@skip will not be changed, rather an errormessage will be given. 
+ +% March 3, 2004 +% Release of version 3.0 + +% Oct 7, 2004 +% version 3.1 +% Added '\endlinechar=13' to \fancy@reset to prevent problems with +% includegraphics in header when verbatiminput is active. + +% March 22, 2005 +% version 3.2 +% reset \everypar (the real one) in \fancy@reset because spanish.ldf does +% strange things with \everypar between << and >>. + +\def\ifancy@mpty#1{\def\temp@a{#1}\ifx\temp@a\@empty} + +\def\fancy@def#1#2{\ifancy@mpty{#2}\fancy@gbl\def#1{\leavevmode}\else + \fancy@gbl\def#1{#2\strut}\fi} + +\let\fancy@gbl\global + +\def\@fancyerrmsg#1{% + \ifx\PackageError\undefined + \errmessage{#1}\else + \PackageError{Fancyhdr}{#1}{}\fi} +\def\@fancywarning#1{% + \ifx\PackageWarning\undefined + \errmessage{#1}\else + \PackageWarning{Fancyhdr}{#1}{}\fi} + +% Usage: \@forc \var{charstring}{command to be executed for each char} +% This is similar to LaTeX's \@tfor, but expands the charstring. + +\def\@forc#1#2#3{\expandafter\f@rc\expandafter#1\expandafter{#2}{#3}} +\def\f@rc#1#2#3{\def\temp@ty{#2}\ifx\@empty\temp@ty\else + \f@@rc#1#2\f@@rc{#3}\fi} +\def\f@@rc#1#2#3\f@@rc#4{\def#1{#2}#4\f@rc#1{#3}{#4}} + +% Usage: \f@nfor\name:=list\do{body} +% Like LaTeX's \@for but an empty list is treated as a list with an empty +% element + +\newcommand{\f@nfor}[3]{\edef\@fortmp{#2}% + \expandafter\@forloop#2,\@nil,\@nil\@@#1{#3}} + +% Usage: \def@ult \cs{defaults}{argument} +% sets \cs to the characters from defaults appearing in argument +% or defaults if it would be empty. All characters are lowercased. 
+ +\newcommand\def@ult[3]{% + \edef\temp@a{\lowercase{\edef\noexpand\temp@a{#3}}}\temp@a + \def#1{}% + \@forc\tmpf@ra{#2}% + {\expandafter\if@in\tmpf@ra\temp@a{\edef#1{#1\tmpf@ra}}{}}% + \ifx\@empty#1\def#1{#2}\fi} +% +% \if@in +% +\newcommand{\if@in}[4]{% + \edef\temp@a{#2}\def\temp@b##1#1##2\temp@b{\def\temp@b{##1}}% + \expandafter\temp@b#2#1\temp@b\ifx\temp@a\temp@b #4\else #3\fi} + +\newcommand{\fancyhead}{\@ifnextchar[{\f@ncyhf\fancyhead h}% + {\f@ncyhf\fancyhead h[]}} +\newcommand{\fancyfoot}{\@ifnextchar[{\f@ncyhf\fancyfoot f}% + {\f@ncyhf\fancyfoot f[]}} +\newcommand{\fancyhf}{\@ifnextchar[{\f@ncyhf\fancyhf{}}% + {\f@ncyhf\fancyhf{}[]}} + +% New commands for offsets added + +\newcommand{\fancyheadoffset}{\@ifnextchar[{\f@ncyhfoffs\fancyheadoffset h}% + {\f@ncyhfoffs\fancyheadoffset h[]}} +\newcommand{\fancyfootoffset}{\@ifnextchar[{\f@ncyhfoffs\fancyfootoffset f}% + {\f@ncyhfoffs\fancyfootoffset f[]}} +\newcommand{\fancyhfoffset}{\@ifnextchar[{\f@ncyhfoffs\fancyhfoffset{}}% + {\f@ncyhfoffs\fancyhfoffset{}[]}} + +% The header and footer fields are stored in command sequences with +% names of the form: \f@ncy with for [eo], from [lcr] +% and from [hf]. 
+ +\def\f@ncyhf#1#2[#3]#4{% + \def\temp@c{}% + \@forc\tmpf@ra{#3}% + {\expandafter\if@in\tmpf@ra{eolcrhf,EOLCRHF}% + {}{\edef\temp@c{\temp@c\tmpf@ra}}}% + \ifx\@empty\temp@c\else + \@fancyerrmsg{Illegal char `\temp@c' in \string#1 argument: + [#3]}% + \fi + \f@nfor\temp@c{#3}% + {\def@ult\f@@@eo{eo}\temp@c + \if@twoside\else + \if\f@@@eo e\@fancywarning + {\string#1's `E' option without twoside option is useless}\fi\fi + \def@ult\f@@@lcr{lcr}\temp@c + \def@ult\f@@@hf{hf}{#2\temp@c}% + \@forc\f@@eo\f@@@eo + {\@forc\f@@lcr\f@@@lcr + {\@forc\f@@hf\f@@@hf + {\expandafter\fancy@def\csname + f@ncy\f@@eo\f@@lcr\f@@hf\endcsname + {#4}}}}}} + +\def\f@ncyhfoffs#1#2[#3]#4{% + \def\temp@c{}% + \@forc\tmpf@ra{#3}% + {\expandafter\if@in\tmpf@ra{eolrhf,EOLRHF}% + {}{\edef\temp@c{\temp@c\tmpf@ra}}}% + \ifx\@empty\temp@c\else + \@fancyerrmsg{Illegal char `\temp@c' in \string#1 argument: + [#3]}% + \fi + \f@nfor\temp@c{#3}% + {\def@ult\f@@@eo{eo}\temp@c + \if@twoside\else + \if\f@@@eo e\@fancywarning + {\string#1's `E' option without twoside option is useless}\fi\fi + \def@ult\f@@@lcr{lr}\temp@c + \def@ult\f@@@hf{hf}{#2\temp@c}% + \@forc\f@@eo\f@@@eo + {\@forc\f@@lcr\f@@@lcr + {\@forc\f@@hf\f@@@hf + {\expandafter\setlength\csname + f@ncyO@\f@@eo\f@@lcr\f@@hf\endcsname + {#4}}}}}% + \fancy@setoffs} + +% Fancyheadings version 1 commands. These are more or less deprecated, +% but they continue to work. 
+ +\newcommand{\lhead}{\@ifnextchar[{\@xlhead}{\@ylhead}} +\def\@xlhead[#1]#2{\fancy@def\f@ncyelh{#1}\fancy@def\f@ncyolh{#2}} +\def\@ylhead#1{\fancy@def\f@ncyelh{#1}\fancy@def\f@ncyolh{#1}} + +\newcommand{\chead}{\@ifnextchar[{\@xchead}{\@ychead}} +\def\@xchead[#1]#2{\fancy@def\f@ncyech{#1}\fancy@def\f@ncyoch{#2}} +\def\@ychead#1{\fancy@def\f@ncyech{#1}\fancy@def\f@ncyoch{#1}} + +\newcommand{\rhead}{\@ifnextchar[{\@xrhead}{\@yrhead}} +\def\@xrhead[#1]#2{\fancy@def\f@ncyerh{#1}\fancy@def\f@ncyorh{#2}} +\def\@yrhead#1{\fancy@def\f@ncyerh{#1}\fancy@def\f@ncyorh{#1}} + +\newcommand{\lfoot}{\@ifnextchar[{\@xlfoot}{\@ylfoot}} +\def\@xlfoot[#1]#2{\fancy@def\f@ncyelf{#1}\fancy@def\f@ncyolf{#2}} +\def\@ylfoot#1{\fancy@def\f@ncyelf{#1}\fancy@def\f@ncyolf{#1}} + +\newcommand{\cfoot}{\@ifnextchar[{\@xcfoot}{\@ycfoot}} +\def\@xcfoot[#1]#2{\fancy@def\f@ncyecf{#1}\fancy@def\f@ncyocf{#2}} +\def\@ycfoot#1{\fancy@def\f@ncyecf{#1}\fancy@def\f@ncyocf{#1}} + +\newcommand{\rfoot}{\@ifnextchar[{\@xrfoot}{\@yrfoot}} +\def\@xrfoot[#1]#2{\fancy@def\f@ncyerf{#1}\fancy@def\f@ncyorf{#2}} +\def\@yrfoot#1{\fancy@def\f@ncyerf{#1}\fancy@def\f@ncyorf{#1}} + +\newlength{\fancy@headwidth} +\let\headwidth\fancy@headwidth +\newlength{\f@ncyO@elh} +\newlength{\f@ncyO@erh} +\newlength{\f@ncyO@olh} +\newlength{\f@ncyO@orh} +\newlength{\f@ncyO@elf} +\newlength{\f@ncyO@erf} +\newlength{\f@ncyO@olf} +\newlength{\f@ncyO@orf} +\newcommand{\headrulewidth}{0.4pt} +\newcommand{\footrulewidth}{0pt} +\newcommand{\footruleskip}{.3\normalbaselineskip} + +% Fancyplain stuff shouldn't be used anymore (rather +% \fancypagestyle{plain} should be used), but it must be present for +% compatibility reasons. \fancyplain{A}{B} expands to A when \if@fancyplain is true and to B otherwise. + +\newcommand{\plainheadrulewidth}{0pt} +\newcommand{\plainfootrulewidth}{0pt} +\newif\if@fancyplain \@fancyplainfalse +\def\fancyplain#1#2{\if@fancyplain#1\else#2\fi} + +\headwidth=-123456789sp %magic constant: a negative sentinel; \ps@fancy tests for a negative \headwidth, then cancels this value and adds \textwidth + +% Command to reset various things in the headers: +% among other things:
single spacing (taken from setspace.sty) +% and the catcode of ^^M (so that epsf files in the header work if a +% verbatim crosses a page boundary) +% It also defines a \nouppercase command that disables \uppercase and +% \MakeUppercase. It can only be used in the headers and footers. +\let\fnch@everypar\everypar% save real \everypar because of spanish.ldf +\def\fancy@reset{\fnch@everypar{}\restorecr\endlinechar=13 + \def\baselinestretch{1}% single line spacing inside headers and footers + \def\nouppercase##1{{\let\uppercase\relax\let\MakeUppercase\relax + \expandafter\let\csname MakeUppercase \endcsname\relax##1}}% the argument is typeset inside a group, so the \relax assignments stay local to it + \ifx\undefined\@newbaseline% NFSS not present; 2.09 or 2e + \ifx\@normalsize\undefined \normalsize % for ucthesis.cls + \else \@normalsize \fi + \else% NFSS (2.09) present + \@newbaseline% + \fi} + +% Initialization of the head and foot text. + +% The default values still contain \fancyplain for compatibility. +\fancyhf{} % clear all +% lefthead empty on ``plain'' pages, \rightmark on even, \leftmark on odd pages +% righthead empty on ``plain'' pages, \leftmark on even, \rightmark on odd pages +\if@twoside + \fancyhead[el,or]{\fancyplain{}{\sl\rightmark}} + \fancyhead[er,ol]{\fancyplain{}{\sl\leftmark}} +\else + \fancyhead[l]{\fancyplain{}{\sl\rightmark}} + \fancyhead[r]{\fancyplain{}{\sl\leftmark}} +\fi +\fancyfoot[c]{\rm\thepage} % page number + +% Use box 0 as a temp box and dimen 0 as temp dimen. +% This can be done, because this code will always +% be used inside another box, and therefore the changes are local. + +\def\@fancyvbox#1#2{\setbox0\vbox{#2}\ifdim\ht0>#1\@fancywarning + {\string#1 is too small (\the#1): ^^J Make it at least \the\ht0.^^J + We now make it that large for the rest of the document.^^J + This may cause the page layout to be inconsistent, however\@gobble}% next: remember the old value of #1, enlarge #1 globally to the measured height, and give this box the old height + \dimen0=#1\global\setlength{#1}{\ht0}\ht0=\dimen0\fi + \box0} + +% Put together a header or footer given the left, center and +% right text, fillers at left and right and a rule.
+% The \rlap and \llap commands put the text into an hbox of zero size, +% so overlapping text does not generate an error message. +% These macros have 5 parameters: +% 1. LEFTSIDE BEARING. This determines at which side the header will stick +% out. When \fancyhfoffset is used this calculates \headwidth, otherwise +% it is \hss or \relax (after expansion). +% 2. \f@ncyolh, \f@ncyelh, \f@ncyolf or \f@ncyelf. This is the left component. +% 3. \f@ncyoch, \f@ncyech, \f@ncyocf or \f@ncyecf. This is the middle component. +% 4. \f@ncyorh, \f@ncyerh, \f@ncyorf or \f@ncyerf. This is the right component. +% 5. RIGHTSIDE BEARING. This is always \relax or \hss (after expansion). + +\def\@fancyhead#1#2#3#4#5{#1\hbox to\headwidth{\fancy@reset + \@fancyvbox\headheight{\hbox + {\rlap{\parbox[b]{\headwidth}{\raggedright#2}}\hfill + \parbox[b]{\headwidth}{\centering#3}\hfill + \llap{\parbox[b]{\headwidth}{\raggedleft#4}}}\headrule}}#5} + +\def\@fancyfoot#1#2#3#4#5{#1\hbox to\headwidth{\fancy@reset + \@fancyvbox\footskip{\footrule + \hbox{\rlap{\parbox[t]{\headwidth}{\raggedright#2}}\hfill + \parbox[t]{\headwidth}{\centering#3}\hfill + \llap{\parbox[t]{\headwidth}{\raggedleft#4}}}}}#5} + +\def\headrule{{\if@fancyplain\let\headrulewidth\plainheadrulewidth\fi + \hrule\@height\headrulewidth\@width\headwidth \vskip-\headrulewidth}} + +\def\footrule{{\if@fancyplain\let\footrulewidth\plainfootrulewidth\fi + \vskip-\footruleskip\vskip-\footrulewidth + \hrule\@width\headwidth\@height\footrulewidth\vskip\footruleskip}} + +\def\ps@fancy{% first use of the fancy page style: set up default marks, install the style and initialise \headwidth +\@ifundefined{@chapapp}{\let\@chapapp\chaptername}{}%for amsbook +% +% Define \MakeUppercase for old LaTeXen. +% Note: we used \def rather than \let, so that \let\uppercase\relax (from +% the version 1 documentation) will still work.
+% +\@ifundefined{MakeUppercase}{\def\MakeUppercase{\uppercase}}{}% +\@ifundefined{chapter}{\def\sectionmark##1{\markboth +{\MakeUppercase{\ifnum \c@secnumdepth>\z@ + \thesection\hskip 1em\relax \fi ##1}}{}}% class without \chapter: \sectionmark uses \markboth, \subsectionmark uses \markright +\def\subsectionmark##1{\markright {\ifnum \c@secnumdepth >\@ne + \thesubsection\hskip 1em\relax \fi ##1}}}% +{\def\chaptermark##1{\markboth {\MakeUppercase{\ifnum \c@secnumdepth>\m@ne + \@chapapp\ \thechapter. \ \fi ##1}}{}}% class with \chapter: \chaptermark uses \markboth, \sectionmark uses \markright +\def\sectionmark##1{\markright{\MakeUppercase{\ifnum \c@secnumdepth >\z@ + \thesection. \ \fi ##1}}}}% +%\csname ps@headings\endcsname % use \ps@headings defaults if they exist +\ps@@fancy +\gdef\ps@fancy{\@fancyplainfalse\ps@@fancy}% later uses of the style skip the set-up above +% Initialize \headwidth if the user didn't +% +\ifdim\headwidth<0sp +% +% This catches the case that \headwidth hasn't been initialized and the +% case that the user added something to \headwidth in the expectation that +% it was initialized to \textwidth. We compensate for this now. This loses if +% the user intended to multiply it by a factor. But that case is more +% likely done by saying something like \headwidth=1.2\textwidth. +% The doc says you have to change \headwidth after the first call to +% \pagestyle{fancy}. This code is just to catch the most common cases where +% that requirement is violated.
+% cancel the -123456789sp sentinel and add \textwidth; anything the user added to \headwidth meanwhile is kept + \global\advance\headwidth123456789sp\global\advance\headwidth\textwidth +\fi} +\def\ps@fancyplain{\ps@fancy \let\ps@plain\ps@plain@fancy} +\def\ps@plain@fancy{\@fancyplaintrue\ps@@fancy} +\let\ps@@empty\ps@empty +\def\ps@@fancy{% install the head/foot macros; each passes bearing, left, centre, right, bearing to \@fancyhead or \@fancyfoot +\ps@@empty % This is for amsbook/amsart, which do strange things with \topskip +\def\@mkboth{\protect\markboth}% +\def\@oddhead{\@fancyhead\fancy@Oolh\f@ncyolh\f@ncyoch\f@ncyorh\fancy@Oorh}% +\def\@oddfoot{\@fancyfoot\fancy@Oolf\f@ncyolf\f@ncyocf\f@ncyorf\fancy@Oorf}% +\def\@evenhead{\@fancyhead\fancy@Oelh\f@ncyelh\f@ncyech\f@ncyerh\fancy@Oerh}% +\def\@evenfoot{\@fancyfoot\fancy@Oelf\f@ncyelf\f@ncyecf\f@ncyerf\fancy@Oerf}% +} +% Default definitions for compatibility mode: +% These cause the header/footer to take the defined \headwidth as width +% And to shift in the direction of the marginpar area + +\def\fancy@Oolh{\if@reversemargin\hss\else\relax\fi} +\def\fancy@Oorh{\if@reversemargin\relax\else\hss\fi} +\let\fancy@Oelh\fancy@Oorh +\let\fancy@Oerh\fancy@Oolh + +\let\fancy@Oolf\fancy@Oolh +\let\fancy@Oorf\fancy@Oorh +\let\fancy@Oelf\fancy@Oelh +\let\fancy@Oerf\fancy@Oerh + +% New definitions for the use of \fancyhfoffset +% These calculate the \headwidth from \textwidth and the specified offsets.
+ +\def\fancy@offsolh{\headwidth=\textwidth\advance\headwidth\f@ncyO@olh% odd-page header: \headwidth is \textwidth plus the left and right offsets; then shift left by the left offset + \advance\headwidth\f@ncyO@orh\hskip-\f@ncyO@olh} +\def\fancy@offselh{\headwidth=\textwidth\advance\headwidth\f@ncyO@elh + \advance\headwidth\f@ncyO@erh\hskip-\f@ncyO@elh} + +\def\fancy@offsolf{\headwidth=\textwidth\advance\headwidth\f@ncyO@olf + \advance\headwidth\f@ncyO@orf\hskip-\f@ncyO@olf} +\def\fancy@offself{\headwidth=\textwidth\advance\headwidth\f@ncyO@elf + \advance\headwidth\f@ncyO@erf\hskip-\f@ncyO@elf} + +\def\fancy@setoffs{% switch all eight bearings to the offset-aware versions; \fancy@gbl is presumably \global by default (defined outside this chunk) and is \relax inside \fancypagestyle +% Just in case \let\headwidth\textwidth was used + \fancy@gbl\let\headwidth\fancy@headwidth + \fancy@gbl\let\fancy@Oolh\fancy@offsolh + \fancy@gbl\let\fancy@Oelh\fancy@offselh + \fancy@gbl\let\fancy@Oorh\hss + \fancy@gbl\let\fancy@Oerh\hss + \fancy@gbl\let\fancy@Oolf\fancy@offsolf + \fancy@gbl\let\fancy@Oelf\fancy@offself + \fancy@gbl\let\fancy@Oorf\hss + \fancy@gbl\let\fancy@Oerf\hss} + +\newif\iffootnote% true while the page being built carries footnotes (set in \@makecol) +\let\latex@makecol\@makecol% wrap the LaTeX \@makecol to record footnote and top/bottom float status before the column is assembled +\def\@makecol{\ifvoid\footins\footnotefalse\else\footnotetrue\fi% a void \footins means the page has no footnotes +\let\topfloat\@toplist\let\botfloat\@botlist\latex@makecol} +\def\iftopfloat#1#2{\ifx\topfloat\empty #2\else #1\fi} +\def\ifbotfloat#1#2{\ifx\botfloat\empty #2\else #1\fi} +\def\iffloatpage#1#2{\if@fcolmade #1\else #2\fi} + +\newcommand{\fancypagestyle}[2]{% #1: page style name, #2: set-up code run with \fancy@gbl neutralised, followed by \ps@fancy, whenever the style is selected + \@namedef{ps@#1}{\let\fancy@gbl\relax#2\relax\ps@fancy}} diff --git a/report/additional-latex-files/natbib.sty b/report/additional-latex-files/natbib.sty new file mode 100644 index 00000000..ff0d0b91 --- /dev/null +++ b/report/additional-latex-files/natbib.sty @@ -0,0 +1,1246 @@ +%% +%% This is file `natbib.sty', +%% generated with the docstrip utility.
+%% +%% The original source files were: +%% +%% natbib.dtx (with options: `package,all') +%% ============================================= +%% IMPORTANT NOTICE: +%% +%% This program can be redistributed and/or modified under the terms +%% of the LaTeX Project Public License Distributed from CTAN +%% archives in directory macros/latex/base/lppl.txt; either +%% version 1 of the License, or any later version. +%% +%% This is a generated file. +%% It may not be distributed without the original source file natbib.dtx. +%% +%% Full documentation can be obtained by LaTeXing that original file. +%% Only a few abbreviated comments remain here to describe the usage. +%% ============================================= +%% Copyright 1993-2009 Patrick W Daly +%% Max-Planck-Institut f\"ur Sonnensystemforschung +%% Max-Planck-Str. 2 +%% D-37191 Katlenburg-Lindau +%% Germany +%% E-mail: daly@mps.mpg.de +\NeedsTeXFormat{LaTeX2e}[1995/06/01] +\ProvidesPackage{natbib} + [2009/07/16 8.31 (PWD, AO)] + + % This package reimplements the LaTeX \cite command to be used for various + % citation styles, both author-year and numerical. It accepts BibTeX + % output intended for many other packages, and therefore acts as a + % general, all-purpose citation-style interface. + % + % With standard numerical .bst files, only numerical citations are + % possible. With an author-year .bst file, both numerical and + % author-year citations are possible. + % + % If author-year citations are selected, \bibitem must have one of the + % following forms: + % \bibitem[Jones et al.(1990)]{key}... + % \bibitem[Jones et al.(1990)Jones, Baker, and Williams]{key}... + % \bibitem[Jones et al., 1990]{key}... + % \bibitem[\protect\citeauthoryear{Jones, Baker, and Williams}{Jones + % et al.}{1990}]{key}... + % \bibitem[\protect\citeauthoryear{Jones et al.}{1990}]{key}... + % \bibitem[\protect\astroncite{Jones et al.}{1990}]{key}... + % \bibitem[\protect\citename{Jones et al., }1990]{key}... 
+ % \harvarditem[Jones et al.]{Jones, Baker, and Williams}{1990}{key}... + % + % This is either to be made up manually, or to be generated by an + % appropriate .bst file with BibTeX. + % Author-year mode || Numerical mode + % Then, \citet{key} ==>> Jones et al. (1990) || Jones et al. [21] + % \citep{key} ==>> (Jones et al., 1990) || [21] + % Multiple citations as normal: + % \citep{key1,key2} ==>> (Jones et al., 1990; Smith, 1989) || [21,24] + % or (Jones et al., 1990, 1991) || [21,24] + % or (Jones et al., 1990a,b) || [21,24] + % \cite{key} is the equivalent of \citet{key} in author-year mode + % and of \citep{key} in numerical mode + % Full author lists may be forced with \citet* or \citep*, e.g. + % \citep*{key} ==>> (Jones, Baker, and Williams, 1990) + % Optional notes as: + % \citep[chap. 2]{key} ==>> (Jones et al., 1990, chap. 2) + % \citep[e.g.,][]{key} ==>> (e.g., Jones et al., 1990) + % \citep[see][pg. 34]{key}==>> (see Jones et al., 1990, pg. 34) + % (Note: in standard LaTeX, only one note is allowed, after the ref. + % Here, one note is like the standard, two make pre- and post-notes.) + % \citealt{key} ==>> Jones et al. 1990 + % \citealt*{key} ==>> Jones, Baker, and Williams 1990 + % \citealp{key} ==>> Jones et al., 1990 + % \citealp*{key} ==>> Jones, Baker, and Williams, 1990 + % Additional citation possibilities (both author-year and numerical modes) + % \citeauthor{key} ==>> Jones et al. + % \citeauthor*{key} ==>> Jones, Baker, and Williams + % \citeyear{key} ==>> 1990 + % \citeyearpar{key} ==>> (1990) + % \citetext{priv. comm.} ==>> (priv. comm.) + % \citenum{key} ==>> 11 [non-superscripted] + % Note: full author lists depends on whether the bib style supports them; + % if not, the abbreviated list is printed even when full requested. 
+ % + % For names like della Robbia at the start of a sentence, use + % \Citet{dRob98} ==>> Della Robbia (1998) + % \Citep{dRob98} ==>> (Della Robbia, 1998) + % \Citeauthor{dRob98} ==>> Della Robbia + % + % + % Citation aliasing is achieved with + % \defcitealias{key}{text} + % \citetalias{key} ==>> text + % \citepalias{key} ==>> (text) + % + % Defining the citation mode and punctuation (citation style) + % \setcitestyle{<comma-separated list of keywords, same as the package options>} + % Example: \setcitestyle{square,semicolon} + % Alternatively: + % Use \bibpunct with 6 mandatory arguments: + % 1. opening bracket for citation + % 2. closing bracket + % 3. citation separator (for multiple citations in one \cite) + % 4. the letter n for numerical styles, s for superscripts + % else anything for author-year + % 5. punctuation between authors and date + % 6. punctuation between years (or numbers) when common authors missing + % One optional argument is the character coming before post-notes. It + % appears in square brackets before all other arguments. May be left off. + % Example (and default) \bibpunct[, ]{(}{)}{;}{a}{,}{,} + % + % To make this automatic for a given bib style, named newbib, say, make + % a local configuration file, natbib.cfg, with the definition + % \newcommand{\bibstyle@newbib}{\bibpunct...} + % Then the \bibliographystyle{newbib} will cause \bibstyle@newbib to + % be called on THE NEXT LATEX RUN (via the aux file). + % + % Such preprogrammed definitions may be invoked anywhere in the text + % by calling \citestyle{newbib}. This is only useful if the style specified + % differs from that in \bibliographystyle. + % + % With \citeindextrue and \citeindexfalse, one can control whether the + % \cite commands make an automatic entry of the citation in the .idx + % indexing file. For this, \makeindex must also be given in the preamble.
+ % + % Package Options: (for selecting punctuation) + % round - round parentheses are used (default) + % square - square brackets are used [option] + % curly - curly braces are used {option} + % angle - angle brackets are used