From 33e744f33064c180102f614f65ca61cf926a731f Mon Sep 17 00:00:00 2001 From: Christian Versloot Date: Tue, 1 Dec 2020 19:37:10 +0100 Subject: [PATCH 1/3] Add USPS dataset, #19 --- README.md | 16 +++++ assets/usps.png | Bin 0 -> 6137 bytes extra_keras_datasets/__init__.py | 3 +- extra_keras_datasets/usps.py | 103 +++++++++++++++++++++++++++++++ 4 files changed, 121 insertions(+), 1 deletion(-) create mode 100644 assets/usps.png create mode 100644 extra_keras_datasets/usps.py diff --git a/README.md b/README.md index 2c207f9..2f7b597 100644 --- a/README.md +++ b/README.md @@ -30,6 +30,7 @@ _The names TensorFlow, Keras, as well as related names, marks, emblems and image * [STL-10](#stl-10) * [Iris](#iris) * [Wine Quality dataset](#wine-quality-dataset) + * [USPS Handwritten Digits Dataset](#usps-handwritten-digits-dataset) - [Contributors and other references](#contributors-and-other-references) - [License](#license) @@ -206,6 +207,21 @@ from extra_keras_datasets import wine_quality --- +### USPS Handwritten Digits Dataset +This dataset presents thousands of 16x16 grayscale images of handwritten digits, generated from real USPS based mail. + +* Input structure: 16x16 image +* Target structure: digit ranging from 0.0 - 9.0 describing the input + +``` +from extra_keras_datasets import usps +(input_train, target_train), (input_test, target_test) = usps.load_data() +``` + + + +=== + ## Contributors and other references * **EMNIST dataset:** * Cohen, G., Afshar, S., Tapson, J., & van Schaik, A. (2017). EMNIST: an extension of MNIST to handwritten letters. Retrieved from http://arxiv.org/abs/1702.05373 diff --git a/assets/usps.png b/assets/usps.png new file mode 100644 index 0000000000000000000000000000000000000000..16a2ebebc065e7095100413ae69a7127852b2a5e GIT binary patch literal 6137 zcmd^@eO!`vzsKF_%y!qRv%Ax5wQ{SwT!y-6YH4V-9we3)f+nWc>bgupDFrPEX|-y# znUw+|sclKd4xlkVm=Yq#_D-qp~OByeyQZQ~AFBo(G*?oNZ;Qg+k!Q>Zjb(zl;wKxyeo z)YO%}D|}a?KHEp5?GN(v`=9f^)U-suBW^1;xVd>mV>hfL9K1N7!-|O}sr$3?u&%|~ zmp+q@x+_2XVNw2n)a_I3-L*tSa36OYU-e1iS;aqQmwuS%zWrSI!}X^=TZCzy_QBs7 zkAv&zk@PHTg-wBBHyH(N)uS;DXBNc55o?Aw zqq@#bx3lnXN*Vda}bbL}!NoGD!3`124= zM^!21+3oKzsq%~gQf*6*lBYzmyk>iaqr`@`5}x+q&tEJ1M_NMt#6GcAd0J6qiK2h< z7a{${$$(wAkDvuB_H5!yt;SL)11u*;8#*zZgJup%EsFlT$EQPF%~6f!)ZQo8cS8YG zeLb2v{HuqOVlt|sfYi>Kh1DhWtRcM=GC$hv$AlSyt*NhFiK<=D zdw*b+r~8a~1UN|-cYW77K4xO98>-s-;QXX+s?BqzXUI-?egNeQaY{!OM>{^!t3>H6 z61rFUM+W(!Nr#1}U4E?|4^Q{-MTy(vndOZGf`AlnOD)ZS=j)bD$cqvm*pL1zg}web zlR;$cAv5Ni^Zm(;&WYa^=j7S!eF>A^;?KW6kr%_}=`RJrstf4Y%48xRW9dwXxcU!^ z^l)qeF8W%$j|EGO-^$K3E&V;bq^;JXXDa(&H3=YdbMR1J3}gORU0oQ<<7ycwFIT%K z2NMO0KKcv_Ozr$aEPE5ByCrKb2(DITjR?B(wA20Sx50*{*Xr1ZZM{3{4<6u4?QfJ& zRbopaI#!;=;+Mx(SEyz#T(bi<>`ca1lgSdQ{S8l!64NL{vbx?bk!<{xAgxG<<8fmc zr3V@AS{aMUkE=}+s;tKzONMVxnU$w)iwKQ_iG8?|$ZK`g-6_^>!_(if8@GuYQh46k zgLOkPH1owNFGU#=ETQ*B)wVppcT$N`r))3v{xPT^y!UA(845^wdJB!?tP&NLAe#(T zu#dXGSi&I)xyH0swix+0Nn3F@522l>T}09s5&MhtOU^Cc%+-1%9NrF>L=^doX)|=# z)*D;Z$67*c6u#qyhx`jMA;#1jI$5hS4|KG5n`wTeDoPAr81#Vfe&C*lpU}aXgFE@c z^kwB3A8fDU_eYnN6n*_6DB!@0`)Heeq(w>T)09I|hg@2H^vgylfIVRc7kSvQP!o;J z&AoGMDo^|JL9UWw9~*|u&rf=CV@zdF21o_Xmej(eiO5Y|4+9IWz_gytOk6wF&FzT$ z+=uT0n|sfl1`h5!@}AT5=IwfW>S|2))N{z9hMzu(go$yalH9&bWbzd#UhJTlX&D5+ zi*ZPs7dad*HKZJ6M-)@ ziscigYfx3HPJ#~J|7>keN81e+XeF?CnV`SfC@}%UOBS9F?uqAw^d6t?p`eVsYJ#E` zv;%RkKKS~XMNe%l#`c8F!i;X?`^4NVR-KqJKMH0Ew}@j6vU5fspw&27^r9Mx?WY-; zY(^?}GK%3IURh|U{R&f5+>0Lk)WO^cs&r)2KPlZGr;;^e>pt`hsaJjMLn6R_^{?0S zoAUijRmc`3P=D}pr>o{o-n={)&adjeak!khN7SMjXVP2i7F(2UVvk5Bz@i1#WweALFL0lS5U-{qsU=FStGqI ze&Uis2BK=M9o58AIl#kgivDn9Xa6QdgY)Nz6jU`rgv_t+&9c;BFm1(5MInZ%=pQw- zL7MiQKKA*+WrmP~!j*{RmeZ2=KVQ!$T3(EzE*KZF9eoKr?fE4*1x1m!3^L!D>IqCb zLzY{Ju?4?tJ&(w`9Ct0}k+h`|g*)MvH6!2f%j2aI6@Et3d}{r67{qdFPDe{oXu@Em zdOx`J=5JrzKLv4-?Pokg<6+V1<-`Z0CkQFr64a zBv=HaSlnQ&Bp`JBWiHy*Ry09?Me%;>7I8Xp!AMlB_07c^YjsYrrx%lgoGdVG2-#7^Y1`0zt`<@dM zl+Y@VR+kCWzy0;}9LVgi}~_PsS(kl6SSy1<2ge?z4m2MR1Dou?RAc6iwEvWE0lDx2dkf znl55Vv61vdQfB6mJ`UEc!F@PCpsdqNXKBS4K^{Te=2IFCJL_Q9x_=cq2Y+OS=IPf4 zfn`QB-#qYAqNMLxoO!|3r!j#H(SqPipw70Ampq@?DA+LLqM`hSVa6|p@|(dK{a=Vj zSvuci?lZi_qk&hd^Ys#Xsj9Vu*w%r)mLQLZ)q?2B@fJ4McGy1L2(HQ!9e+ru!pE+) zhT|9C2C1wp3oPyqffRY@J~L@cpA!7CvW&q@X9%(asmF(CK?N)JG~{WIzCT@olK#3z zY6zAdukklXwo=&{7qVs3saqeN9m|a0XHDi4Rq-|ishKN%ueFHVD9kPZ4#l*k{Z4V#<@HJ=Sv0E1> zIU+2MTN6U_ccWd!)YtEhK$=;)^a$3INZk{s|Dn_opN1T+QH2)-5n<8OS3+LA@qq2# ztk9i<>Cf*T2acH_3P^U!mMhC8+|kT`)l!rI|LEc8D<0Jy9`E5nRp3_8Hek$(GQpNB z!2Ni(Vgw=FJC96b+d8^HgIjU!cU@RijYIBZ$zj%1eB8sU+b=69P9NkVGyve8-DZN# zA8>oc%?G(&ytj~NsPPQWC{vM2w;449SN{{&*LQ4I$dhjKoaIq@wqm2STF)p|;kU*I zO|?mGRTcL;v5kv7b^)duvqa4Tdo?P&e-v@J-S85`p^mHh!C|xTVyIz_8Kb{Hg1Ck6 zx2;?UE~q?d2~i@UsXnui`p^=JJF@a$M}uW z$UQ7)qbvS}`4u3aK~V%w@Z{@!{p9vEzN0@ebh+56tQchOmhg+Kw!@-%LWI1o zbgI89sqG3H*E#)MlgI?*ad5{}nJ1y&w)Y(ywq8j}D+Vp5BMdDKQA({xoSv3wz2cFj{h z`psGIO4s;i&|gT#v^k`L?z7%cT37xCa7odBZu7!r;zmz)y{c7uHPE;^m(~QP9q?}T zxidDPdS#g)|0XTG#J^W&8(c0X==f(+knaZi$T;?Mk3S*JKl9d0a5g*F_z6CM^5i-i zXZqYV6=HLjaMXAI*_~rb6wx6}e+^UuW=e)Bxi-RD5sBG_d=i}YGK{Ztg4mUaqIlRjDLQ>% z=qvSy1*_Aqs4|?UCNl1K!hpFmy~R}H=%C7dEQYFNv?E2d)USMYQj>8A54C;&x7ms^ z2^Zhgw%L;fLhER`z`G2F>>K;!06z^K}96nC;G9Z&|yQgo@T?N$Ub9jEq9f2CR`UV`K1X@5Jn7`V!< z;-I_wv&kn~GZ_x632n3XT?6Ge6f>&8>X~a;fL?k+0Gb1FG+yvNpa+9F8u<4PzOYgv zpTBDVF0gRdkcQ<2)h=z*Qq3mGWta7?Y*`Ztv&ZgvDo~i&6sJwLnxcs!dJuh1G_gZ# zc-ft=8BQ$?p6VRT4Ldzoe7k{fIzh<3AQR*X{m~0b2p`1~Wy0S3k1lWK#UQcYkd;pS zLJ@y^^j8wU?&K<9TeW zv%M6Q$q5HYo+X`u){wu@yLN#fkjm8#cQ-0snyaFA$i&fqhMQSMR&#$o}@s upm;E^UuqwJSDyc~IQrw6$p6(-c8`#b><+)Y%XUyIH|)mn4HfHm9sMt$9ea5I literal 0 HcmV?d00001 diff --git a/extra_keras_datasets/__init__.py b/extra_keras_datasets/__init__.py index 0918210..8bc2294 100644 --- a/extra_keras_datasets/__init__.py +++ b/extra_keras_datasets/__init__.py @@ -8,5 +8,6 @@ from . import stl10 from . import iris from . import wine_quality +from . import usps -__all__ = ['emnist', 'kmnist', 'svhn', 'stl10', 'iris', 'wine_quality'] +__all__ = ['emnist', 'kmnist', 'svhn', 'stl10', 'iris', 'wine_quality', 'usps'] diff --git a/extra_keras_datasets/usps.py b/extra_keras_datasets/usps.py new file mode 100644 index 0000000..9435e79 --- /dev/null +++ b/extra_keras_datasets/usps.py @@ -0,0 +1,103 @@ +""" + Import the USPS Handwritten Digits Dataset + Source: https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/ + multiclass.html#usps + (and: https://ieeexplore.ieee.org/document/291440) + Description: Handwritten text recognition image database. + + ~~~ Important note ~~~ + Please cite the following paper when using or referencing the dataset: + Hull, J. J. (1994). A database for handwritten text recognition + research. IEEE Transactions on pattern analysis and machine + intelligence, 16(5), 550-554. +""" + +from tensorflow.keras.utils import get_file +import logging +from sklearn.datasets import load_svmlight_file +import bz2 + + +def warn_citation(): + """Warns about citation requirements + # Returns + Void + """ + logging.warning(("Please cite the following paper when using or" + " referencing this Extra Keras Dataset:")) + logging.warning( + ("Hull, J. J. (1994). A database for handwritten text " + "recognition research. IEEE Transactions on pattern analysis and " + "machine intelligence, 16(5), 550-554.") + ) + + +def decompress(path): + """Decompresses BZ2 data into another file""" + bz_zip = bz2.BZ2File(path) + decompressed_data = bz_zip.read() + new_path = path[:-4] + open(new_path, 'wb').write(decompressed_data) + return new_path + + +def load_to_numpy(path): + """Loads LIBSVM data into NumPY format""" + data = load_svmlight_file(path) + return (data[0].toarray(), data[1]) + + +def load_data( + path="usps.bz2", + path_testing="usps-testing.bz2" +): + """Loads the USPS Handwritten Digits Dataset. + # Arguments + path: path where to cache the USPS data locally + (relative to ~/.keras/datasets). + path_testing: path where to cache the USPS testing data locally + (relative to ~/.keras/datasets). + # Returns + Tuple of Numpy arrays: `(input_train, target_train), + (input_test, target_test)`. + Input structure: 16x16 image with a digit + Target structure: number in the 0.0 - 9.0 range + + """ + # Log about loading + logging.basicConfig(level=logging.INFO) + logging.info('Loading dataset = usps') + + # Download data + path = get_file( + path, + origin=("https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/" + "datasets/multiclass/usps.bz2") + ) + path_testing = get_file( + path_testing, + origin=("https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/" + "datasets/multiclass/usps.t.bz2") + ) + + # Decompress data + decompress_train = decompress(path) + decompress_test = decompress(path_testing) + + # Load LIBSVM data into NumPy array + (input_train, target_train) = load_to_numpy(decompress_train) + (input_test, target_test) = load_to_numpy(decompress_test) + + # Reshape data + input_train = input_train.reshape(input_train.shape[0], 16, 16) + input_test = input_test.reshape(input_test.shape[0], 16, 16) + + # Correct targets (e.g. number 3 is now returned as 4.0) + target_train = target_train - 1 + target_test = target_test - 1 + + # Warn about citation + warn_citation() + + # Return data + return (input_train, target_train), (input_test, target_test) From acd30f533c5d5e37c137c172f3479a50d70fadbc Mon Sep 17 00:00:00 2001 From: Christian Versloot Date: Tue, 1 Dec 2020 19:42:39 +0100 Subject: [PATCH 2/3] Fix small README issue --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 2f7b597..0e77b7e 100644 --- a/README.md +++ b/README.md @@ -220,7 +220,7 @@ from extra_keras_datasets import usps -=== +--- ## Contributors and other references * **EMNIST dataset:** From d0ba71a0892ef343bac6dee872f94916dac2e9b8 Mon Sep 17 00:00:00 2001 From: Christian Versloot Date: Tue, 1 Dec 2020 19:51:32 +0100 Subject: [PATCH 3/3] Add citation for USPS dataset --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 0e77b7e..3eaa824 100644 --- a/README.md +++ b/README.md @@ -236,6 +236,8 @@ from extra_keras_datasets import usps * Fisher,R.A. "The use of multiple measurements in taxonomic problems" Annual Eugenics, 7, Part II, 179-188 (1936); also in "Contributions to Mathematical Statistics" (John Wiley, NY, 1950). * **Wine Quality dataset:** * P. Cortez, A. Cerdeira, F. Almeida, T. Matos and J. Reis. Modeling wine preferences by data mining from physicochemical properties. In Decision Support Systems, Elsevier, 47(4):547-553, 2009. +* **USPS Handwritten Digits Dataset** + * Hull, J. J. (1994). A database for handwritten text recognition research. IEEE Transactions on pattern analysis and machine intelligence, 16(5), 550-554. ## License The licenseable parts of this repository are licensed under a [MIT License](./LICENSE), so you're free to use this repo in your machine learning projects / blogs / exercises, and so on. Happy engineering! 🚀