diff --git a/Pipfile b/Pipfile new file mode 100644 index 0000000..66e4657 --- /dev/null +++ b/Pipfile @@ -0,0 +1,11 @@ +[[source]] +url = "https://pypi.python.org/simple" +verify_ssl = true +name = "pypi" + +[packages] +"e1839a8" = {path = ".", extras = ["nlp", "s3driver"], editable = true} + +[dev-packages] +"pytest-flake8" = "*" +ipython = "*" diff --git a/Pipfile.lock b/Pipfile.lock new file mode 100644 index 0000000..75fd26b --- /dev/null +++ b/Pipfile.lock @@ -0,0 +1,455 @@ +{ + "_meta": { + "hash": { + "sha256": "02bcc79cf52a20e5172c477a0efb44d8d0d455235abefd4bfbc641e8c2453af2" + }, + "pipfile-spec": 6, + "requires": {}, + "sources": [ + { + "name": "pypi", + "url": "https://pypi.python.org/simple", + "verify_ssl": true + } + ] + }, + "default": { + "boto3": { + "hashes": [ + "sha256:95ac50b1905e0aa0344a2a733d76c44af81b2cc51304386b94b0ef669d8d19bc", + "sha256:b227764ab3dcb4b55d54dd90c7676846f153b1e29ed259081ffc34b064a6ff21" + ], + "version": "==1.8.5" + }, + "botocore": { + "hashes": [ + "sha256:4a2d4fc68fdc7113957cfc51b733a9900a9ba35e19e6d841a8b11fd6c20732f9", + "sha256:dcad4db0349dd11278d094a91434faf11500aae1991890a62d47a79923ca7ba3" + ], + "version": "==1.11.5" + }, + "certifi": { + "hashes": [ + "sha256:376690d6f16d32f9d1fe8932551d80b23e9d393a8578c5633a2ed39a64861638", + "sha256:456048c7e371c089d0a77a5212fb37a2c2dce1e24146e3b7e0261736aaeaa22a" + ], + "version": "==2018.8.24" + }, + "chardet": { + "hashes": [ + "sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae", + "sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691" + ], + "version": "==3.0.4" + }, + "decorator": { + "hashes": [ + "sha256:2c51dff8ef3c447388fe5e4453d24a2bf128d3a4c32af3fabef1f01c6851ab82", + "sha256:c39efa13fbdeb4506c476c9b3babf6a718da943dab7811c206005a4a956c080c" + ], + "version": "==4.3.0" + }, + "docutils": { + "hashes": [ + "sha256:02aec4bd92ab067f6ff27a38a38a41173bf01bed8f89157768c1573f53e474a6", + 
"sha256:51e64ef2ebfb29cae1faa133b3710143496eca21c530f3f71424d77687764274", + "sha256:7a4bd47eaf6596e1295ecb11361139febe29b084a87bf005bf899f9a42edc3c6" + ], + "version": "==0.14" + }, + "e1839a8": { + "editable": true, + "extras": [ + "nlp", + "s3driver" + ], + "path": "." + }, + "idna": { + "hashes": [ + "sha256:156a6814fb5ac1fc6850fb002e0852d56c0c8d2531923a51032d1b70760e186e", + "sha256:684a38a6f903c1d71d6d5fac066b58d7768af4de2b832e426ec79c30daa94a16" + ], + "version": "==2.7" + }, + "jmespath": { + "hashes": [ + "sha256:6a81d4c9aa62caf061cb517b4d9ad1dd300374cd4706997aff9cd6aedd61fc64", + "sha256:f11b4461f425740a1d908e9a3f7365c3d2e569f6ca68a2ff8bc5bcd9676edd63" + ], + "version": "==0.9.3" + }, + "joblib": { + "hashes": [ + "sha256:333b9bf16ff015d6b56bf80b9831afdd243443cb84c7ff7b6e342f117e354c42", + "sha256:3e650621a6ec2b9cdda72ec3e0b0f04101f605a56ae0d0e54e3d18b16fcf29f4" + ], + "version": "==0.12.3" + }, + "nltk": { + "hashes": [ + "sha256:fe0eda251be65843be86d7de9abfbf7161732256f742e623b21243ec47bdb718" + ], + "version": "==3.3.0" + }, + "numpy": { + "hashes": [ + "sha256:1c362ad12dd09a43b348bb28dd2295dd9cdf77f41f0f45965e04ba97f525b864", + "sha256:2156a06bd407918df4ac0122df6497a9c137432118f585e5b17d543e593d1587", + "sha256:24e4149c38489b51fc774b1e1faa9103e82f73344d7a00ba66f6845ab4769f3f", + "sha256:340ec1697d9bb3a9c464028af7a54245298502e91178bddb4c37626d36e197b7", + "sha256:35db8d419345caa4eeaa65cd63f34a15208acd87530a30f0bc25fc84f55c8c80", + "sha256:361370e9b7f5e44c41eee29f2bb5cb3b755abb4b038bce6d6cbe08db7ff9cb74", + "sha256:36e8dcd1813ca92ce7e4299120cee6c03adad33d89b54862c1b1a100443ac399", + "sha256:378378973546ecc1dfaf9e24c160d683dd04df871ecd2dcc86ce658ca20f92c0", + "sha256:419e6faee16097124ee627ed31572c7e80a1070efa25260b78097cca240e219a", + "sha256:4287104c24e6a09b9b418761a1e7b1bbde65105f110690ca46a23600a3c606b8", + "sha256:549f3e9778b148a47f4fb4682955ed88057eb627c9fe5467f33507c536deda9d", + 
"sha256:5e359e9c531075220785603e5966eef20ccae9b3b6b8a06fdfb66c084361ce92", + "sha256:5ee7f3dbbdba0da75dec7e94bd7a2b10fe57a83e1b38e678200a6ad8e7b14fdc", + "sha256:62d55e96ec7b117d3d5e618c15efcf769e70a6effaee5842857b64fb4883887a", + "sha256:719b6789acb2bc86ea9b33a701d7c43dc2fc56d95107fd3c5b0a8230164d4dfb", + "sha256:7a70f2b60d48828cba94a54a8776b61a9c2657a803d47f5785f8062e3a9c7c55", + "sha256:7b9e37f194f8bcdca8e9e6af92e2cbad79e360542effc2dd6b98d63955d8d8a3", + "sha256:83b8fc18261b70f45bece2d392537c93dc81eb6c539a16c9ac994c47fc79f09a", + "sha256:9473ad28375710ab18378e72b59422399b27e957e9339c413bf00793b4b12df0", + "sha256:95b085b253080e5d09f7826f5e27dce067bae813a132023a77b739614a29de6e", + "sha256:98b86c62c08c2e5dc98a9c856d4a95329d11b1c6058cb9b5191d5ea6891acd09", + "sha256:a3bd01d6d3ed3d7c06d7f9979ba5d68281f15383fafd53b81aa44b9191047cf8", + "sha256:c81a6afc1d2531a9ada50b58f8c36197f8418ef3d0611d4c1d7af93fdcda764f", + "sha256:ce75ed495a746e3e78cfa22a77096b3bff2eda995616cb7a542047f233091268", + "sha256:dae8618c0bcbfcf6cf91350f8abcdd84158323711566a8c5892b5c7f832af76f", + "sha256:df0b02c6705c5d1c25cc35c7b5d6b6f9b3b30833f9d178843397ae55ecc2eebb", + "sha256:e3660744cda0d94b90141cdd0db9308b958a372cfeee8d7188fdf5ad9108ea82", + "sha256:f2362d0ca3e16c37782c1054d7972b8ad2729169567e3f0f4e5dd3cdf85f188e" + ], + "version": "==1.15.1" + }, + "pandas": { + "hashes": [ + "sha256:11975fad9edbdb55f1a560d96f91830e83e29bed6ad5ebf506abda09818eaf60", + "sha256:12e13d127ca1b585dd6f6840d3fe3fa6e46c36a6afe2dbc5cb0b57032c902e31", + "sha256:1c87fcb201e1e06f66e23a61a5fea9eeebfe7204a66d99df24600e3f05168051", + "sha256:242e9900de758e137304ad4b5663c2eff0d798c2c3b891250bd0bd97144579da", + "sha256:26c903d0ae1542890cb9abadb4adcb18f356b14c2df46e4ff657ae640e3ac9e7", + "sha256:2e1e88f9d3e5f107b65b59cd29f141995597b035d17cc5537e58142038942e1a", + "sha256:31b7a48b344c14691a8e92765d4023f88902ba3e96e2e4d0364d3453cdfd50db", + "sha256:4fd07a932b4352f8a8973761ab4e84f965bf81cc750fb38e04f01088ab901cb8", + 
"sha256:5b24ca47acf69222e82530e89111dd9d14f9b970ab2cd3a1c2c78f0c4fbba4f4", + "sha256:647b3b916cc8f6aeba240c8171be3ab799c3c1b2ea179a3be0bd2712c4237553", + "sha256:66b060946046ca27c0e03e9bec9bba3e0b918bafff84c425ca2cc2e157ce121e", + "sha256:6efa9fa6e1434141df8872d0fa4226fc301b17aacf37429193f9d70b426ea28f", + "sha256:be4715c9d8367e51dbe6bc6d05e205b1ae234f0dc5465931014aa1c4af44c1ba", + "sha256:bea90da782d8e945fccfc958585210d23de374fa9294a9481ed2abcef637ebfc", + "sha256:d785fc08d6f4207437e900ffead930a61e634c5e4f980ba6d3dc03c9581748c7", + "sha256:de9559287c4fe8da56e8c3878d2374abc19d1ba2b807bfa7553e912a8e5ba87c", + "sha256:f4f98b190bb918ac0bc0e3dd2ab74ff3573da9f43106f6dba6385406912ec00f", + "sha256:f71f1a7e2d03758f6e957896ed696254e2bc83110ddbc6942018f1a232dd9dad", + "sha256:fb944c8f0b0ab5c1f7846c686bc4cdf8cde7224655c12edcd59d5212cd57bec0" + ], + "version": "==0.23.4" + }, + "python-dateutil": { + "hashes": [ + "sha256:1adb80e7a782c12e52ef9a8182bebeb73f1d7e24e374397af06fb4956c8dc5c0", + "sha256:e27001de32f627c22380a688bcc43ce83504a7bc5da472209b4c70f02829f0b8" + ], + "version": "==2.7.3" + }, + "pytz": { + "hashes": [ + "sha256:a061aa0a9e06881eb8b3b2b43f05b9439d6583c206d0a6c340ff72a7b6669053", + "sha256:ffb9ef1de172603304d9d2819af6f5ece76f2e85ec10692a524dd876e72bf277" + ], + "version": "==2018.5" + }, + "requests": { + "hashes": [ + "sha256:63b52e3c866428a224f97cab011de738c36aec0185aa91cfacd418b5d58911d1", + "sha256:ec22d826a36ed72a7358ff3fe56cbd4ba69dd7a6718ffd450ff0e9df7a47ce6a" + ], + "version": "==2.19.1" + }, + "s3transfer": { + "hashes": [ + "sha256:90dc18e028989c609146e241ea153250be451e05ecc0c2832565231dacdf59c1", + "sha256:c7a9ec356982d5e9ab2d4b46391a7d6a950e2b04c472419f5fdec70cc0ada72f" + ], + "version": "==0.1.13" + }, + "scikit-learn": { + "hashes": [ + "sha256:0a718b5ffbd5053fb3f9e1a2e20b7c4f256dd8035e246b907d3117d20bac0260", + "sha256:1725540b754a9967778e9385e1ee2c8db50d5ab70ed835c9f5e36002ffabc169", + 
"sha256:3e3ce307d7c5c5811658ba8686b24b571a8244eaafe707665ad601f400d5ce98", + "sha256:42ad71502237c9fe300ecf157f5a394df717789a2dde541dd7034b539c70bdcc", + "sha256:42cba716db197e0d1670e2fc13c4cc4a86d5c5358120ccfee6ec427b154e74ff", + "sha256:47b4090b7686642e41176becb7c42ef3cc665d7ee0db5e7ea5d307ec9779327e", + "sha256:51d99a08c8bf689cf60c9d8dca6e3d3e5f6d762def85ad735dcea11fb528a89b", + "sha256:5f7577fbb2399a4712e96cf0e786638168940a876c33735a1b5d5a86ba4b1370", + "sha256:66bfc2b6b15db1725d03ea657ec9184ff09dcbf1ecd834ef85f2edc2c9cbba97", + "sha256:69a34d389d9ca4687ad00af4e11d53686771f484c37366f68617ef656bab16ab", + "sha256:75297f3dd6685f01555f1bb75846995d45650af417280b69c81bf11b6987aed5", + "sha256:9ebb38ab1d0ee143982aed561811903ac6c1abb512ae2b9019b3b65bde63ffb9", + "sha256:a402c1484fe65df42d5dbc22a58e0695fe3afe2b0b229aee2a09c6d60ba8e5c2", + "sha256:aad6b9aac1617bd7efa0450643888bbd3410679a94bc8680d9863825686ef369", + "sha256:ad4db28d3dc16c01df75ed6efb72524537de3839a5d179fcf94094359fc72ec5", + "sha256:b276739a5f863ccacb61999a3067d0895ee291c95502929b2ae56ea1f882e888", + "sha256:b3dc88c4d2bcb26ffc5afe16d053ae28317d7d1de083651defcd5453a04f1563", + "sha256:b3e4681253e95da5aa5c231889a32b084fd997962bf8beda6f796bf422f734b2", + "sha256:c3d852d49d6c1710089d4513702099fa6f8e1aebfedf222319d80c47b0a195f8", + "sha256:c6612e7e43988b8b5e1957150449493a55f9c059de641083df7a964f86f2d1e7", + "sha256:c69e5c6051366a6ac9600d730276db939b1a205e42504ec0b8371f154b0058db", + "sha256:ce121baa8e85ec27c3065281657dcd78adaab7dcb046c7fe96ad4e5a9dcb6610", + "sha256:ed2a9a9bea6ec443b7effe5695c9c168b7bf9a67df6d880729760feda871b6a3", + "sha256:efd842d70b87e3ef3429c3149840b9189d4441ca951ab0cec62c94a964e219d9", + "sha256:f1428af5c381f6eef30ffbc7e047b7c713d4efa5d7bf5e57b62b3fc8d387044b", + "sha256:f6c7bf8cd4de1640b760b47f4d28deb26dbbf9acbe0194cdff54a898e190d872", + "sha256:f8329ac2160ad8bbbac6a507374685ceca3f24ca427fa9ee61a501280e1972d9", + "sha256:fefba2a43b92f8393366093b60efbe984a72a2b41cce16b4002005e4104ef938" 
+ ], + "version": "==0.19.2" + }, + "scipy": { + "hashes": [ + "sha256:0611ee97296265af4a21164a5323f8c1b4e8e15c582d3dfa7610825900136bb7", + "sha256:08237eda23fd8e4e54838258b124f1cd141379a5f281b0a234ca99b38918c07a", + "sha256:0e645dbfc03f279e1946cf07c9c754c2a1859cb4a41c5f70b25f6b3a586b6dbd", + "sha256:0e9bb7efe5f051ea7212555b290e784b82f21ffd0f655405ac4f87e288b730b3", + "sha256:108c16640849e5827e7d51023efb3bd79244098c3f21e4897a1007720cb7ce37", + "sha256:340ef70f5b0f4e2b4b43c8c8061165911bc6b2ad16f8de85d9774545e2c47463", + "sha256:3ad73dfc6f82e494195144bd3a129c7241e761179b7cb5c07b9a0ede99c686f3", + "sha256:3b243c77a822cd034dad53058d7c2abf80062aa6f4a32e9799c95d6391558631", + "sha256:404a00314e85eca9d46b80929571b938e97a143b4f2ddc2b2b3c91a4c4ead9c5", + "sha256:423b3ff76957d29d1cce1bc0d62ebaf9a3fdfaf62344e3fdec14619bb7b5ad3a", + "sha256:42d9149a2fff7affdd352d157fa5717033767857c11bd55aa4a519a44343dfef", + "sha256:625f25a6b7d795e8830cb70439453c9f163e6870e710ec99eba5722775b318f3", + "sha256:698c6409da58686f2df3d6f815491fd5b4c2de6817a45379517c92366eea208f", + "sha256:729f8f8363d32cebcb946de278324ab43d28096f36593be6281ca1ee86ce6559", + "sha256:8190770146a4c8ed5d330d5b5ad1c76251c63349d25c96b3094875b930c44692", + "sha256:878352408424dffaa695ffedf2f9f92844e116686923ed9aa8626fc30d32cfd1", + "sha256:8b984f0821577d889f3c7ca8445564175fb4ac7c7f9659b7c60bef95b2b70e76", + "sha256:8f841bbc21d3dad2111a94c490fb0a591b8612ffea86b8e5571746ae76a3deac", + "sha256:c22b27371b3866c92796e5d7907e914f0e58a36d3222c5d436ddd3f0e354227a", + "sha256:d0cdd5658b49a722783b8b4f61a6f1f9c75042d0e29a30ccb6cacc9b25f6d9e2", + "sha256:d40dc7f494b06dcee0d303e51a00451b2da6119acbeaccf8369f2d29e28917ac", + "sha256:d8491d4784aceb1f100ddb8e31239c54e4afab8d607928a9f7ef2469ec35ae01", + "sha256:dfc5080c38dde3f43d8fbb9c0539a7839683475226cf83e4b24363b227dfe552", + "sha256:e24e22c8d98d3c704bb3410bce9b69e122a8de487ad3dbfe9985d154e5c03a40", + "sha256:e7a01e53163818d56eabddcafdc2090e9daba178aad05516b20c6591c4811020", + 
"sha256:ee677635393414930541a096fc8e61634304bb0153e4e02b75685b11eba14cae", + "sha256:f0521af1b722265d824d6ad055acfe9bd3341765735c44b5a4d0069e189a0f40", + "sha256:f25c281f12c0da726c6ed00535ca5d1622ec755c30a3f8eafef26cf43fede694" + ], + "version": "==1.1.0" + }, + "six": { + "hashes": [ + "sha256:70e8a77beed4562e7f14fe23a786b54f6296e34344c23bc42f07b15018ff98e9", + "sha256:832dc0e10feb1aa2c68dcc57dbb658f1c7e65b9b61af69048abc87a2db00a0eb" + ], + "version": "==1.11.0" + }, + "sqlalchemy": { + "hashes": [ + "sha256:ef6569ad403520ee13e180e1bfd6ed71a0254192a934ec1dbd3dbf48f4aa9524" + ], + "version": "==1.2.11" + }, + "textblob": { + "hashes": [ + "sha256:7c9ff21a47a382fa4f235e84ce9be10cca4b9d46b012b79af6e47ea81b478a18", + "sha256:8301812cbef9b2f288e14df904854f7457fccf2c52020b66d3f9bc1448cf042a" + ], + "version": "==0.15.1" + }, + "urllib3": { + "hashes": [ + "sha256:a68ac5e15e76e7e5dd2b8f94007233e01effe3e50e8daddf69acfd81cb686baf", + "sha256:b5725a0bd4ba422ab0e66e89e030c806576753ea3ee08554382c14e685d117b5" + ], + "version": "==1.23" + } + }, + "develop": { + "atomicwrites": { + "hashes": [ + "sha256:0312ad34fcad8fac3704d441f7b317e50af620823353ec657a53e981f92920c0", + "sha256:ec9ae8adaae229e4f8446952d204a3e4b5fdd2d099f9be3aaf556120135fb3ee" + ], + "version": "==1.2.1" + }, + "attrs": { + "hashes": [ + "sha256:4b90b09eeeb9b88c35bc642cbac057e45a5fd85367b985bd2809c62b7b939265", + "sha256:e0d0eb91441a3b53dab4d9b743eafc1ac44476296a2053b6ca3af0b139faf87b" + ], + "version": "==18.1.0" + }, + "backcall": { + "hashes": [ + "sha256:38ecd85be2c1e78f77fd91700c76e14667dc21e2713b63876c0eb901196e01e4", + "sha256:bbbf4b1e5cd2bdb08f915895b51081c041bac22394fdfcfdfbe9f14b77c08bf2" + ], + "version": "==0.1.0" + }, + "colorama": { + "hashes": [ + "sha256:463f8483208e921368c9f306094eb6f725c6ca42b0f97e313cb5d5512459feda", + "sha256:48eb22f4f8461b1df5734a074b57042430fb06e1d61bd1e11b078c0fe6d7a1f1" + ], + "markers": "sys_platform == 'win32'", + "version": "==0.3.9" + }, + "decorator": { + "hashes": 
[ + "sha256:2c51dff8ef3c447388fe5e4453d24a2bf128d3a4c32af3fabef1f01c6851ab82", + "sha256:c39efa13fbdeb4506c476c9b3babf6a718da943dab7811c206005a4a956c080c" + ], + "version": "==4.3.0" + }, + "flake8": { + "hashes": [ + "sha256:7253265f7abd8b313e3892944044a365e3f4ac3fcdcfb4298f55ee9ddf188ba0", + "sha256:c7841163e2b576d435799169b78703ad6ac1bbb0f199994fc05f700b2a90ea37" + ], + "version": "==3.5.0" + }, + "ipython": { + "hashes": [ + "sha256:007dcd929c14631f83daff35df0147ea51d1af420da303fd078343878bd5fb62", + "sha256:b0f2ef9eada4a68ef63ee10b6dde4f35c840035c50fd24265f8052c98947d5a4" + ], + "index": "pypi", + "version": "==6.5.0" + }, + "ipython-genutils": { + "hashes": [ + "sha256:72dd37233799e619666c9f639a9da83c34013a73e8bbc79a7a6348d93c61fab8", + "sha256:eb2e116e75ecef9d4d228fdc66af54269afa26ab4463042e33785b887c628ba8" + ], + "version": "==0.2.0" + }, + "jedi": { + "hashes": [ + "sha256:b409ed0f6913a701ed474a614a3bb46e6953639033e31f769ca7581da5bd1ec1", + "sha256:c254b135fb39ad76e78d4d8f92765ebc9bf92cbc76f49e97ade1d5f5121e1f6f" + ], + "version": "==0.12.1" + }, + "mccabe": { + "hashes": [ + "sha256:ab8a6258860da4b6677da4bd2fe5dc2c659cff31b3ee4f7f5d64e79735b80d42", + "sha256:dd8d182285a0fe56bace7f45b5e7d1a6ebcbf524e8f3bd87eb0f125271b8831f" + ], + "version": "==0.6.1" + }, + "more-itertools": { + "hashes": [ + "sha256:c187a73da93e7a8acc0001572aebc7e3c69daf7bf6881a2cea10650bd4420092", + "sha256:c476b5d3a34e12d40130bc2f935028b5f636df8f372dc2c1c01dc19681b2039e", + "sha256:fcbfeaea0be121980e15bc97b3817b5202ca73d0eae185b4550cbfce2a3ebb3d" + ], + "version": "==4.3.0" + }, + "parso": { + "hashes": [ + "sha256:35704a43a3c113cce4de228ddb39aab374b8004f4f2407d070b6a2ca784ce8a2", + "sha256:895c63e93b94ac1e1690f5fdd40b65f07c8171e3e53cbd7793b5b96c0e0a7f24" + ], + "version": "==0.3.1" + }, + "pickleshare": { + "hashes": [ + "sha256:84a9257227dfdd6fe1b4be1319096c20eb85ff1e82c7932f36efccfe1b09737b", + "sha256:c9a2541f25aeabc070f12f452e1f2a8eae2abd51e1cd19e8430402bdf4c1d8b5" + ], + 
"version": "==0.7.4" + }, + "pluggy": { + "hashes": [ + "sha256:6e3836e39f4d36ae72840833db137f7b7d35105079aee6ec4a62d9f80d594dd1", + "sha256:95eb8364a4708392bae89035f45341871286a333f749c3141c20573d2b3876e1" + ], + "version": "==0.7.1" + }, + "prompt-toolkit": { + "hashes": [ + "sha256:1df952620eccb399c53ebb359cc7d9a8d3a9538cb34c5a1344bdbeb29fbcc381", + "sha256:3f473ae040ddaa52b52f97f6b4a493cfa9f5920c255a12dc56a7d34397a398a4", + "sha256:858588f1983ca497f1cf4ffde01d978a3ea02b01c8a26a8bbc5cd2e66d816917" + ], + "version": "==1.0.15" + }, + "py": { + "hashes": [ + "sha256:06a30435d058473046be836d3fc4f27167fd84c45b99704f2fb5509ef61f9af1", + "sha256:50402e9d1c9005d759426988a492e0edaadb7f4e68bcddfea586bc7432d009c6" + ], + "version": "==1.6.0" + }, + "pycodestyle": { + "hashes": [ + "sha256:682256a5b318149ca0d2a9185d365d8864a768a28db66a84a2ea946bcc426766", + "sha256:6c4245ade1edfad79c3446fadfc96b0de2759662dc29d07d80a6f27ad1ca6ba9" + ], + "version": "==2.3.1" + }, + "pyflakes": { + "hashes": [ + "sha256:08bd6a50edf8cffa9fa09a463063c425ecaaf10d1eb0335a7e8b1401aef89e6f", + "sha256:8d616a382f243dbf19b54743f280b80198be0bca3a5396f1d2e1fca6223e8805" + ], + "version": "==1.6.0" + }, + "pygments": { + "hashes": [ + "sha256:78f3f434bcc5d6ee09020f92ba487f95ba50f1e3ef83ae96b9d5ffa1bab25c5d", + "sha256:dbae1046def0efb574852fab9e90209b23f556367b5a320c0bcb871c77c3e8cc" + ], + "version": "==2.2.0" + }, + "pytest": { + "hashes": [ + "sha256:2d7c49e931316cc7d1638a3e5f54f5d7b4e5225972b3c9838f3584788d27f349", + "sha256:ad0c7db7b5d4081631e0155f5c61b80ad76ce148551aaafe3a718d65a7508b18" + ], + "version": "==3.7.4" + }, + "pytest-flake8": { + "hashes": [ + "sha256:4f30f5be3efb89755f38f11bdb2a5e22d19a6f5faa73428f703a3292a9572cd3", + "sha256:c740ad6aa19e3958947d2118f70bed218caf1d2097039fb7318573a2a72f89a1" + ], + "index": "pypi", + "version": "==1.0.2" + }, + "simplegeneric": { + "hashes": [ + "sha256:dc972e06094b9af5b855b3df4a646395e43d1c9d0d39ed345b7393560d0b9173" + ], + "version": "==0.8.1" + }, 
+ "six": { + "hashes": [ + "sha256:70e8a77beed4562e7f14fe23a786b54f6296e34344c23bc42f07b15018ff98e9", + "sha256:832dc0e10feb1aa2c68dcc57dbb658f1c7e65b9b61af69048abc87a2db00a0eb" + ], + "version": "==1.11.0" + }, + "traitlets": { + "hashes": [ + "sha256:9c4bd2d267b7153df9152698efb1050a5d84982d3384a37b2c1f7723ba3e7835", + "sha256:c6cb5e6f57c5a9bdaa40fa71ce7b4af30298fbab9ece9815b5d995ab6217c7d9" + ], + "version": "==4.3.2" + }, + "wcwidth": { + "hashes": [ + "sha256:3df37372226d6e63e1b1e1eda15c594bca98a22d33a23832a90998faa96bc65e", + "sha256:f4ebe71925af7b40a864553f761ed559b43544f8f71746c2d756c7fe788ade7c" + ], + "version": "==0.1.7" + } + } +} diff --git a/quantgov/__init__.py b/quantgov/__init__.py index c699782..37f7d26 100644 --- a/quantgov/__init__.py +++ b/quantgov/__init__.py @@ -1,16 +1,7 @@ from __future__ import (absolute_import, division, print_function, unicode_literals) -__all__ = [ - 'corpora', - 'corpus', - 'estimator', - 'project', - 'utils', -] - -from . import corpora # Backwards compatibility - +from . 
import corpus, nlp, ml, utils from .utils import load_driver -__version__ = '0.4.2' +__version__ = '0.5.0' diff --git a/quantgov/__main__.py b/quantgov/__main__.py index 22428fc..a57163f 100644 --- a/quantgov/__main__.py +++ b/quantgov/__main__.py @@ -11,11 +11,10 @@ import sys import zipfile -import joblib as jl import requests +import joblib as jl import quantgov -import quantgov.corpus.builtins from pathlib import Path @@ -37,11 +36,11 @@ def parse_args(): create.add_argument('path', type=Path) create.add_argument('--parent', default='master') - # Corpus command - corpus = subparsers.add_parser('corpus') - corpus_subcommands = corpus.add_subparsers(dest='subcommand') - for command, builtin in quantgov.corpus.builtins.commands.items(): - subcommand = corpus_subcommands.add_parser( + # NLP command + nlp_subparser = subparsers.add_parser('nlp') + nlp_subcommands = nlp_subparser.add_subparsers(dest='subcommand') + for command, builtin in quantgov.nlp.commands.items(): + subcommand = nlp_subcommands.add_parser( command, help=builtin.cli.help) subcommand.add_argument( 'corpus', help='Path to a QuantGov Corpus directory') @@ -56,21 +55,24 @@ def parse_args(): default=sys.stdout ) - # Estimator Command - estimator = subparsers.add_parser('estimator') - estimator_subcommands = estimator.add_subparsers(dest='subcommand') + # ML Command + ml_parser = subparsers.add_parser('ml') + ml_subcommands = ml_parser.add_subparsers(dest='subcommand') - # Estimator Evaluate - evaluate = estimator_subcommands.add_parser( + # ML Evaluate + evaluate = ml_subcommands.add_parser( 'evaluate', help='Evaluate candidate models') evaluate.add_argument( 'modeldefs', type=Path, help='python module containing candidate models' ) evaluate.add_argument( - 'trainers', type=jl.load, help='saved Trainers object') + 'trainers', + type=quantgov.ml.Trainers.load, + help='saved Trainers object' + ) evaluate.add_argument( - 'labels', type=jl.load, help='saved Labels object') + 'labels', 
type=quantgov.ml.Labels.load, help='saved Labels object') evaluate.add_argument( 'output_results', type=lambda x: open(x, 'w', encoding=ENCODE_OUT), @@ -86,31 +88,36 @@ def parse_args(): help='Number of folds for cross-validation') evaluate.add_argument('--scoring', default='f1', help='scoring method') - # Estimator Train - train = estimator_subcommands.add_parser('train', help='Train a model') + # ML Train + train = ml_subcommands.add_parser('train', help='Train a model') train.add_argument( 'modeldefs', type=Path, help='Python module containing candidate models' ) train.add_argument('configfile', help='Model configuration file') train.add_argument( - 'trainers', type=jl.load, help='saved Trainers object') + 'vectorizer', + type=jl.load, + help='saved Vectorizer object' + ) + train.add_argument( + 'trainers', + type=quantgov.ml.Trainers.load, + help='saved Trainers object' + ) train.add_argument( - 'labels', type=jl.load, help='saved Labels object') + 'labels', type=quantgov.ml.Labels.load, help='saved Labels object') train.add_argument( - '-o', '--outfile', help='location to save the trained model' + '-o', '--outfile', help='location to save the trained Estimator' ) - # Estimator Estimate - estimate = estimator_subcommands.add_parser( + # ML Estimate + estimate = ml_subcommands.add_parser( 'estimate', help='Estimate label values for a target corpus') estimate.add_argument( - 'vectorizer', type=jl.load, - help='joblib-saved scikit-learn vectorizer' - ) - estimate.add_argument( - 'model', type=jl.load, - help='saved Model object' + 'estimator', + type=quantgov.ml.Estimator.load, + help='saved Estimator object' ) estimate.add_argument( 'corpus', type=quantgov.load_driver, @@ -164,7 +171,7 @@ def start_component(args): def run_corpus_builtin(args): driver = quantgov.load_driver(args.corpus) writer = csv.writer(args.outfile) - builtin = quantgov.corpus.builtins.commands[args.subcommand] + builtin = quantgov.nlp.commands[args.subcommand] func_args = {i: j for i, j in 
vars(args).items() if i not in {'command', 'subcommand', 'outfile', 'corpus'}} writer.writerow(driver.index_labels + builtin.get_columns(func_args)) @@ -179,18 +186,43 @@ def run_corpus_builtin(args): def run_estimator(args): if args.subcommand == "evaluate": - quantgov.estimator.evaluate( + quantgov.ml.evaluate( args.modeldefs, args.trainers, args.labels, args.folds, args.scoring, args.output_results, args.output_suggestion ) elif args.subcommand == "train": - quantgov.estimator.train_and_save_model( - args.modeldefs, args.configfile, args.trainers, args.labels, - args.outfile) + quantgov.ml.train_and_save_model( + args.modeldefs, args.configfile, args.vectorizer, args.trainers, + args.labels, args.outfile) elif args.subcommand == "estimate": - quantgov.estimator.estimate( - args.vectorizer, args.model, args.corpus, args.probability, - args.precision, args.outfile + writer = csv.writer(args.outfile) + labels = args.corpus.index_labels + if args.probability: + if args.estimator.multilabel: + if args.estimator.multiclass: + writer.writerow(labels + ('label', 'class', 'probability')) + else: + writer.writerow(labels + ('label', 'probability')) + elif args.estimator.multiclass: + writer.writerow(labels + ('class', 'probability')) + else: + writer.writerow( + labels + ('{}_prob'.format(args.estimator.label_names[0]),) + ) + else: + if args.estimator.multilabel: + writer.writerow(labels + ('label', 'prediction')) + else: + writer.writerow( + labels + ('{}'.format(args.estimator.label_names[0]),) + ) + writer.writerows( + docidx + result for docidx, + result in quantgov.ml.estimate( + args.estimator, + args.corpus, + args.probability, + args.precision) ) @@ -198,8 +230,8 @@ def main(): args = parse_args() { 'start': start_component, - 'corpus': run_corpus_builtin, - 'estimator': run_estimator + 'nlp': run_corpus_builtin, + 'ml': run_estimator, }[args.command](args) diff --git a/quantgov/corpora/__init__.py b/quantgov/corpora/__init__.py deleted file mode 100644 index 
da8e2b1..0000000 --- a/quantgov/corpora/__init__.py +++ /dev/null @@ -1,18 +0,0 @@ -import warnings - -from ..corpus import ( - Document, - CorpusStreamer, - CorpusDriver, - FlatFileCorpusDriver, - RecursiveDirectoryCorpusDriver, - NamePatternCorpusDriver, - IndexDriver, - S3Driver, - S3DatabaseDriver -) - -warnings.warn( - ("quantgov.corpora has been moved to quantgov.corpus and will be removed" - " in a future version."), - DeprecationWarning) diff --git a/quantgov/corpus/structures.py b/quantgov/corpus.py similarity index 96% rename from quantgov/corpus/structures.py rename to quantgov/corpus.py index 08e325e..c43c728 100644 --- a/quantgov/corpus/structures.py +++ b/quantgov/corpus.py @@ -1,5 +1,5 @@ """ -quantgov.corpora.structures +quantgov.corpus Classes for Writing QuantGov Corpora """ @@ -13,7 +13,7 @@ from collections import namedtuple from pathlib import Path -from .. import utils as qgutils +from . import utils as qgutils try: import boto3 @@ -286,17 +286,10 @@ def __init__(self, index, bucket, encoding='utf-8', cache=True): super(IndexDriver, self).__init__( index_labels=index_labels, encoding=encoding, cache=cache) - def gen_indices_and_paths(self): - with self.index.open(encoding=self.encoding) as inf: - reader = csv.reader(inf) - next(reader) - for row in reader: - yield tuple(row[:-1]), row[-1] - def read(self, docinfo): idx, path = docinfo body = self.client.get_object(Bucket=self.bucket, - Key=str(path))['Body'] + Key=str(path).replace('\\', '/'))['Body'] return Document(idx, body.read().decode(self.encoding)) def filter(self, pattern): diff --git a/quantgov/corpus/__init__.py b/quantgov/corpus/__init__.py deleted file mode 100644 index f095957..0000000 --- a/quantgov/corpus/__init__.py +++ /dev/null @@ -1,11 +0,0 @@ -from .structures import ( - Document, - CorpusStreamer, - CorpusDriver, - FlatFileCorpusDriver, - RecursiveDirectoryCorpusDriver, - NamePatternCorpusDriver, - IndexDriver, - S3Driver, - S3DatabaseDriver -) diff --git 
a/quantgov/estimator/estimation.py b/quantgov/estimator/estimation.py deleted file mode 100644 index c77dd2d..0000000 --- a/quantgov/estimator/estimation.py +++ /dev/null @@ -1,244 +0,0 @@ -""" -quantgov.estimator.estimation - -Functionality for making predictions with an estimator -""" -import csv -import logging - -import sklearn.pipeline - -log = logging.getLogger(__name__) - - -def get_pipeline(vectorizer, model): - """ - Get the full estimation pipeline - - Arguments: - * vectorizer: a sklearn Vectorizer (or pipeline) - * model: a quantgov.estimator.Estimator - - Returns: a sklearn Pipeline - """ - return sklearn.pipeline.Pipeline(( - ('vectorizer', vectorizer), - ('model', model.model) - )) - - -def estimate_simple(vectorizer, model, streamer): - """ - Generate predictions for an estimator - - Arguments: - * vectorizer: a sklearn Vectorizer (or pipeline) - * model: a quantgov.estimator.Estimator - * streamer: a quantgov.corpora.CorpusStreamer - - Yields: - 2-tuples of docindex, prediction - - """ - pipeline = get_pipeline(vectorizer, model) - texts = (doc.text for doc in streamer) - yield from zip(streamer.index, pipeline.predict(texts)) - - -def estimate_probability(vectorizer, model, streamer, precision): - """ - Generate probabilities for a one-label estimator - - Arguments: - * vectorizer: a sklearn Vectorizer (or pipeline) - * model: a quantgov.estimator.Estimator - * streamer: a quantgov.corpora.CorpusStreamer - - Yields: - 2-tuples of docindex, probability - - """ - pipeline = get_pipeline(vectorizer, model) - texts = (doc.text for doc in streamer) - truecol = list(int(i) for i in model.model.classes_).index(1) - predicted = ( - i[truecol] for i in pipeline.predict_proba(texts).round(precision) - ) - yield from zip(streamer.index, predicted) - - -def estimate_probability_multilabel(vectorizer, model, streamer, precision): - """ - Generate probabilities for a multilabel binary estimator - - Arguments: - * vectorizer: a sklearn Vectorizer (or pipeline) - 
* model: a quantgov.estimator.Estimator - * streamer: a quantgov.corpora.CorpusStreamer - - Yields: - 2-tuples of docindex, probability - - """ - pipeline = get_pipeline(vectorizer, model) - texts = (doc.text for doc in streamer) - try: - truecols = tuple( - list(int(i) for i in label_classes).index(1) - for label_classes in model.model.classes_ - ) - except (AttributeError, TypeError): - truecols = tuple( - list(int(i) for i in label_classes).index(1) - for label_classes in ( - est.classes_ for est in model.model.steps[-1][-1].estimators_ - ) - ) - predicted = pipeline.predict_proba(texts) - try: - for i, docidx in enumerate(streamer.index): - yield docidx, tuple( - label_predictions[i, truecols[j]].round(int(precision)) - for j, label_predictions in enumerate(predicted)) - except IndexError: - yield from zip(streamer.index, predicted.round(int(precision))) - - -def estimate_probability_multiclass(vectorizer, model, streamer, precision): - """ - Generate probabilities for a one-label, multiclass estimator - - Arguments: - * vectorizer: a sklearn Vectorizer (or pipeline) - * model: a quantgov.estimator.Estimator - * streamer: a quantgov.corpora.CorpusStreamer - - Yields: - 2-tuples of docindex, probability - - """ - pipeline = get_pipeline(vectorizer, model) - texts = (doc.text for doc in streamer) - yield from zip( - streamer.index, - (i for i in pipeline.predict_proba(texts).round(precision)) - ) - - -def estimate_probability_multilabel_multiclass( - vectorizer, model, streamer, precision): - """ - Generate probabilities for a multilabel, multiclass estimator - - Arguments: - * vectorizer: a sklearn Vectorizer (or pipeline) - * model: a quantgov.estimator.Estimator - * streamer: a quantgov.corpora.CorpusStreamer - - Yields: - 2-tuples of docindex, probability - - """ - pipeline = get_pipeline(vectorizer, model) - texts = (doc.text for doc in streamer) - predicted = pipeline.predict_proba(texts) - for i, docidx in enumerate(streamer.index): - yield docidx, 
tuple(label_predictions[i] for label_predictions - in predicted.round(precision)) - - -def is_multiclass(classes): - """ - Returns True if values in classes are anything but 1, 0, True, or False, - otherwise returns False. - """ - try: - return len(set(int(i) for i in classes) - {0, 1}) != 0 - except ValueError: - return True - - -def estimate(vectorizer, model, corpus, probability, precision, outfile): - """ - Estimate label values for documents in corpus - - Arguments: - - * **vectorizer**: joblib-saved vectorizer - * **model**: saved `quantgov.estimator.Model` object - * **corpus**: path to a quantgov corpus - * **probability**: if True, predict probability - * **outfile**: open file object for writing results - """ - streamer = corpus.get_streamer() - writer = csv.writer(outfile) - if len(model.label_names) > 1: - multilabel = True - try: - multiclass = any(is_multiclass(i) for i in model.model.classes_) - except (AttributeError, TypeError): - multiclass = any( - is_multiclass(i.classes_) for i in - model.model.steps[-1][-1].estimators_ - ) - else: - multilabel = False - multiclass = is_multiclass(model.model.classes_) - - # TODO: This is very ugly and complicated and should probably be refactored - if probability: - if multilabel: - if multiclass: # Multilabel-multiclass probability - results = estimate_probability_multilabel_multiclass( - vectorizer, model, streamer, precision) - writer.writerow(corpus.index_labels + - ('label', 'class', 'probability')) - writer.writerows( - docidx + (label_name, class_name, prediction) - for docidx, predictions in results - for label_name, label_classes, label_predictions - in zip( - model.label_names, model.model.classes_, predictions) - for class_name, prediction - in zip(label_classes, label_predictions) - ) - else: # Multilabel probability - results = estimate_probability_multilabel( - vectorizer, model, streamer, precision) - writer.writerow(corpus.index_labels + ('label', 'probability')) - writer.writerows( - docidx + 
(label_name, prediction) - for docidx, predictions in results - for label_name, prediction - in zip(model.label_names, predictions) - ) - elif multiclass: # Multiclass probability - writer.writerow(corpus.index_labels + ('class', 'probability')) - results = estimate_probability_multiclass( - vectorizer, model, streamer, precision) - writer.writerows( - docidx + (class_name, prediction) - for docidx, predictions in results - for class_name, prediction in zip( - model.model.classes_, predictions) - ) - else: # Simple probability - results = estimate_probability( - vectorizer, model, streamer, precision) - writer.writerow( - corpus.index_labels + (model.label_names[0] + '_prob',)) - writer.writerows( - docidx + (prediction,) for docidx, prediction in results) - elif multilabel: # Multilabel Prediction - results = estimate_simple(vectorizer, model, streamer) - writer.writerow(corpus.index_labels + ('label', 'prediction')) - writer.writerows( - docidx + (label_name, prediction,) - for docidx, predictions in results - for label_name, prediction in zip(model.label_names, predictions) - ) - else: # Simple Prediction - results = estimate_simple(vectorizer, model, streamer) - writer.writerow(corpus.index_labels + model.label_names) - writer.writerows(docidx + (prediction,) - for docidx, prediction in results) diff --git a/quantgov/estimator/__init__.py b/quantgov/ml/__init__.py similarity index 95% rename from quantgov/estimator/__init__.py rename to quantgov/ml/__init__.py index 73ae5bf..77f8a1b 100644 --- a/quantgov/estimator/__init__.py +++ b/quantgov/ml/__init__.py @@ -9,7 +9,7 @@ from .structures import ( Labels, Trainers, - Model, + Estimator, CandidateModel ) diff --git a/quantgov/estimator/candidate_sets.py b/quantgov/ml/candidate_sets.py similarity index 89% rename from quantgov/estimator/candidate_sets.py rename to quantgov/ml/candidate_sets.py index 97978fa..3ad7817 100644 --- a/quantgov/estimator/candidate_sets.py +++ b/quantgov/ml/candidate_sets.py @@ -1,5 +1,5 
@@ """ -quantgov.estimator.candidate_sets: Starter model candidate sets +quantgov.ml.candidate_sets: Starter model candidate sets This module provides a few sample sets of models for common problems. These are @@ -18,10 +18,10 @@ import sklearn.pipeline import sklearn.feature_extraction -import quantgov.estimator +import quantgov.ml classification = [ - quantgov.estimator.CandidateModel( + quantgov.ml.CandidateModel( name="Random Forests", model=sklearn.pipeline.Pipeline(steps=( ('tfidf', sklearn.feature_extraction.text.TfidfTransformer()), @@ -31,7 +31,7 @@ 'rf__n_estimators': [5, 10, 25, 50, 100], } ), - quantgov.estimator.CandidateModel( + quantgov.ml.CandidateModel( name="Logistic Regression", model=sklearn.pipeline.Pipeline(steps=( ('tfidf', sklearn.feature_extraction.text.TfidfTransformer()), @@ -45,7 +45,7 @@ multilabel_classification = [ - quantgov.estimator.CandidateModel( + quantgov.ml.CandidateModel( name="Random Forests", model=sklearn.pipeline.Pipeline(steps=( ('tfidf', sklearn.feature_extraction.text.TfidfTransformer()), @@ -55,7 +55,7 @@ 'rf__n_estimators': [5, 10, 25, 50, 100], } ), - quantgov.estimator.CandidateModel( + quantgov.ml.CandidateModel( name="Logistic Regression", model=sklearn.pipeline.Pipeline(steps=( ('tfidf', sklearn.feature_extraction.text.TfidfTransformer()), diff --git a/quantgov/ml/estimation.py b/quantgov/ml/estimation.py new file mode 100644 index 0000000..84601f6 --- /dev/null +++ b/quantgov/ml/estimation.py @@ -0,0 +1,181 @@ +""" +quantgov.ml.estimation + +Functionality for making predictions with an estimator +""" +import logging + +log = logging.getLogger(__name__) + + +def estimate_simple(estimator, streamer): + """ + Generate predictions for a one-label estimator + + Arguments: + * estimator: a quantgov.ml.Estimator + * streamer: a quantgov.corpora.CorpusStreamer + + Yields: + 2-tuples of docindex, (prediction,) + + """ + texts = (doc.text for doc in streamer) + predicted = estimator.pipeline.predict(texts) + for docidx, 
prediction in zip(streamer.index, predicted): + yield docidx, (prediction,) + + +def estimate_multilabel(estimator, streamer): + """ + Generate predictions for a multi-label estimator + + Arguments: + * estimator: a quantgov.ml.Estimator + * streamer: a quantgov.corpora.CorpusStreamer + + Yields: + 2-tuples of docindex, (label, prediction,) + + """ + for docidx, (prediction,) in estimate_simple(estimator, streamer): + for label, label_prediction in zip(estimator.label_names, prediction): + yield docidx, (label, label_prediction) + + +def estimate_probability(estimator, streamer, precision): + """ + Generate probabilities for a one-label estimator + + Arguments: + * estimator: a quantgov.ml.Estimator + * streamer: a quantgov.corpora.CorpusStreamer + + Yields: + 2-tuples of docindex, (probability,) + + """ + texts = (doc.text for doc in streamer) + truecol = list(int(i) for i in estimator.pipeline.classes_).index(1) + predicted = ( + estimator.pipeline.predict_proba(texts)[:, truecol].round(precision)) + yield from zip(streamer.index, ((prob,) for prob in predicted)) + + +def estimate_probability_multilabel(estimator, streamer, precision): + """ + Generate probabilities for a multilabel binary estimator + + Arguments: + * estimator: a quantgov.ml.Estimator + * streamer: a quantgov.corpora.CorpusStreamer + + Yields: + 2-tuples of docindex, (label, probability) + + """ + texts = (doc.text for doc in streamer) + model = estimator.pipeline.steps[-1][1] + try: + truecols = tuple( + list(int(i) for i in label_classes).index(1) + for label_classes in model.classes_ + ) + except (AttributeError, TypeError): + truecols = tuple( + list(int(i) for i in label_classes).index(1) + for label_classes in ( + est.classes_ for est in model.steps[-1][1].estimators_ + ) + ) + predicted = estimator.pipeline.predict_proba(texts).round(int(precision)) + + try: + yield from ( + (docidx, (label, label_prediction[truecol])) + for docidx, doc_predictions in zip(streamer.index, predicted) + for 
label, label_prediction, truecol + in zip(estimator.label_names, doc_predictions, truecols) + ) + except IndexError: + yield from ( + (docidx, (label, label_prediction)) + for docidx, doc_predictions in zip(streamer.index, predicted) + for (label, label_prediction) + in zip(estimator.label_names, doc_predictions) + ) + + +def estimate_probability_multiclass(estimator, streamer, precision): + """ + Generate probabilities for a one-label, multiclass estimator + + Arguments: + * estimator: a quantgov.ml.Estimator + * streamer: a quantgov.corpora.CorpusStreamer + + Yields: + 2-tuples of docindex, (class, probability) + + """ + texts = (doc.text for doc in streamer) + probs = estimator.pipeline.predict_proba(texts).round(precision) + yield from ( + (docidx, (class_, probability)) + for docidx, doc_probs in zip(streamer.index, probs) + for class_, probability in zip(estimator.pipeline.classes_, doc_probs) + ) + + +def estimate_probability_multilabel_multiclass(estimator, streamer, precision): + """ + Generate probabilities for a multilabel, multiclass estimator + + Arguments: + * estimator: a quantgov.ml.Estimator + * streamer: a quantgov.corpora.CorpusStreamer + + Yields: + 2-tuples of docindex, (label, class, probability) + + """ + texts = (doc.text for doc in streamer) + probs = estimator.pipeline.predict_proba(texts) + yield from ( + (docidx, (label_name, class_, prob)) + for label_name, label_probs in zip(estimator.label_names, probs) + for docidx, doc_probs in zip(streamer.index, label_probs) + for class_, prob in zip(estimator.pipeline.classes_, doc_probs) + ) + + +def estimate(estimator, corpus, probability, precision=4): + """ + Estimate label values for documents in corpus + + Arguments: + + * **estimator**: a `quantgov.ml.Estimator` object + * **corpus**: a quantgov corpus + * **probability**: if True, predict probability + * **precision**: precision for probability prediction + """ + streamer = corpus.get_streamer() + if probability: + if 
estimator.multilabel: + if estimator.multiclass: # Multilabel-multiclass probability + yield from estimate_probability_multilabel_multiclass( + estimator, streamer, precision) + else: # Multilabel probability + yield from estimate_probability_multilabel( + estimator, streamer, precision) + elif estimator.multiclass: # Multiclass probability + yield from estimate_probability_multiclass( + estimator, streamer, precision) + else: # Simple probability + yield from estimate_probability( + estimator, streamer, precision) + elif estimator.multilabel: # Multilabel Prediction + yield from estimate_multilabel(estimator, streamer) + else: # Binary and Multiclass + yield from estimate_simple(estimator, streamer) diff --git a/quantgov/estimator/evaluation.py b/quantgov/ml/evaluation.py similarity index 92% rename from quantgov/estimator/evaluation.py rename to quantgov/ml/evaluation.py index 03b0914..e587053 100644 --- a/quantgov/estimator/evaluation.py +++ b/quantgov/ml/evaluation.py @@ -19,7 +19,7 @@ def evaluate_model(model, X, y, folds, scoring): Evaluate a single model Arguments: - * model: a quantgov.estimator.Model + * model: a quantgov.ml.CandidateModel * X: array-like of document vectors with shape [n_samples x n_features] * y: array-like of labels with shape [n_samples X n_labels] * folds: folds to use in cross-validation @@ -53,7 +53,7 @@ def evaluate_all_models(models, X, y, folds, scoring): Evaluate a number of models Arguments: - * models: a sequence of quantgov.estimator.Model objects + * models: a sequence of quantgov.ml.CandidateModel objects * X: array-like of document vectors with shape [n_samples x n_features] * y: array-like of labels with shape [n_samples X n_labels] * folds: folds to use in cross-validation @@ -104,10 +104,10 @@ def evaluate(modeldefs, trainers, labels, folds, scoring, results_file, Arguments: * **modeldefs**: Path to a python module containing a list of - `quantgov.estimator.CandidateModel` objects in a module-level + 
`quantgov.ml.CandidateModel` objects in a module-level variable named `models'. - * **trainers**: a `quantgov.estimator.Trainers` object - * **labels**: a `quantgov.estimator.Labels` object + * **trainers**: a `quantgov.ml.Trainers` object + * **labels**: a `quantgov.ml.Labels` object * **folds**: folds to use in cross-validation * **scoring**: scoring method to use * **results_file**: open file object to which results should be written diff --git a/quantgov/estimator/structures.py b/quantgov/ml/structures.py similarity index 53% rename from quantgov/estimator/structures.py rename to quantgov/ml/structures.py index 8ef59ea..7bd138a 100644 --- a/quantgov/estimator/structures.py +++ b/quantgov/ml/structures.py @@ -1,5 +1,5 @@ """ -quantgov.estimator.structures +quantgov.ml.structures Useful structures for evaluating and training estimators """ @@ -13,15 +13,28 @@ class _PersistanceMixin(object): object """ + @classmethod + def load(cls, path): + """ + Load a saved object at path `path` + """ + loaded = jl.load(path) + if not isinstance(loaded, cls): + raise ValueError( + 'Expected saved type {}, path {} contained saved type {}' + .format(cls, path, type(loaded)) + ) + return loaded + def save(self, path): """ - Use joblib to pickle the object. + Use joblib to save the object. Arguments: path: an open file object or string holding the path to where the object should be saved """ - jl.dump(self, path) + jl.dump(self, path, compress=True) class Labels( @@ -56,19 +69,45 @@ class Trainers( pass -class Model( - collections.namedtuple('Model', ['label_names', 'model']), +def is_multiclass(classes): + """ + Returns True if values in classes are anything but 1, 0, True, or False, + otherwise returns False. 
+ """ + try: + return len(set(int(i) for i in classes) - {0, 1}) != 0 + except ValueError: + return True + + +class Estimator( + collections.namedtuple('Estimator', ['label_names', 'pipeline']), _PersistanceMixin ): """ - A Trained model + A Trained estimator Arguments: * label_names: sequence of names for each label the model estimates - * model: a trained sklearn-like model, implementing `.fit`, - `.fit_transform`, and `.predict` methods + * pipeline: a trained sklearn-like pipeline, implementing `.fit`, + `.fit_transform`, and `.predict` methods, where the X inputs are a + sequence of strings. """ - pass + + def __init__(self, *args, **kwargs): + super().__init__() + self.multilabel = len(self.label_names) > 1 + model = self.pipeline.steps[-1][1] + if self.multilabel: + try: + self.multiclass = any(is_multiclass(i) for i in model.classes_) + except (AttributeError, TypeError): + self.multiclass = any( + is_multiclass(i.classes_) + for i in model.steps[-1][-1].estimators_ + ) + else: + self.multiclass = is_multiclass(model.classes_) class CandidateModel( diff --git a/quantgov/estimator/training.py b/quantgov/ml/training.py similarity index 58% rename from quantgov/estimator/training.py rename to quantgov/ml/training.py index 3d405b6..3ec45a3 100644 --- a/quantgov/estimator/training.py +++ b/quantgov/ml/training.py @@ -1,6 +1,8 @@ import configparser -import quantgov.estimator +import sklearn.pipeline + +import quantgov.ml def _autoconvert(value): @@ -23,29 +25,38 @@ def get_model(modeldefs, configfile): config.optionxform = str config.read(configfile) models = {i.name: i for i in - quantgov.estimator.utils.load_models(modeldefs)} + quantgov.ml.utils.load_models(modeldefs)} model = models[config['Model']['name']].model model.set_params( **{i: _autoconvert(j) for i, j in config['Parameters'].items()}) return model -def train_and_save_model(modeldefs, configfile, trainers, labels, outfile): +def train_and_save_model( + modeldefs, + configfile, + vectorizer, + 
trainers, + labels, + outfile): """ Train and save model described in config file Arguments: * **modeldefs**: Path to a python module containing a list of - `quantgov.estimator.CandidateModel` objects in a module-level + `quantgov.ml.CandidateModel` objects in a module-level variable named `models'. * **configfile**: config file as produced by - `quantgov estimator evaluate` - * **trainers**: a `quantgov.estimator.Trainers` object - * **labels**: a `quantgov.estimator.Labels` object + `quantgov ml evaluate` + * **vectorizer**: an sklearn-compatible Vectorizer object + * **trainers**: a `quantgov.ml.Trainers` object + * **labels**: a `quantgov.ml.Labels` object * **outfile**: file to which model should be saved """ - model = get_model(modeldefs, configfile) - model.fit(trainers.vectors, labels.labels) - quantgov.estimator.Model(labels.label_names, model).save(outfile) + pipeline = sklearn.pipeline.Pipeline(( + ('vectorizer', vectorizer), + ('model', get_model(modeldefs, configfile).fit(trainers.vectors, labels.labels)), + )) + quantgov.ml.Estimator(labels.label_names, pipeline).save(outfile) diff --git a/quantgov/estimator/utils.py b/quantgov/ml/utils.py similarity index 88% rename from quantgov/estimator/utils.py rename to quantgov/ml/utils.py index f275455..1d88dc5 100644 --- a/quantgov/estimator/utils.py +++ b/quantgov/ml/utils.py @@ -10,7 +10,7 @@ def load_models(path): Arguments: * **path**: Path to a python module containing a list of - `quantgov.estimator.CandidateModel` objects in a module-level + `quantgov.ml.CandidateModel` objects in a module-level """ path = Path(path).resolve() try: diff --git a/quantgov/corpus/builtins.py b/quantgov/nlp.py similarity index 91% rename from quantgov/corpus/builtins.py rename to quantgov/nlp.py index 5a5e42c..8022fd8 100644 --- a/quantgov/corpus/builtins.py +++ b/quantgov/nlp.py @@ -1,12 +1,13 @@ """ -quantgov.corpora.builtins: Functions for analyzing a single Document +quantgov.nlp: Text-based analysis of documents """ import re import collections 
import math from decorator import decorator -import quantgov + +from . import utils try: import nltk.corpus @@ -22,10 +23,7 @@ if NLTK: try: nltk.corpus.wordnet.ensure_loaded() - nltk.corpus.stopwords.ensure_loaded() except LookupError: - nltk.download('stopwords') - nltk.corpus.stopwords.ensure_loaded() nltk.download('wordnet') nltk.corpus.wordnet.ensure_loaded() @@ -48,10 +46,10 @@ def check_textblob(func, *args, **kwargs): class WordCounter(): - cli = quantgov.utils.CLISpec( + cli = utils.CLISpec( help='Word Counter', arguments=[ - quantgov.utils.CLIArg( + utils.CLIArg( flags=('--word_pattern', '-wp'), kwargs={ 'help': 'regular expression defining a "word"', @@ -76,17 +74,17 @@ def process_document(doc, word_pattern): class OccurrenceCounter(): - cli = quantgov.utils.CLISpec( + cli = utils.CLISpec( help="Term Counter for Specific Words", arguments=[ - quantgov.utils.CLIArg( + utils.CLIArg( flags=('terms'), kwargs={ 'help': 'list of terms to be counted', 'nargs': '+' } ), - quantgov.utils.CLIArg( + utils.CLIArg( flags=('--total_label'), kwargs={ 'metavar': 'LABEL', @@ -96,7 +94,7 @@ class OccurrenceCounter(): ), } ), - quantgov.utils.CLIArg( + utils.CLIArg( flags=('--pattern'), kwargs={ 'help': 'pattern to use in identifying words', @@ -134,10 +132,10 @@ def process_document(doc, terms, pattern, total_label): class ShannonEntropy(): lemmas = {} - cli = quantgov.utils.CLISpec( + cli = utils.CLISpec( help='Shannon Entropy', arguments=[ - quantgov.utils.CLIArg( + utils.CLIArg( flags=('--word_pattern', '-wp'), kwargs={ 'help': 'regular expression defining a "word"', @@ -145,7 +143,7 @@ class ShannonEntropy(): 'default': re.compile(r'\b\w+\b') } ), - quantgov.utils.CLIArg( + utils.CLIArg( flags=('--stopwords', '-sw'), kwargs={ 'help': 'stopwords to ignore', @@ -155,7 +153,7 @@ class ShannonEntropy(): ) } ), - quantgov.utils.CLIArg( + utils.CLIArg( flags=('--precision'), kwargs={ 'help': 'decimal places to round', @@ -200,7 +198,7 @@ def lemmatize(word): class 
ConditionalCounter(): - cli = quantgov.utils.CLISpec( + cli = utils.CLISpec( help=('Count conditional words and phrases. Included terms are: ' ' "if", "but", "except", "provided", "when", "where", ' '"whenever", "unless", "notwithstanding", "in the event", ' @@ -228,10 +226,10 @@ def process_document(doc): class SentenceLength(): - cli = quantgov.utils.CLISpec( + cli = utils.CLISpec( help='Sentence Length', arguments=[ - quantgov.utils.CLIArg( + utils.CLIArg( flags=('--precision'), kwargs={ 'help': 'decimal places to round', @@ -266,17 +264,17 @@ def process_document(doc, precision): class SentimentAnalysis(): - cli = quantgov.utils.CLISpec( + cli = utils.CLISpec( help='Performs sentiment analysis on the text', arguments=[ - quantgov.utils.CLIArg( + utils.CLIArg( flags=('--backend'), kwargs={ 'help': 'which program to use for the analysis', 'default': 'textblob' } ), - quantgov.utils.CLIArg( + utils.CLIArg( flags=('--precision'), kwargs={ 'help': 'decimal places to round', diff --git a/quantgov/project/__init__.py b/quantgov/project/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/quantgov/project/builtins.py b/quantgov/project/builtins.py deleted file mode 100644 index e69de29..0000000 diff --git a/setup.py b/setup.py index eb1c880..672fc3f 100644 --- a/setup.py +++ b/setup.py @@ -58,7 +58,6 @@ def find_version(*file_paths): 'requests', 'scikit-learn', 'scipy', - 'snakemake', ], extras_require={ 'testing': ['pytest-flake8'], diff --git a/tests/pseudo_corpus/driver.py b/tests/pseudo_corpus/driver.py index 45d8980..3b00174 100644 --- a/tests/pseudo_corpus/driver.py +++ b/tests/pseudo_corpus/driver.py @@ -2,7 +2,7 @@ from pathlib import Path -driver = quantgov.corpora.RecursiveDirectoryCorpusDriver( +driver = quantgov.corpus.RecursiveDirectoryCorpusDriver( directory=Path(__file__).parent.joinpath('data', 'clean'), index_labels=('file',) ) diff --git a/tests/pseudo_estimator/data/binary.qge b/tests/pseudo_estimator/data/binary.qge new file mode 
100644 index 0000000..ccf9ffa Binary files /dev/null and b/tests/pseudo_estimator/data/binary.qge differ diff --git a/tests/pseudo_estimator/data/model.pickle b/tests/pseudo_estimator/data/model.pickle deleted file mode 100644 index 2ffaac6..0000000 Binary files a/tests/pseudo_estimator/data/model.pickle and /dev/null differ diff --git a/tests/pseudo_estimator/data/modelmulticlass.pickle b/tests/pseudo_estimator/data/modelmulticlass.pickle deleted file mode 100644 index 2071d94..0000000 Binary files a/tests/pseudo_estimator/data/modelmulticlass.pickle and /dev/null differ diff --git a/tests/pseudo_estimator/data/multiclass.qge b/tests/pseudo_estimator/data/multiclass.qge new file mode 100644 index 0000000..1f47c87 Binary files /dev/null and b/tests/pseudo_estimator/data/multiclass.qge differ diff --git a/tests/pseudo_estimator/data/vectorizer.pickle b/tests/pseudo_estimator/data/vectorizer.pickle deleted file mode 100644 index 0fdaee6..0000000 Binary files a/tests/pseudo_estimator/data/vectorizer.pickle and /dev/null differ diff --git a/tests/test_estimator.py b/tests/test_ml.py similarity index 68% rename from tests/test_estimator.py rename to tests/test_ml.py index 07876d5..042ddbb 100644 --- a/tests/test_estimator.py +++ b/tests/test_ml.py @@ -1,5 +1,5 @@ import pytest -import quantgov.estimator +import quantgov.ml import subprocess from pathlib import Path @@ -21,9 +21,8 @@ def check_output(cmd): def test_simple_estimator(): output = check_output( - ['quantgov', 'estimator', 'estimate', - str(PSEUDO_ESTIMATOR_PATH.joinpath('data', 'vectorizer.pickle')), - str(PSEUDO_ESTIMATOR_PATH.joinpath('data', 'model.pickle')), + ['quantgov', 'ml', 'estimate', + str(PSEUDO_ESTIMATOR_PATH.joinpath('data', 'binary.qge')), str(PSEUDO_CORPUS_PATH)] ) assert output == 'file,is_world\ncfr,False\nmoby,False\n' @@ -31,9 +30,8 @@ def test_simple_estimator(): def test_probability_estimator(): output = check_output( - ['quantgov', 'estimator', 'estimate', - 
str(PSEUDO_ESTIMATOR_PATH.joinpath('data', 'vectorizer.pickle')), - str(PSEUDO_ESTIMATOR_PATH.joinpath('data', 'model.pickle')), + ['quantgov', 'ml', 'estimate', + str(PSEUDO_ESTIMATOR_PATH.joinpath('data', 'binary.qge')), str(PSEUDO_CORPUS_PATH), '--probability'] ) assert output == ('file,is_world_prob\ncfr,0.0899\nmoby,0.0216\n') @@ -41,9 +39,8 @@ def test_probability_estimator(): def test_probability_estimator_6decimals(): output = check_output( - ['quantgov', 'estimator', 'estimate', - str(PSEUDO_ESTIMATOR_PATH.joinpath('data', 'vectorizer.pickle')), - str(PSEUDO_ESTIMATOR_PATH.joinpath('data', 'model.pickle')), + ['quantgov', 'ml', 'estimate', + str(PSEUDO_ESTIMATOR_PATH.joinpath('data', 'binary.qge')), str(PSEUDO_CORPUS_PATH), '--probability', '--precision', '6'] ) assert output == ('file,is_world_prob\ncfr,0.089898\nmoby,0.02162\n') @@ -51,9 +48,8 @@ def test_probability_estimator_6decimals(): def test_multiclass_probability_estimator(): output = check_output( - ['quantgov', 'estimator', 'estimate', - str(PSEUDO_ESTIMATOR_PATH.joinpath('data', 'vectorizer.pickle')), - str(PSEUDO_ESTIMATOR_PATH.joinpath('data', 'modelmulticlass.pickle')), + ['quantgov', 'ml', 'estimate', + str(PSEUDO_ESTIMATOR_PATH.joinpath('data', 'multiclass.qge')), str(PSEUDO_CORPUS_PATH), '--probability'] ) assert output == ('file,class,probability\n' diff --git a/tests/test_corpora.py b/tests/test_nlp.py similarity index 82% rename from tests/test_corpora.py rename to tests/test_nlp.py index 488fca9..db14cf5 100644 --- a/tests/test_corpora.py +++ b/tests/test_nlp.py @@ -35,7 +35,7 @@ def build_index_corpus(directory): with index_path.open('w', encoding='utf-8') as outf: outf.write('letter,number,path\n') outf.write('\n'.join(','.join(row) for row in rows)) - return quantgov.corpora.IndexDriver(str(index_path)) + return quantgov.corpus.IndexDriver(str(index_path)) def build_s3_corpus(directory): @@ -49,8 +49,8 @@ def build_s3_corpus(directory): with index_path.open('w', encoding='utf-8') 
as outf: outf.write('letter,number,path\n') outf.write('\n'.join(','.join(row) for row in rows)) - return quantgov.corpora.S3Driver(str(index_path), - bucket='quantgov-databanks') + return quantgov.corpus.S3Driver(str(index_path), + bucket='quantgov-databanks') BUILDERS = { @@ -107,14 +107,14 @@ def check_output(cmd): def test_wordcount(): output = check_output( - ['quantgov', 'corpus', 'count_words', str(PSEUDO_CORPUS_PATH)], + ['quantgov', 'nlp', 'count_words', str(PSEUDO_CORPUS_PATH)], ) assert output == 'file,words\ncfr,349153\nmoby,216645\n' def test_wordcount_pattern(): output = check_output( - ['quantgov', 'corpus', 'count_words', str(PSEUDO_CORPUS_PATH), + ['quantgov', 'nlp', 'count_words', str(PSEUDO_CORPUS_PATH), '--word_pattern', '\S+'] ) assert output == 'file,words\ncfr,333237\nmoby,210130\n' @@ -122,7 +122,7 @@ def test_wordcount_pattern(): def test_termcount(): output = check_output( - ['quantgov', 'corpus', 'count_occurrences', str(PSEUDO_CORPUS_PATH), + ['quantgov', 'nlp', 'count_occurrences', str(PSEUDO_CORPUS_PATH), 'shall'], ) assert output == 'file,shall\ncfr,1946\nmoby,94\n' @@ -130,7 +130,7 @@ def test_termcount(): def test_termcount_multiple(): output = check_output( - ['quantgov', 'corpus', 'count_occurrences', str(PSEUDO_CORPUS_PATH), + ['quantgov', 'nlp', 'count_occurrences', str(PSEUDO_CORPUS_PATH), 'shall', 'must', 'may not'], ) assert output == ('file,shall,must,may not\n' @@ -139,7 +139,7 @@ def test_termcount_multiple(): def test_termcount_multiple_with_label(): output = check_output( - ['quantgov', 'corpus', 'count_occurrences', str(PSEUDO_CORPUS_PATH), + ['quantgov', 'nlp', 'count_occurrences', str(PSEUDO_CORPUS_PATH), 'shall', 'must', 'may not', '--total_label', 'allofthem'], ) assert output == ('file,shall,must,may not,allofthem\n' @@ -148,14 +148,14 @@ def test_termcount_multiple_with_label(): def test_shannon_entropy(): output = check_output( - ['quantgov', 'corpus', 'shannon_entropy', str(PSEUDO_CORPUS_PATH)], + ['quantgov', 
'nlp', 'shannon_entropy', str(PSEUDO_CORPUS_PATH)], ) assert output == 'file,shannon_entropy\ncfr,10.71\nmoby,11.81\n' def test_shannon_entropy_no_stopwords(): output = check_output( - ['quantgov', 'corpus', 'shannon_entropy', str(PSEUDO_CORPUS_PATH), + ['quantgov', 'nlp', 'shannon_entropy', str(PSEUDO_CORPUS_PATH), '--stopwords', 'None'], ) assert output == 'file,shannon_entropy\ncfr,9.52\nmoby,10.03\n' @@ -163,7 +163,7 @@ def test_shannon_entropy_no_stopwords(): def test_shannon_entropy_4decimals(): output = check_output( - ['quantgov', 'corpus', 'shannon_entropy', str(PSEUDO_CORPUS_PATH), + ['quantgov', 'nlp', 'shannon_entropy', str(PSEUDO_CORPUS_PATH), '--precision', '4'], ) assert output == 'file,shannon_entropy\ncfr,10.7127\nmoby,11.813\n' @@ -171,21 +171,21 @@ def test_shannon_entropy_4decimals(): def test_conditionalcount(): output = check_output( - ['quantgov', 'corpus', 'count_conditionals', str(PSEUDO_CORPUS_PATH)], + ['quantgov', 'nlp', 'count_conditionals', str(PSEUDO_CORPUS_PATH)], ) assert output == 'file,conditionals\ncfr,2132\nmoby,2374\n' def test_sentencelength(): output = check_output( - ['quantgov', 'corpus', 'sentence_length', str(PSEUDO_CORPUS_PATH)], + ['quantgov', 'nlp', 'sentence_length', str(PSEUDO_CORPUS_PATH)], ) assert output == 'file,sentence_length\ncfr,18.68\nmoby,25.09\n' def test_sentencelength_4decimals(): output = check_output( - ['quantgov', 'corpus', 'sentence_length', str(PSEUDO_CORPUS_PATH), + ['quantgov', 'nlp', 'sentence_length', str(PSEUDO_CORPUS_PATH), '--precision', '4'], ) assert output == 'file,sentence_length\ncfr,18.6827\nmoby,25.0936\n' @@ -193,7 +193,7 @@ def test_sentencelength_4decimals(): def test_sentiment_analysis(): output = check_output( - ['quantgov', 'corpus', 'sentiment_analysis', str(PSEUDO_CORPUS_PATH)], + ['quantgov', 'nlp', 'sentiment_analysis', str(PSEUDO_CORPUS_PATH)], ) assert output == ('file,sentiment_polarity,sentiment_subjectivity' '\ncfr,0.01,0.42\nmoby,0.08,0.48\n') @@ -201,7 +201,7 @@ def 
test_sentiment_analysis(): def test_sentiment_analysis_4decimals(): output = check_output( - ['quantgov', 'corpus', 'sentiment_analysis', str(PSEUDO_CORPUS_PATH), + ['quantgov', 'nlp', 'sentiment_analysis', str(PSEUDO_CORPUS_PATH), '--precision', '4'], ) assert output == ('file,sentiment_polarity,sentiment_subjectivity'