diff --git a/.coveragerc b/.coveragerc new file mode 100644 index 000000000..78c0b0b3a --- /dev/null +++ b/.coveragerc @@ -0,0 +1,16 @@ +# .coveragerc to control coverage.py +# We usually use this as "nose2 --with-coverage" +# see https://coverage.readthedocs.io/en/coverage-4.2/config.html + +[run] + +omit = + tests/legacy_comparison/* + wsgi.py + populate_test_database.py + + +[report] + +# Make nose2 exit with a failure if coverage is under this percentage +fail_under = 80 \ No newline at end of file diff --git a/.gitignore b/.gitignore index ea14e0647..e7701474f 100644 --- a/.gitignore +++ b/.gitignore @@ -66,7 +66,7 @@ instance/ .scrapy # Sphinx documentation -docs/_build/ +docs/source/_build/ # PyBuilder target/ @@ -113,6 +113,7 @@ mypy_test_errors.txt arxiv-browse.iml .vscode *~ +TAGS # DB connection, etc private_vars.sh diff --git a/.travis.yml b/.travis.yml index 9385b7dfd..c9ed5ccba 100644 --- a/.travis.yml +++ b/.travis.yml @@ -10,7 +10,7 @@ python: script: - pip install pipenv - pipenv sync --dev - - pipenv run nose2 --with-coverage + - pipenv run nose2 --with-coverage --coverage-config .coveragerc - tests/lint.sh - tests/docstyle.sh after_success: diff --git a/Pipfile b/Pipfile index ca954c137..458f9a556 100644 --- a/Pipfile +++ b/Pipfile @@ -4,30 +4,33 @@ verify_ssl = true name = "pypi" [packages] -arxiv-base = "==0.12.1rc2" dataclasses = "*" python-dateutil = "*" -flask = "==0.12.*" Flask-API = "*" Flask-SQLAlchemy = "*" -"jinja2" = "==2.10" MarkupSafe = "*" SQLAlchemy = "*" pytz = "*" -mysqlclient = "*" +mysqlclient = "==1.4.1" "mmh3" = "*" aiohttp = "*" +flask = "==1.0.2" +arxiv-base = "==0.15.5" +validators = "*" +mypy-extensions = "*" +flask-wtf = "*" +arxiv-auth = "==0.3.1" +mypy = "*" +jinja2 = "==2.10.1" [dev-packages] pylama = "*" -mypy = ">=0.630" -mypy-extensions = "*" "nose2" = "*" sqlacodegen = "*" Flask-Testing = "*" pycodestyle = "*" pydocstyle = "*" -pylint = "*" +pylint = "==2.3.0" pytest = "*" pytest-html = "*" "ansi2html" = "*" @@ -38,6 +41,8 @@ 
pytest-easyread = "*" weighted-levenshtein = "*" coverage = "*" coveralls = "*" +sphinx = "*" +sphinx-autodoc-typehints = "*" [requires] python_version = "3.6" diff --git a/Pipfile.lock b/Pipfile.lock index f3f61ec4b..72060f55a 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "55cdcb1d2ddc9401cedfa6eff75f5f36404a944c4d4b0c9565980378520571a5" + "sha256": "6f3c2774f9cc150288cc1199410a4e0fcf840fb8eeadecf9dcda001172b4d4e6" }, "pipfile-spec": 6, "requires": { @@ -44,12 +44,19 @@ "index": "pypi", "version": "==3.5.4" }, + "arxiv-auth": { + "hashes": [ + "sha256:c65b73aadae3c2a7267838fadf0cc0c07a83a8f53b87b57f5d401625586e212b" + ], + "index": "pypi", + "version": "==0.3.1" + }, "arxiv-base": { "hashes": [ - "sha256:9896b4f54d4a5e20c5f7ab7e8c65aa022781f8eba149b9730896fe22a44e0535" + "sha256:1fedc705e0a3601d22665cd36d5a83f2dc855a40c8de89858d23914b1421b90e" ], "index": "pypi", - "version": "==0.12.1rc2" + "version": "==0.15.5" }, "async-timeout": { "hashes": [ @@ -60,24 +67,37 @@ }, "attrs": { "hashes": [ - "sha256:10cbf6e27dbce8c30807caf056c8eb50917e0eaafe86347671b57254006c3e69", - "sha256:ca4be454458f9dec299268d472aaa5a11f67a4ff70093396e1ceae9c76cf4bbb" + "sha256:69c0dbf2ed392de1cb5ec704444b08a5ef81680a61cb899dc08127123af36a79", + "sha256:f0b870f674851ecbfbbbd364d6b5cbdff9dcedbc7f3f5e18a6891057f21fe399" + ], + "version": "==19.1.0" + }, + "backports-datetime-fromisoformat": { + "hashes": [ + "sha256:9577a2a9486cd7383a5f58b23bb8e81cf0821dbbc0eb7c87d3fa198c1df40f5c" + ], + "version": "==1.0.0" + }, + "bleach": { + "hashes": [ + "sha256:213336e49e102af26d9cde77dd2d0397afabc5a6bf2fed985dc35b5d1e285a16", + "sha256:3fdf7f77adcf649c9911387df51254b813185e32b2c6619f690b593a617e19fa" ], - "version": "==18.2.0" + "version": "==3.1.0" }, "boto3": { "hashes": [ - "sha256:d494043f4fa833c14ca553dd27dc9fd714390215783ff88d2b5597dbc801e779", - "sha256:ff4f0d48b7f6e7fcc3503597e1225c8413d83a43afbf3940f9fcb5a019f3c327" + 
"sha256:35e23af3fcb0d38def987e1e4fc0652dd654b3eb0e4c9c8b2869cdaf289fbfa7", + "sha256:603572f3824be5efc683b4c2327e2ef871d52158790b16ff754cb76b356ec3b5" ], - "version": "==1.9.90" + "version": "==1.9.132" }, "botocore": { "hashes": [ - "sha256:5a3dd9fe7cc5a5dab62016108180c58444e9025a3edd7b4b76b3573db1fcebe4", - "sha256:c6d4ffcf6c152b3224f16d59f92d938a7375c8b185bb08165a001e9bc59f95cc" + "sha256:35c46cf79cbd7fa9b25bee74972e77756e6a584beab10f7ff52abba308e5149e", + "sha256:f7200836d7dd77feae3af9c2e9d7769f69fb8d0ef760da635f3cb5c2ddcb8ade" ], - "version": "==1.12.90" + "version": "==1.12.132" }, "chardet": { "hashes": [ @@ -101,6 +121,13 @@ "index": "pypi", "version": "==0.6" }, + "decorator": { + "hashes": [ + "sha256:86156361c50488b84a3f148056ea716ca587df2f0de1d34750d35c21312725de", + "sha256:f069f3a01830ca754ba5258fde2278454a0b5b79e0d7f5c13b3b97e57d4acff6" + ], + "version": "==4.4.0" + }, "docutils": { "hashes": [ "sha256:02aec4bd92ab067f6ff27a38a38a41173bf01bed8f89157768c1573f53e474a6", @@ -111,11 +138,11 @@ }, "flask": { "hashes": [ - "sha256:2ea22336f6d388b4b242bc3abf8a01244a8aa3e236e7407469ef78c16ba355dd", - "sha256:6c02dbaa5a9ef790d8219bdced392e2d549c10cd5a5ba4b6aa65126b2271af29" + "sha256:2271c0070dbcb5275fad4a82e29f23ab92682dc45f9dfbc22c02ba9b9322ce48", + "sha256:a080b744b7e345ccfcbc77954861cb05b3c63786e93f2b3875e0913d44b43f05" ], "index": "pypi", - "version": "==0.12.4" + "version": "==1.0.2" }, "flask-api": { "hashes": [ @@ -133,6 +160,14 @@ "index": "pypi", "version": "==2.3.2" }, + "flask-wtf": { + "hashes": [ + "sha256:5d14d55cfd35f613d99ee7cba0fc3fbbe63ba02f544d349158c14ca15561cc36", + "sha256:d9a9e366b32dcbb98ef17228e76be15702cd2600675668bca23f63a7947fd5ac" + ], + "index": "pypi", + "version": "==0.14.2" + }, "idna": { "hashes": [ "sha256:c357b3f628cf53ae2c4c05627ecc484553142ca23264e593d327bcde5e9c3407", @@ -156,59 +191,59 @@ }, "jinja2": { "hashes": [ - "sha256:74c935a1b8bb9a3947c50a54766a969d4846290e1e788ea44c1392163723c3bd", - 
"sha256:f84be1bb0040caca4cea721fcbbbbd61f9be9464ca236387158b0feea01914a4" + "sha256:065c4f02ebe7f7cf559e49ee5a95fb800a9e4528727aec6f24402a5374c65013", + "sha256:14dd6caf1527abb21f08f86c784eac40853ba93edb79552aa1e4b8aef1b61c7b" ], "index": "pypi", - "version": "==2.10" + "version": "==2.10.1" }, "jmespath": { "hashes": [ - "sha256:6a81d4c9aa62caf061cb517b4d9ad1dd300374cd4706997aff9cd6aedd61fc64", - "sha256:f11b4461f425740a1d908e9a3f7365c3d2e569f6ca68a2ff8bc5bcd9676edd63" + "sha256:3720a4b1bd659dd2eecad0666459b9788813e032b83e7ba58578e48254e0a0e6", + "sha256:bde2aef6f44302dfb30320115b17d030798de8c4110e28d5cf6cf91a7a31074c" ], - "version": "==0.9.3" + "version": "==0.9.4" }, "jsonschema": { "hashes": [ - "sha256:000e68abd33c972a5248544925a0cae7d1125f9bf6c58280d37546b946769a08", - "sha256:6ff5f3180870836cae40f06fa10419f557208175f13ad7bc26caa77beb1f6e02" + "sha256:0c0a81564f181de3212efa2d17de1910f8732fa1b71c42266d983cd74304e20d", + "sha256:a5f6559964a3851f59040d3b961de5e68e70971afb88ba519d27e6a039efff1a" ], - "version": "==2.6.0" + "version": "==3.0.1" }, "markupsafe": { "hashes": [ - "sha256:048ef924c1623740e70204aa7143ec592504045ae4429b59c30054cb31e3c432", - "sha256:130f844e7f5bdd8e9f3f42e7102ef1d49b2e6fdf0d7526df3f87281a532d8c8b", - "sha256:19f637c2ac5ae9da8bfd98cef74d64b7e1bb8a63038a3505cd182c3fac5eb4d9", - "sha256:1b8a7a87ad1b92bd887568ce54b23565f3fd7018c4180136e1cf412b405a47af", - "sha256:1c25694ca680b6919de53a4bb3bdd0602beafc63ff001fea2f2fc16ec3a11834", - "sha256:1f19ef5d3908110e1e891deefb5586aae1b49a7440db952454b4e281b41620cd", - "sha256:1fa6058938190ebe8290e5cae6c351e14e7bb44505c4a7624555ce57fbbeba0d", - "sha256:31cbb1359e8c25f9f48e156e59e2eaad51cd5242c05ed18a8de6dbe85184e4b7", - "sha256:3e835d8841ae7863f64e40e19477f7eb398674da6a47f09871673742531e6f4b", - "sha256:4e97332c9ce444b0c2c38dd22ddc61c743eb208d916e4265a2a3b575bdccb1d3", - "sha256:525396ee324ee2da82919f2ee9c9e73b012f23e7640131dd1b53a90206a0f09c", - 
"sha256:52b07fbc32032c21ad4ab060fec137b76eb804c4b9a1c7c7dc562549306afad2", - "sha256:52ccb45e77a1085ec5461cde794e1aa037df79f473cbc69b974e73940655c8d7", - "sha256:5c3fbebd7de20ce93103cb3183b47671f2885307df4a17a0ad56a1dd51273d36", - "sha256:5e5851969aea17660e55f6a3be00037a25b96a9b44d2083651812c99d53b14d1", - "sha256:5edfa27b2d3eefa2210fb2f5d539fbed81722b49f083b2c6566455eb7422fd7e", - "sha256:7d263e5770efddf465a9e31b78362d84d015cc894ca2c131901a4445eaa61ee1", - "sha256:83381342bfc22b3c8c06f2dd93a505413888694302de25add756254beee8449c", - "sha256:857eebb2c1dc60e4219ec8e98dfa19553dae33608237e107db9c6078b1167856", - "sha256:98e439297f78fca3a6169fd330fbe88d78b3bb72f967ad9961bcac0d7fdd1550", - "sha256:bf54103892a83c64db58125b3f2a43df6d2cb2d28889f14c78519394feb41492", - "sha256:d9ac82be533394d341b41d78aca7ed0e0f4ba5a2231602e2f05aa87f25c51672", - "sha256:e982fe07ede9fada6ff6705af70514a52beb1b2c3d25d4e873e82114cf3c5401", - "sha256:edce2ea7f3dfc981c4ddc97add8a61381d9642dc3273737e756517cc03e84dd6", - "sha256:efdc45ef1afc238db84cb4963aa689c0408912a0239b0721cb172b4016eb31d6", - "sha256:f137c02498f8b935892d5c0172560d7ab54bc45039de8805075e19079c639a9c", - "sha256:f82e347a72f955b7017a39708a3667f106e6ad4d10b25f237396a7115d8ed5fd", - "sha256:fb7c206e01ad85ce57feeaaa0bf784b97fa3cad0d4a5737bc5295785f5c613a1" + "sha256:00bc623926325b26bb9605ae9eae8a215691f33cae5df11ca5424f06f2d1f473", + "sha256:09027a7803a62ca78792ad89403b1b7a73a01c8cb65909cd876f7fcebd79b161", + "sha256:09c4b7f37d6c648cb13f9230d847adf22f8171b1ccc4d5682398e77f40309235", + "sha256:1027c282dad077d0bae18be6794e6b6b8c91d58ed8a8d89a89d59693b9131db5", + "sha256:24982cc2533820871eba85ba648cd53d8623687ff11cbb805be4ff7b4c971aff", + "sha256:29872e92839765e546828bb7754a68c418d927cd064fd4708fab9fe9c8bb116b", + "sha256:43a55c2930bbc139570ac2452adf3d70cdbb3cfe5912c71cdce1c2c6bbd9c5d1", + "sha256:46c99d2de99945ec5cb54f23c8cd5689f6d7177305ebff350a58ce5f8de1669e", + "sha256:500d4957e52ddc3351cabf489e79c91c17f6e0899158447047588650b5e69183", 
+ "sha256:535f6fc4d397c1563d08b88e485c3496cf5784e927af890fb3c3aac7f933ec66", + "sha256:62fe6c95e3ec8a7fad637b7f3d372c15ec1caa01ab47926cfdf7a75b40e0eac1", + "sha256:6dd73240d2af64df90aa7c4e7481e23825ea70af4b4922f8ede5b9e35f78a3b1", + "sha256:717ba8fe3ae9cc0006d7c451f0bb265ee07739daf76355d06366154ee68d221e", + "sha256:79855e1c5b8da654cf486b830bd42c06e8780cea587384cf6545b7d9ac013a0b", + "sha256:7c1699dfe0cf8ff607dbdcc1e9b9af1755371f92a68f706051cc8c37d447c905", + "sha256:88e5fcfb52ee7b911e8bb6d6aa2fd21fbecc674eadd44118a9cc3863f938e735", + "sha256:8defac2f2ccd6805ebf65f5eeb132adcf2ab57aa11fdf4c0dd5169a004710e7d", + "sha256:98c7086708b163d425c67c7a91bad6e466bb99d797aa64f965e9d25c12111a5e", + "sha256:9add70b36c5666a2ed02b43b335fe19002ee5235efd4b8a89bfcf9005bebac0d", + "sha256:9bf40443012702a1d2070043cb6291650a0841ece432556f784f004937f0f32c", + "sha256:ade5e387d2ad0d7ebf59146cc00c8044acbd863725f887353a10df825fc8ae21", + "sha256:b00c1de48212e4cc9603895652c5c410df699856a2853135b3967591e4beebc2", + "sha256:b1282f8c00509d99fef04d8ba936b156d419be841854fe901d8ae224c59f0be5", + "sha256:b2051432115498d3562c084a49bba65d97cf251f5a331c64a12ee7e04dacc51b", + "sha256:ba59edeaa2fc6114428f1637ffff42da1e311e29382d81b339c1817d37ec93c6", + "sha256:c8716a48d94b06bb3b2524c2b77e055fb313aeb4ea620c8dd03a105574ba704f", + "sha256:cd5df75523866410809ca100dc9681e301e3c27567cf498077e8551b6d20e42f", + "sha256:e249096428b3ae81b08327a63a485ad0878de3fb939049038579ac0ef61e17e7" ], "index": "pypi", - "version": "==1.1.0" + "version": "==1.1.1" }, "mmh3": { "hashes": [ @@ -251,14 +286,59 @@ ], "version": "==4.5.2" }, + "mypy": { + "hashes": [ + "sha256:2afe51527b1f6cdc4a5f34fc90473109b22bf7f21086ba3e9451857cf11489e6", + "sha256:56a16df3e0abb145d8accd5dbb70eba6c4bd26e2f89042b491faa78c9635d1e2", + "sha256:5764f10d27b2e93c84f70af5778941b8f4aa1379b2430f85c827e0f5464e8714", + "sha256:5bbc86374f04a3aa817622f98e40375ccb28c4836f36b66706cf3c6ccce86eda", + 
"sha256:6a9343089f6377e71e20ca734cd8e7ac25d36478a9df580efabfe9059819bf82", + "sha256:6c9851bc4a23dc1d854d3f5dfd5f20a016f8da86bcdbb42687879bb5f86434b0", + "sha256:b8e85956af3fcf043d6f87c91cbe8705073fc67029ba6e22d3468bfee42c4823", + "sha256:b9a0af8fae490306bc112229000aa0c2ccc837b49d29a5c42e088c132a2334dd", + "sha256:bbf643528e2a55df2c1587008d6e3bda5c0445f1240dfa85129af22ae16d7a9a", + "sha256:c46ab3438bd21511db0f2c612d89d8344154c0c9494afc7fbc932de514cf8d15", + "sha256:f7a83d6bd805855ef83ec605eb01ab4fa42bcef254b13631e451cbb44914a9b0" + ], + "index": "pypi", + "version": "==0.701" + }, + "mypy-extensions": { + "hashes": [ + "sha256:37e0e956f41369209a3d5f34580150bcacfabaa57b33a15c0b25f4b5725e0812", + "sha256:b16cabe759f55e3409a7d231ebd2841378fb0c27a5d1994719e340e4f429ac3e" + ], + "index": "pypi", + "version": "==0.4.1" + }, "mysqlclient": { "hashes": [ - "sha256:041c79d474cd0b4980f1175f1ff24d2796d6e1f1e400583b40d21ed0d5a4f279", - "sha256:b95edaa41d6cc47deecabcdcbb5ab437ad9ae6d8955f5cf10d1847b37e66ef5e", - "sha256:cd07e321f1f692ecd67e8291ffbacd61c7b42a6cedc84d40971fbddbbce9b79e" + "sha256:6883a4dd98903bad375c859ead1a480e1245ea3a8d9b038ea2c091c1865ba673", + "sha256:a62220410e26ce2d2ff94dd0138c3ecfb91db634464a9afb4c8e6b50f0a67e00", + "sha256:e1b9f3a8928ddb4985ca3e3c9f2aa81b19e831bbf6fabf5681ee356738dbbbb2" ], "index": "pypi", - "version": "==1.4.2" + "version": "==1.4.1" + }, + "pycountry": { + "hashes": [ + "sha256:104a8ca94c700898c42a0172da2eab5a5675c49637b729a11db9e1dac2d983cd", + "sha256:8ec4020b2b15cd410893d573820d42ee12fe50365332e58c0975c953b60a16de" + ], + "version": "==18.12.8" + }, + "pyjwt": { + "hashes": [ + "sha256:5c6eca3c2940464d106b99ba83b00c6add741c9becaec087fb7ccdefea71350e", + "sha256:8d59a976fb773f3e6a39c85636357c4f0e242707394cadadd9814f5cbaa20e96" + ], + "version": "==1.7.1" + }, + "pyrsistent": { + "hashes": [ + "sha256:3ca82748918eb65e2d89f222b702277099aca77e34843c5eb9d52451173970e2" + ], + "version": "==0.14.11" }, "python-dateutil": { "hashes": [ @@ 
-270,11 +350,24 @@ }, "pytz": { "hashes": [ - "sha256:32b0891edff07e28efe91284ed9c31e123d84bea3fd98e1f72be2508f43ef8d9", - "sha256:d5f05e487007e29e03409f9398d074e158d920d36eb82eaf66fb1136b0c5374c" + "sha256:303879e36b721603cc54604edcac9d20401bdbe31e1e4fdee5b9f98d5d31dfda", + "sha256:d747dd3d23d77ef44c6a3526e274af6efeb0a6f1afd5a69ba4d5be4098c8e141" ], "index": "pypi", - "version": "==2018.9" + "version": "==2019.1" + }, + "redis": { + "hashes": [ + "sha256:8a1900a9f2a0a44ecf6e8b5eb3e967a9909dfed219ad66df094f27f7d6f330fb", + "sha256:a22ca993cea2962dbb588f9f30d0015ac4afcc45bee27d3978c0dbe9e97c6c0f" + ], + "version": "==2.10.6" + }, + "redis-py-cluster": { + "hashes": [ + "sha256:7db54b1de60bd34da3806676b112f07fc9afae556d8260ac02c3335d574ee42c" + ], + "version": "==1.3.6" }, "s3transfer": { "hashes": [ @@ -292,10 +385,27 @@ }, "sqlalchemy": { "hashes": [ - "sha256:52a42dbf02d0562d6e90e7af59f177f1cc027e72833cc29c3a821eefa009c71d" + "sha256:91c54ca8345008fceaec987e10924bf07dcab36c442925357e5a467b36a38319" ], "index": "pypi", - "version": "==1.2.17" + "version": "==1.3.3" + }, + "typed-ast": { + "hashes": [ + "sha256:04894d268ba6eab7e093d43107869ad49e7b5ef40d1a94243ea49b352061b200", + "sha256:16616ece19daddc586e499a3d2f560302c11f122b9c692bc216e821ae32aa0d0", + "sha256:2af80a373af123d0b9f44941a46df67ef0ff7a60f95872412a145f4500a7fc99", + "sha256:2ea99c029ebd4b5a308d915cc7fb95b8e1201d60b065450d5d26deb65d3f2bc1", + "sha256:56b6978798502ef66625a2e0f80cf923da64e328da8bbe16c1ff928c70c873de", + "sha256:644ee788222d81555af543b70a1098f2025db38eaa99226f3a75a6854924d4db", + "sha256:64cf762049fc4775efe6b27161467e76d0ba145862802a65eefc8879086fc6f8", + "sha256:68c362848d9fb71d3c3e5f43c09974a0ae319144634e7a47db62f0f2a54a7fa7", + "sha256:6c1f3c6f6635e611d58e467bf4371883568f0de9ccc4606f17048142dec14a1f", + "sha256:b213d4a02eec4ddf622f4d2fbc539f062af3788d1f332f028a2e19c42da53f15", + "sha256:c9d414512eaa417aadae7758bc118868cd2396b0e6138c1dd4fda96679c079d3", + 
"sha256:fb96a6e2c11059ecf84e6741a319f93f683e440e341d4489c9b161eca251cf2a" + ], + "version": "==1.3.4" }, "typing-extensions": { "hashes": [ @@ -308,24 +418,45 @@ }, "urllib3": { "hashes": [ - "sha256:61bf29cada3fc2fbefad4fdf059ea4bd1b4a86d2b6d15e1c7c0b582b9752fe39", - "sha256:de9529817c93f27c8ccbfead6985011db27bd0ddfcdb2d86f3f663385c6a9c22" + "sha256:4c291ca23bbb55c76518905869ef34bdd5f0e46af7afe6861e8375643ffee1a0", + "sha256:9a247273df709c4fedb38c711e44292304f73f39ab01beda9f6b9fc375669ac3" ], "markers": "python_version >= '3.4'", - "version": "==1.24.1" + "version": "==1.24.2" }, "uwsgi": { "hashes": [ - "sha256:d2318235c74665a60021a4fc7770e9c2756f9fc07de7b8c22805efe85b5ab277" + "sha256:4972ac538800fb2d421027f49b4a1869b66048839507ccf0aa2fda792d99f583" + ], + "version": "==2.0.18" + }, + "validators": { + "hashes": [ + "sha256:df3dda070965519283bae72249a36927ee3ea9c206f9ee6f234a71cf19b36136" ], - "version": "==2.0.17.1" + "index": "pypi", + "version": "==0.12.5" + }, + "webencodings": { + "hashes": [ + "sha256:a0af1213f3c2226497a97e2b3aa01a7e4bee4f403f95be16fc9acd2947514a78", + "sha256:b36a1c245f2d304965eb4e0a82848379241dc04b865afcc4aab16748587e1923" + ], + "version": "==0.5.1" }, "werkzeug": { "hashes": [ - "sha256:c3fd7a7d41976d9f44db327260e263132466836cef6f91512889ed60ad26557c", - "sha256:d5da73735293558eb1651ee2fddc4d0dedcfa06538b8813a2e20011583c9e49b" + "sha256:0a73e8bb2ff2feecfc5d56e6f458f5b99290ef34f565ffb2665801ff7de6af7a", + "sha256:7fad9770a8778f9576693f0cc29c7dcc36964df916b83734f4431c0e612a7fbc" + ], + "version": "==0.15.2" + }, + "wtforms": { + "hashes": [ + "sha256:0cdbac3e7f6878086c334aa25dc5a33869a3954e9d1e015130d65a69309b3b61", + "sha256:e3ee092c827582c50877cdbd49e9ce6d2c5c1f6561f849b3b068c1b8029626f1" ], - "version": "==0.14.1" + "version": "==2.2.1" }, "yarl": { "hashes": [ @@ -345,6 +476,13 @@ } }, "develop": { + "alabaster": { + "hashes": [ + "sha256:446438bdcca0e05bd45ea2de1668c1d9b032e1a9154c2c259092d77031ddd359", + 
"sha256:a661d72d58e6ea8a57f7a86e37d86716863ee5e92788398526d58b26a4e4dc02" + ], + "version": "==0.7.12" + }, "ansi2html": { "hashes": [ "sha256:96ae85ae7b26b7da674d87de2870ba4d1964bca733ae4614587080b6358c3ba9" @@ -354,10 +492,10 @@ }, "astroid": { "hashes": [ - "sha256:35b032003d6a863f5dcd7ec11abd5cd5893428beaa31ab164982403bcb311f22", - "sha256:6a5d668d7dc69110de01cdf7aeec69a679ef486862a0850cc0fd5571505b6b7e" + "sha256:6560e1e1749f68c64a4b5dee4e091fce798d2f0d84ebe638cf0e0585a343acf4", + "sha256:b65db1bbaac9f9f4d190199bb8680af6f6f84fd3769a5ea883df8a91fe68b4c4" ], - "version": "==2.1.0" + "version": "==2.2.5" }, "atomicwrites": { "hashes": [ @@ -368,10 +506,17 @@ }, "attrs": { "hashes": [ - "sha256:10cbf6e27dbce8c30807caf056c8eb50917e0eaafe86347671b57254006c3e69", - "sha256:ca4be454458f9dec299268d472aaa5a11f67a4ff70093396e1ceae9c76cf4bbb" + "sha256:69c0dbf2ed392de1cb5ec704444b08a5ef81680a61cb899dc08127123af36a79", + "sha256:f0b870f674851ecbfbbbd364d6b5cbdff9dcedbc7f3f5e18a6891057f21fe399" ], - "version": "==18.2.0" + "version": "==19.1.0" + }, + "babel": { + "hashes": [ + "sha256:6778d85147d5d85345c14a26aada5e478ab04e39b078b0745ee6870c2b5cf669", + "sha256:8cba50f48c529ca3fa18cf81fa9403be176d374ac4d60738b839122dfaaa3d23" + ], + "version": "==2.6.0" }, "beautifulsoup4": { "hashes": [ @@ -384,10 +529,10 @@ }, "certifi": { "hashes": [ - "sha256:47f9c83ef4c0c621eaef743f133f09fa8a74a9b75f037e8624f83bd1b6626cb7", - "sha256:993f830721089fef441cdfeb4b2c8c9df86f0c63239f06bd025a76a7daddb033" + "sha256:59b7658e26ca9c7339e00f8f4636cdfe59d34fa37b9b04f6f9e9926b3cece1a5", + "sha256:b26104d6835d1f5e49452a26eb2ff87fe7090b89dfcaee5ea2212697e1e1d7ae" ], - "version": "==2018.11.29" + "version": "==2019.3.9" }, "chardet": { "hashes": [ @@ -405,48 +550,48 @@ }, "coverage": { "hashes": [ - "sha256:09e47c529ff77bf042ecfe858fb55c3e3eb97aac2c87f0349ab5a7efd6b3939f", - "sha256:0a1f9b0eb3aa15c990c328535655847b3420231af299386cfe5efc98f9c250fe", - 
"sha256:0cc941b37b8c2ececfed341444a456912e740ecf515d560de58b9a76562d966d", - "sha256:10e8af18d1315de936d67775d3a814cc81d0747a1a0312d84e27ae5610e313b0", - "sha256:1b4276550b86caa60606bd3572b52769860a81a70754a54acc8ba789ce74d607", - "sha256:1e8a2627c48266c7b813975335cfdea58c706fe36f607c97d9392e61502dc79d", - "sha256:2b224052bfd801beb7478b03e8a66f3f25ea56ea488922e98903914ac9ac930b", - "sha256:447c450a093766744ab53bf1e7063ec82866f27bcb4f4c907da25ad293bba7e3", - "sha256:46101fc20c6f6568561cdd15a54018bb42980954b79aa46da8ae6f008066a30e", - "sha256:4710dc676bb4b779c4361b54eb308bc84d64a2fa3d78e5f7228921eccce5d815", - "sha256:510986f9a280cd05189b42eee2b69fecdf5bf9651d4cd315ea21d24a964a3c36", - "sha256:5535dda5739257effef56e49a1c51c71f1d37a6e5607bb25a5eee507c59580d1", - "sha256:5a7524042014642b39b1fcae85fb37556c200e64ec90824ae9ecf7b667ccfc14", - "sha256:5f55028169ef85e1fa8e4b8b1b91c0b3b0fa3297c4fb22990d46ff01d22c2d6c", - "sha256:6694d5573e7790a0e8d3d177d7a416ca5f5c150742ee703f3c18df76260de794", - "sha256:6831e1ac20ac52634da606b658b0b2712d26984999c9d93f0c6e59fe62ca741b", - "sha256:77f0d9fa5e10d03aa4528436e33423bfa3718b86c646615f04616294c935f840", - "sha256:828ad813c7cdc2e71dcf141912c685bfe4b548c0e6d9540db6418b807c345ddd", - "sha256:85a06c61598b14b015d4df233d249cd5abfa61084ef5b9f64a48e997fd829a82", - "sha256:8cb4febad0f0b26c6f62e1628f2053954ad2c555d67660f28dfb1b0496711952", - "sha256:a5c58664b23b248b16b96253880b2868fb34358911400a7ba39d7f6399935389", - "sha256:aaa0f296e503cda4bc07566f592cd7a28779d433f3a23c48082af425d6d5a78f", - "sha256:ab235d9fe64833f12d1334d29b558aacedfbca2356dfb9691f2d0d38a8a7bfb4", - "sha256:b3b0c8f660fae65eac74fbf003f3103769b90012ae7a460863010539bb7a80da", - "sha256:bab8e6d510d2ea0f1d14f12642e3f35cefa47a9b2e4c7cea1852b52bc9c49647", - "sha256:c45297bbdbc8bb79b02cf41417d63352b70bcb76f1bbb1ee7d47b3e89e42f95d", - "sha256:d19bca47c8a01b92640c614a9147b081a1974f69168ecd494687c827109e8f42", - "sha256:d64b4340a0c488a9e79b66ec9f9d77d02b99b772c8b8afd46c1294c1d39ca478", 
- "sha256:da969da069a82bbb5300b59161d8d7c8d423bc4ccd3b410a9b4d8932aeefc14b", - "sha256:ed02c7539705696ecb7dc9d476d861f3904a8d2b7e894bd418994920935d36bb", - "sha256:ee5b8abc35b549012e03a7b1e86c09491457dba6c94112a2482b18589cc2bdb9" + "sha256:3684fabf6b87a369017756b551cef29e505cb155ddb892a7a29277b978da88b9", + "sha256:39e088da9b284f1bd17c750ac672103779f7954ce6125fd4382134ac8d152d74", + "sha256:3c205bc11cc4fcc57b761c2da73b9b72a59f8d5ca89979afb0c1c6f9e53c7390", + "sha256:465ce53a8c0f3a7950dfb836438442f833cf6663d407f37d8c52fe7b6e56d7e8", + "sha256:48020e343fc40f72a442c8a1334284620f81295256a6b6ca6d8aa1350c763bbe", + "sha256:5296fc86ab612ec12394565c500b412a43b328b3907c0d14358950d06fd83baf", + "sha256:5f61bed2f7d9b6a9ab935150a6b23d7f84b8055524e7be7715b6513f3328138e", + "sha256:68a43a9f9f83693ce0414d17e019daee7ab3f7113a70c79a3dd4c2f704e4d741", + "sha256:6b8033d47fe22506856fe450470ccb1d8ba1ffb8463494a15cfc96392a288c09", + "sha256:7ad7536066b28863e5835e8cfeaa794b7fe352d99a8cded9f43d1161be8e9fbd", + "sha256:7bacb89ccf4bedb30b277e96e4cc68cd1369ca6841bde7b005191b54d3dd1034", + "sha256:839dc7c36501254e14331bcb98b27002aa415e4af7ea039d9009409b9d2d5420", + "sha256:8f9a95b66969cdea53ec992ecea5406c5bd99c9221f539bca1e8406b200ae98c", + "sha256:932c03d2d565f75961ba1d3cec41ddde00e162c5b46d03f7423edcb807734eab", + "sha256:988529edadc49039d205e0aa6ce049c5ccda4acb2d6c3c5c550c17e8c02c05ba", + "sha256:998d7e73548fe395eeb294495a04d38942edb66d1fa61eb70418871bc621227e", + "sha256:9de60893fb447d1e797f6bf08fdf0dbcda0c1e34c1b06c92bd3a363c0ea8c609", + "sha256:9e80d45d0c7fcee54e22771db7f1b0b126fb4a6c0a2e5afa72f66827207ff2f2", + "sha256:a545a3dfe5082dc8e8c3eb7f8a2cf4f2870902ff1860bd99b6198cfd1f9d1f49", + "sha256:a5d8f29e5ec661143621a8f4de51adfb300d7a476224156a39a392254f70687b", + "sha256:aca06bfba4759bbdb09bf52ebb15ae20268ee1f6747417837926fae990ebc41d", + "sha256:bb23b7a6fd666e551a3094ab896a57809e010059540ad20acbeec03a154224ce", + 
"sha256:bfd1d0ae7e292105f29d7deaa9d8f2916ed8553ab9d5f39ec65bcf5deadff3f9", + "sha256:c62ca0a38958f541a73cf86acdab020c2091631c137bd359c4f5bddde7b75fd4", + "sha256:c709d8bda72cf4cd348ccec2a4881f2c5848fd72903c185f363d361b2737f773", + "sha256:c968a6aa7e0b56ecbd28531ddf439c2ec103610d3e2bf3b75b813304f8cb7723", + "sha256:df785d8cb80539d0b55fd47183264b7002077859028dfe3070cf6359bf8b2d9c", + "sha256:f406628ca51e0ae90ae76ea8398677a921b36f0bd71aab2099dfed08abd0322f", + "sha256:f46087bbd95ebae244a0eda01a618aff11ec7a069b15a3ef8f6b520db523dcf1", + "sha256:f8019c5279eb32360ca03e9fac40a12667715546eed5c5eb59eb381f2f501260", + "sha256:fc5f4d209733750afd2714e9109816a29500718b32dd9a5db01c0cb3a019b96a" ], "index": "pypi", - "version": "==4.5.2" + "version": "==4.5.3" }, "coveralls": { "hashes": [ - "sha256:ab638e88d38916a6cedbf80a9cd8992d5fa55c77ab755e262e00b36792b7cd6d", - "sha256:b2388747e2529fa4c669fb1e3e2756e4e07b6ee56c7d9fce05f35ccccc913aa0" + "sha256:baa26648430d5c2225ab12d7e2067f75597a4b967034bba7e3d5ab7501d207a1", + "sha256:ff9b7823b15070f26f654837bb02a201d006baaf2083e0514ffd3b34a3ffed81" ], "index": "pypi", - "version": "==1.5.1" + "version": "==1.7.0" }, "docopt": { "hashes": [ @@ -454,13 +599,21 @@ ], "version": "==0.6.2" }, + "docutils": { + "hashes": [ + "sha256:02aec4bd92ab067f6ff27a38a38a41173bf01bed8f89157768c1573f53e474a6", + "sha256:51e64ef2ebfb29cae1faa133b3710143496eca21c530f3f71424d77687764274", + "sha256:7a4bd47eaf6596e1295ecb11361139febe29b084a87bf005bf899f9a42edc3c6" + ], + "version": "==0.14" + }, "flask": { "hashes": [ - "sha256:2ea22336f6d388b4b242bc3abf8a01244a8aa3e236e7407469ef78c16ba355dd", - "sha256:6c02dbaa5a9ef790d8219bdced392e2d549c10cd5a5ba4b6aa65126b2271af29" + "sha256:2271c0070dbcb5275fad4a82e29f23ab92682dc45f9dfbc22c02ba9b9322ce48", + "sha256:a080b744b7e345ccfcbc77954861cb05b3c63786e93f2b3875e0913d44b43f05" ], "index": "pypi", - "version": "==0.12.4" + "version": "==1.0.2" }, "flask-testing": { "hashes": [ @@ -471,12 +624,12 @@ }, "hypothesis": { 
"hashes": [ - "sha256:3fbe76ffce956ed02d57c48f0545e71150c30a35ec3138dac6e5c1b487775f00", - "sha256:54d63ceaa6fd7dca6797936811a08c5ef4f32840308f278b8011a0edc8078d5f", - "sha256:c5fb17cdf518aa90831644f3231a783ac2c1e220cf599098065de2676c3d33a6" + "sha256:4801b8175e2047abc8034fa1d4bce1a686225aefcfa3e0011d6c08f0324a807d", + "sha256:db7d64776d7c0fd04c874c5ab7d32d1b45c7d5d434fd130ce7746ec30c64bfc2", + "sha256:e86040269da16622fcc2771a211f93ce93e10fc8fb1f6f622dcd4507ae5670a4" ], "index": "pypi", - "version": "==4.5.4" + "version": "==4.17.1" }, "idna": { "hashes": [ @@ -485,6 +638,13 @@ ], "version": "==2.8" }, + "imagesize": { + "hashes": [ + "sha256:3f349de3eb99145973fefb7dbe38554414e5c30abd0c8e4b970a7c9d09f3a1d8", + "sha256:f3832918bc3c66617f92e35f5d70729187676313caa60c187eb0f28b8fe5e3b5" + ], + "version": "==1.1.0" + }, "inflect": { "hashes": [ "sha256:4ded1b2a6fcf0fc0397419c7727f131a93b67b80d899f2973be7758628e12b73", @@ -494,11 +654,10 @@ }, "isort": { "hashes": [ - "sha256:1153601da39a25b14ddc54955dbbacbb6b2d19135386699e2ad58517953b34af", - "sha256:b9c40e9750f3d77e6e4d441d8b0266cf555e7cdabdcff33c4fd06366ca761ef8", - "sha256:ec9ef8f4a9bc6f71eec99e1806bfa2de401650d996c59330782b89a5555c1497" + "sha256:01cb7e1ca5e6c5b3f235f0385057f70558b70d2f00320208825fa62887292f43", + "sha256:268067462aed7eb2a1e237fcb287852f22077de3fb07964e87e00f829eea2d1a" ], - "version": "==4.3.4" + "version": "==4.3.17" }, "itsdangerous": { "hashes": [ @@ -509,11 +668,11 @@ }, "jinja2": { "hashes": [ - "sha256:74c935a1b8bb9a3947c50a54766a969d4846290e1e788ea44c1392163723c3bd", - "sha256:f84be1bb0040caca4cea721fcbbbbd61f9be9464ca236387158b0feea01914a4" + "sha256:065c4f02ebe7f7cf559e49ee5a95fb800a9e4528727aec6f24402a5374c65013", + "sha256:14dd6caf1527abb21f08f86c784eac40853ba93edb79552aa1e4b8aef1b61c7b" ], "index": "pypi", - "version": "==2.10" + "version": "==2.10.1" }, "lazy-object-proxy": { "hashes": [ @@ -551,37 +710,37 @@ }, "markupsafe": { "hashes": [ - 
"sha256:048ef924c1623740e70204aa7143ec592504045ae4429b59c30054cb31e3c432", - "sha256:130f844e7f5bdd8e9f3f42e7102ef1d49b2e6fdf0d7526df3f87281a532d8c8b", - "sha256:19f637c2ac5ae9da8bfd98cef74d64b7e1bb8a63038a3505cd182c3fac5eb4d9", - "sha256:1b8a7a87ad1b92bd887568ce54b23565f3fd7018c4180136e1cf412b405a47af", - "sha256:1c25694ca680b6919de53a4bb3bdd0602beafc63ff001fea2f2fc16ec3a11834", - "sha256:1f19ef5d3908110e1e891deefb5586aae1b49a7440db952454b4e281b41620cd", - "sha256:1fa6058938190ebe8290e5cae6c351e14e7bb44505c4a7624555ce57fbbeba0d", - "sha256:31cbb1359e8c25f9f48e156e59e2eaad51cd5242c05ed18a8de6dbe85184e4b7", - "sha256:3e835d8841ae7863f64e40e19477f7eb398674da6a47f09871673742531e6f4b", - "sha256:4e97332c9ce444b0c2c38dd22ddc61c743eb208d916e4265a2a3b575bdccb1d3", - "sha256:525396ee324ee2da82919f2ee9c9e73b012f23e7640131dd1b53a90206a0f09c", - "sha256:52b07fbc32032c21ad4ab060fec137b76eb804c4b9a1c7c7dc562549306afad2", - "sha256:52ccb45e77a1085ec5461cde794e1aa037df79f473cbc69b974e73940655c8d7", - "sha256:5c3fbebd7de20ce93103cb3183b47671f2885307df4a17a0ad56a1dd51273d36", - "sha256:5e5851969aea17660e55f6a3be00037a25b96a9b44d2083651812c99d53b14d1", - "sha256:5edfa27b2d3eefa2210fb2f5d539fbed81722b49f083b2c6566455eb7422fd7e", - "sha256:7d263e5770efddf465a9e31b78362d84d015cc894ca2c131901a4445eaa61ee1", - "sha256:83381342bfc22b3c8c06f2dd93a505413888694302de25add756254beee8449c", - "sha256:857eebb2c1dc60e4219ec8e98dfa19553dae33608237e107db9c6078b1167856", - "sha256:98e439297f78fca3a6169fd330fbe88d78b3bb72f967ad9961bcac0d7fdd1550", - "sha256:bf54103892a83c64db58125b3f2a43df6d2cb2d28889f14c78519394feb41492", - "sha256:d9ac82be533394d341b41d78aca7ed0e0f4ba5a2231602e2f05aa87f25c51672", - "sha256:e982fe07ede9fada6ff6705af70514a52beb1b2c3d25d4e873e82114cf3c5401", - "sha256:edce2ea7f3dfc981c4ddc97add8a61381d9642dc3273737e756517cc03e84dd6", - "sha256:efdc45ef1afc238db84cb4963aa689c0408912a0239b0721cb172b4016eb31d6", - "sha256:f137c02498f8b935892d5c0172560d7ab54bc45039de8805075e19079c639a9c", 
- "sha256:f82e347a72f955b7017a39708a3667f106e6ad4d10b25f237396a7115d8ed5fd", - "sha256:fb7c206e01ad85ce57feeaaa0bf784b97fa3cad0d4a5737bc5295785f5c613a1" + "sha256:00bc623926325b26bb9605ae9eae8a215691f33cae5df11ca5424f06f2d1f473", + "sha256:09027a7803a62ca78792ad89403b1b7a73a01c8cb65909cd876f7fcebd79b161", + "sha256:09c4b7f37d6c648cb13f9230d847adf22f8171b1ccc4d5682398e77f40309235", + "sha256:1027c282dad077d0bae18be6794e6b6b8c91d58ed8a8d89a89d59693b9131db5", + "sha256:24982cc2533820871eba85ba648cd53d8623687ff11cbb805be4ff7b4c971aff", + "sha256:29872e92839765e546828bb7754a68c418d927cd064fd4708fab9fe9c8bb116b", + "sha256:43a55c2930bbc139570ac2452adf3d70cdbb3cfe5912c71cdce1c2c6bbd9c5d1", + "sha256:46c99d2de99945ec5cb54f23c8cd5689f6d7177305ebff350a58ce5f8de1669e", + "sha256:500d4957e52ddc3351cabf489e79c91c17f6e0899158447047588650b5e69183", + "sha256:535f6fc4d397c1563d08b88e485c3496cf5784e927af890fb3c3aac7f933ec66", + "sha256:62fe6c95e3ec8a7fad637b7f3d372c15ec1caa01ab47926cfdf7a75b40e0eac1", + "sha256:6dd73240d2af64df90aa7c4e7481e23825ea70af4b4922f8ede5b9e35f78a3b1", + "sha256:717ba8fe3ae9cc0006d7c451f0bb265ee07739daf76355d06366154ee68d221e", + "sha256:79855e1c5b8da654cf486b830bd42c06e8780cea587384cf6545b7d9ac013a0b", + "sha256:7c1699dfe0cf8ff607dbdcc1e9b9af1755371f92a68f706051cc8c37d447c905", + "sha256:88e5fcfb52ee7b911e8bb6d6aa2fd21fbecc674eadd44118a9cc3863f938e735", + "sha256:8defac2f2ccd6805ebf65f5eeb132adcf2ab57aa11fdf4c0dd5169a004710e7d", + "sha256:98c7086708b163d425c67c7a91bad6e466bb99d797aa64f965e9d25c12111a5e", + "sha256:9add70b36c5666a2ed02b43b335fe19002ee5235efd4b8a89bfcf9005bebac0d", + "sha256:9bf40443012702a1d2070043cb6291650a0841ece432556f784f004937f0f32c", + "sha256:ade5e387d2ad0d7ebf59146cc00c8044acbd863725f887353a10df825fc8ae21", + "sha256:b00c1de48212e4cc9603895652c5c410df699856a2853135b3967591e4beebc2", + "sha256:b1282f8c00509d99fef04d8ba936b156d419be841854fe901d8ae224c59f0be5", + 
"sha256:b2051432115498d3562c084a49bba65d97cf251f5a331c64a12ee7e04dacc51b", + "sha256:ba59edeaa2fc6114428f1637ffff42da1e311e29382d81b339c1817d37ec93c6", + "sha256:c8716a48d94b06bb3b2524c2b77e055fb313aeb4ea620c8dd03a105574ba704f", + "sha256:cd5df75523866410809ca100dc9681e301e3c27567cf498077e8551b6d20e42f", + "sha256:e249096428b3ae81b08327a63a485ad0878de3fb939049038579ac0ef61e17e7" ], "index": "pypi", - "version": "==1.1.0" + "version": "==1.1.1" }, "mccabe": { "hashes": [ @@ -592,48 +751,40 @@ }, "more-itertools": { "hashes": [ - "sha256:38a936c0a6d98a38bcc2d03fdaaedaba9f412879461dd2ceff8d37564d6522e4", - "sha256:c0a5785b1109a6bd7fac76d6837fd1feca158e54e521ccd2ae8bfe393cc9d4fc", - "sha256:fe7a7cae1ccb57d33952113ff4fa1bc5f879963600ed74918f1236e212ee50b9" - ], - "version": "==5.0.0" - }, - "mypy": { - "hashes": [ - "sha256:986a7f97808a865405c5fd98fae5ebfa963c31520a56c783df159e9a81e41b3e", - "sha256:cc5df73cc11d35655a8c364f45d07b13c8db82c000def4bd7721be13356533b4" + "sha256:2112d2ca570bb7c3e53ea1a35cd5df42bb0fd10c45f0fb97178679c3c03d64c7", + "sha256:c3e4748ba1aad8dba30a4886b0b1a2004f9a863837b8654e7059eebf727afa5a" ], - "index": "pypi", - "version": "==0.660" + "markers": "python_version > '2.7'", + "version": "==7.0.0" }, - "mypy-extensions": { + "nose2": { "hashes": [ - "sha256:37e0e956f41369209a3d5f34580150bcacfabaa57b33a15c0b25f4b5725e0812", - "sha256:b16cabe759f55e3409a7d231ebd2841378fb0c27a5d1994719e340e4f429ac3e" + "sha256:0ede156fd7974fa40893edeca0b709f402c0ccacd7b81b22e76f73c116d1b999", + "sha256:31d8beb00aed3ccc6efb1742bb90227d883e471715188249f594310676e0ef0e" ], "index": "pypi", - "version": "==0.4.1" + "version": "==0.9.1" }, - "nose2": { + "packaging": { "hashes": [ - "sha256:9052f2b46807b63d9bdf68e0768da1f8386368889b50043fd5d0889c470258f3" + "sha256:0c98a5d0be38ed775798ece1b9727178c4469d9c3b4ada66e8e6b7849f8732af", + "sha256:9e1cbf8c12b1f1ce0bb5344b8d7ecf66a6f8a6e91bcb0c84593ed6d3ab5c4ab3" ], - "index": "pypi", - "version": "==0.8.0" + "version": "==19.0" 
}, "pluggy": { "hashes": [ - "sha256:8ddc32f03971bfdf900a81961a48ccf2fb677cf7715108f85295c67405798616", - "sha256:980710797ff6a041e9a73a5787804f848996ecaa6f8a1b1e08224a5894f2074a" + "sha256:19ecf9ce9db2fce065a7a0586e07cfb4ac8614fe96edf628a264b1c70116cf8f", + "sha256:84d306a647cc805219916e62aab89caa97a33a1dd8c342e87a37f91073cd4746" ], - "version": "==0.8.1" + "version": "==0.9.0" }, "py": { "hashes": [ - "sha256:bf92637198836372b520efcba9e020c330123be8ce527e535d185ed4b6f45694", - "sha256:e76826342cefe3c3d5f7e8ee4316b80d1dd8a300781612ddbc765c17ba25a6c6" + "sha256:64f65755aee5b381cea27766a3a147c3f15b9b6b9ac88676de66ba2ae36793fa", + "sha256:dc639b046a6e2cff5bbe40194ad65936d6ba360b52b3c3fe1d08a82dd50b5e53" ], - "version": "==1.7.0" + "version": "==1.8.0" }, "pycodestyle": { "hashes": [ @@ -654,10 +805,17 @@ }, "pyflakes": { "hashes": [ - "sha256:5e8c00e30c464c99e0b501dc160b13a14af7f27d4dffb529c556e30a159e231d", - "sha256:f277f9ca3e55de669fba45b7393a1449009cff5a37d1af10ebb76c52765269cd" + "sha256:17dbeb2e3f4d772725c777fabc446d5634d1038f234e77343108ce445ea69ce0", + "sha256:d976835886f8c5b31d47970ed689944a0262b5f3afa00a5a7b4dc81e5449f8a2" ], - "version": "==2.1.0" + "version": "==2.1.1" + }, + "pygments": { + "hashes": [ + "sha256:5ffada19f6203563680669ee7f53b64dabbeb100eb51b61996085e99c03b284a", + "sha256:e8218dd399a61674745138520d0d4cf2621d7e032439341bc3f647bff125818d" + ], + "version": "==2.3.1" }, "pyhamcrest": { "hashes": [ @@ -669,27 +827,34 @@ }, "pylama": { "hashes": [ - "sha256:7e0327ee9b2a350ed73fe54c240894e534e2bccfb23a59ed5ce89f5a5689ee94", - "sha256:f81bf3bbd15db802b620903df491e5cd6469dcd542424ce6718425037dcc4d10" + "sha256:9bae53ef9c1a431371d6a8dca406816a60d547147b60a4934721898f553b7d8f", + "sha256:fd61c11872d6256b019ef1235be37b77c922ef37ac9797df6bd489996dddeb15" ], "index": "pypi", - "version": "==7.6.6" + "version": "==7.7.1" }, "pylint": { "hashes": [ - "sha256:689de29ae747642ab230c6d37be2b969bf75663176658851f456619aacf27492", - 
"sha256:771467c434d0d9f081741fec1d64dfb011ed26e65e12a28fe06ca2f61c4d556c" + "sha256:2bf4bd58d6d5d87174fbc9d1d134a9aeee852d4dc29cbd422a7015772770bc63", + "sha256:ee80c7af4f127b2a480d83010c9f0e97beb8eaa652b78c2837d3ed30b12e1182" ], "index": "pypi", - "version": "==2.2.2" + "version": "==2.3.0" + }, + "pyparsing": { + "hashes": [ + "sha256:1873c03321fc118f4e9746baf201ff990ceb915f433f23b395f5580d1840cb2a", + "sha256:9b6323ef4ab914af344ba97510e966d64ba91055d6b9afa6b30799340e89cc03" + ], + "version": "==2.4.0" }, "pytest": { "hashes": [ - "sha256:65aeaa77ae87c7fc95de56285282546cfa9c886dc8e5dc78313db1c25e21bc07", - "sha256:6ac6d467d9f053e95aaacd79f831dbecfe730f419c6c7022cb316b365cd9199d" + "sha256:3773f4c235918987d51daf1db66d51c99fac654c81d6f2f709a046ab446d5e5d", + "sha256:b7802283b70ca24d7119b32915efa7c409982f59913c1a6c0640aacf118b95f5" ], "index": "pypi", - "version": "==4.2.0" + "version": "==4.4.1" }, "pytest-easyread": { "hashes": [ @@ -713,6 +878,14 @@ ], "version": "==1.8.0" }, + "pytz": { + "hashes": [ + "sha256:303879e36b721603cc54604edcac9d20401bdbe31e1e4fdee5b9f98d5d31dfda", + "sha256:d747dd3d23d77ef44c6a3526e274af6efeb0a6f1afd5a69ba4d5be4098c8e141" + ], + "index": "pypi", + "version": "==2019.1" + }, "requests": { "hashes": [ "sha256:502a824f31acdacb3a35b6690b5fbf0bc41d63a24a45c4004352b0242707598e", @@ -736,10 +909,68 @@ }, "soupsieve": { "hashes": [ - "sha256:466910df7561796a60748826781ebe9a888f7a1668a636ae86783f44d10aae73", - "sha256:87db12ae79194f0ff9808d2b1641c4f031ae39ffa3cab6b907ea7c1e5e5ed445" + "sha256:6898e82ecb03772a0d82bd0d0a10c0d6dcc342f77e0701d0ec4a8271be465ece", + "sha256:b20eff5e564529711544066d7dc0f7661df41232ae263619dede5059799cdfca" + ], + "version": "==1.9.1" + }, + "sphinx": { + "hashes": [ + "sha256:423280646fb37944dd3c85c58fb92a20d745793a9f6c511f59da82fa97cd404b", + "sha256:de930f42600a4fef993587633984cc5027dedba2464bcf00ddace26b40f8d9ce" + ], + "index": "pypi", + "version": "==2.0.1" + }, + "sphinx-autodoc-typehints": { + "hashes": [ + 
"sha256:19fe0b426b7c008181f67f816060da7f046bd8a42723f67a685d26d875bcefd7", + "sha256:f9c06acfec80766fe8f542a6d6a042e751fcf6ce2e2711a7dc00d8b6daf8aa36" + ], + "index": "pypi", + "version": "==1.6.0" + }, + "sphinxcontrib-applehelp": { + "hashes": [ + "sha256:edaa0ab2b2bc74403149cb0209d6775c96de797dfd5b5e2a71981309efab3897", + "sha256:fb8dee85af95e5c30c91f10e7eb3c8967308518e0f7488a2828ef7bc191d0d5d" + ], + "version": "==1.0.1" + }, + "sphinxcontrib-devhelp": { + "hashes": [ + "sha256:6c64b077937330a9128a4da74586e8c2130262f014689b4b89e2d08ee7294a34", + "sha256:9512ecb00a2b0821a146736b39f7aeb90759834b07e81e8cc23a9c70bacb9981" + ], + "version": "==1.0.1" + }, + "sphinxcontrib-htmlhelp": { + "hashes": [ + "sha256:4670f99f8951bd78cd4ad2ab962f798f5618b17675c35c5ac3b2132a14ea8422", + "sha256:d4fd39a65a625c9df86d7fa8a2d9f3cd8299a3a4b15db63b50aac9e161d8eff7" + ], + "version": "==1.0.2" + }, + "sphinxcontrib-jsmath": { + "hashes": [ + "sha256:2ec2eaebfb78f3f2078e73666b1415417a116cc848b72e5172e596c871103178", + "sha256:a9925e4a4587247ed2191a22df5f6970656cb8ca2bd6284309578f2153e0c4b8" + ], + "version": "==1.0.1" + }, + "sphinxcontrib-qthelp": { + "hashes": [ + "sha256:513049b93031beb1f57d4daea74068a4feb77aa5630f856fcff2e50de14e9a20", + "sha256:79465ce11ae5694ff165becda529a600c754f4bc459778778c7017374d4d406f" + ], + "version": "==1.0.2" + }, + "sphinxcontrib-serializinghtml": { + "hashes": [ + "sha256:c0efb33f8052c04fd7a26c0a07f1678e8512e0faec19f4aa8f2473a8b81d5227", + "sha256:db6615af393650bf1151a6cd39120c29abaf93cc60db8c48eb2dddbfdc3a9768" ], - "version": "==1.7.3" + "version": "==1.1.3" }, "sqlacodegen": { "hashes": [ @@ -751,45 +982,35 @@ }, "sqlalchemy": { "hashes": [ - "sha256:52a42dbf02d0562d6e90e7af59f177f1cc027e72833cc29c3a821eefa009c71d" + "sha256:91c54ca8345008fceaec987e10924bf07dcab36c442925357e5a467b36a38319" ], "index": "pypi", - "version": "==1.2.17" + "version": "==1.3.3" }, "typed-ast": { "hashes": [ - 
"sha256:023625bfa9359e29bd6e24cac2a4503495b49761d48a5f1e38333fc4ac4d93fe", - "sha256:07591f7a5fdff50e2e566c4c1e9df545c75d21e27d98d18cb405727ed0ef329c", - "sha256:153e526b0f4ffbfada72d0bb5ffe8574ba02803d2f3a9c605c8cf99dfedd72a2", - "sha256:3ad2bdcd46a4a1518d7376e9f5016d17718a9ed3c6a3f09203d832f6c165de4a", - "sha256:3ea98c84df53ada97ee1c5159bb3bc784bd734231235a1ede14c8ae0775049f7", - "sha256:51a7141ccd076fa561af107cfb7a8b6d06a008d92451a1ac7e73149d18e9a827", - "sha256:52c93cd10e6c24e7ac97e8615da9f224fd75c61770515cb323316c30830ddb33", - "sha256:6344c84baeda3d7b33e157f0b292e4dd53d05ddb57a63f738178c01cac4635c9", - "sha256:64699ca1b3bd5070bdeb043e6d43bc1d0cebe08008548f4a6bee782b0ecce032", - "sha256:74903f2e56bbffe29282ef8a5487d207d10be0f8513b41aff787d954a4cf91c9", - "sha256:7891710dba83c29ee2bd51ecaa82f60f6bede40271af781110c08be134207bf2", - "sha256:91976c56224e26c256a0de0f76d2004ab885a29423737684b4f7ebdd2f46dde2", - "sha256:9bad678a576ecc71f25eba9f1e3fd8d01c28c12a2834850b458428b3e855f062", - "sha256:b4726339a4c180a8b6ad9d8b50d2b6dc247e1b79b38fe2290549c98e82e4fd15", - "sha256:ba36f6aa3f8933edf94ea35826daf92cbb3ec248b89eccdc053d4a815d285357", - "sha256:bbc96bde544fd19e9ef168e4dfa5c3dfe704bfa78128fa76f361d64d6b0f731a", - "sha256:c0c927f1e44469056f7f2dada266c79b577da378bbde3f6d2ada726d131e4824", - "sha256:c0f9a3708008aa59f560fa1bd22385e05b79b8e38e0721a15a8402b089243442", - "sha256:f0bf6f36ff9c5643004171f11d2fdc745aa3953c5aacf2536a0685db9ceb3fb1", - "sha256:f5be39a0146be663cbf210a4d95c3c58b2d7df7b043c9047c5448e358f0550a2", - "sha256:fcd198bf19d9213e5cbf2cde2b9ef20a9856e716f76f9476157f90ae6de06cc6" - ], - "markers": "python_version < '3.7' and implementation_name == 'cpython'", - "version": "==1.2.0" + "sha256:04894d268ba6eab7e093d43107869ad49e7b5ef40d1a94243ea49b352061b200", + "sha256:16616ece19daddc586e499a3d2f560302c11f122b9c692bc216e821ae32aa0d0", + "sha256:2af80a373af123d0b9f44941a46df67ef0ff7a60f95872412a145f4500a7fc99", + 
"sha256:2ea99c029ebd4b5a308d915cc7fb95b8e1201d60b065450d5d26deb65d3f2bc1", + "sha256:56b6978798502ef66625a2e0f80cf923da64e328da8bbe16c1ff928c70c873de", + "sha256:644ee788222d81555af543b70a1098f2025db38eaa99226f3a75a6854924d4db", + "sha256:64cf762049fc4775efe6b27161467e76d0ba145862802a65eefc8879086fc6f8", + "sha256:68c362848d9fb71d3c3e5f43c09974a0ae319144634e7a47db62f0f2a54a7fa7", + "sha256:6c1f3c6f6635e611d58e467bf4371883568f0de9ccc4606f17048142dec14a1f", + "sha256:b213d4a02eec4ddf622f4d2fbc539f062af3788d1f332f028a2e19c42da53f15", + "sha256:c9d414512eaa417aadae7758bc118868cd2396b0e6138c1dd4fda96679c079d3", + "sha256:fb96a6e2c11059ecf84e6741a319f93f683e440e341d4489c9b161eca251cf2a" + ], + "version": "==1.3.4" }, "urllib3": { "hashes": [ - "sha256:61bf29cada3fc2fbefad4fdf059ea4bd1b4a86d2b6d15e1c7c0b582b9752fe39", - "sha256:de9529817c93f27c8ccbfead6985011db27bd0ddfcdb2d86f3f663385c6a9c22" + "sha256:4c291ca23bbb55c76518905869ef34bdd5f0e46af7afe6861e8375643ffee1a0", + "sha256:9a247273df709c4fedb38c711e44292304f73f39ab01beda9f6b9fc375669ac3" ], "markers": "python_version >= '3.4'", - "version": "==1.24.1" + "version": "==1.24.2" }, "weighted-levenshtein": { "hashes": [ @@ -800,10 +1021,10 @@ }, "werkzeug": { "hashes": [ - "sha256:c3fd7a7d41976d9f44db327260e263132466836cef6f91512889ed60ad26557c", - "sha256:d5da73735293558eb1651ee2fddc4d0dedcfa06538b8813a2e20011583c9e49b" + "sha256:0a73e8bb2ff2feecfc5d56e6f458f5b99290ef34f565ffb2665801ff7de6af7a", + "sha256:7fad9770a8778f9576693f0cc29c7dcc36964df916b83734f4431c0e612a7fbc" ], - "version": "==0.14.1" + "version": "==0.15.2" }, "wrapt": { "hashes": [ diff --git a/browse/config.py b/browse/config.py index d133d1caf..01dbed793 100644 --- a/browse/config.py +++ b/browse/config.py @@ -1,12 +1,14 @@ -""" -Flask configuration. +"""Flask configuration. Docstrings are from the `Flask configuration documentation `_. 
""" import os +import warnings +import dateutil.parser +from datetime import datetime, timedelta -VERSION = '0.1.1' +VERSION = '0.2.1' """The application version """ ON = 'yes' @@ -204,46 +206,141 @@ # SQLAlchemy configuration # For mysql: 'mysql://user:pass@localhost/dbname' -SQLALCHEMY_DATABASE_URI = os.environ.get( - 'BROWSE_SQLALCHEMY_DATABASE_URI', 'sqlite:///../tests/data/browse.db') +SQLALCHEMY_DATABASE_URI = os.environ.get('BROWSE_SQLALCHEMY_DATABASE_URI', + os.environ.get('SQLALCHEMY_DATABASE_URI', + 'sqlite:///../tests/data/browse.db')) +"""SQLALCHEMY_DATABASE_URI is pulled from +BROWSE_SQLALCHEMY_DATABASE_URI. If it is not there the +SQLALCHEMY_DATABASE_URI is checked. If that is not set, the SQLITE +test DB is used. + +If neither of those is set and TESTING is the string 'yes', then a +SQLITE test DB is used. +""" + +if os.environ.get('FLASK_ENV', False) == 'production' \ + and 'sqlite' in SQLALCHEMY_DATABASE_URI: + warnings.warn("Using sqlite in BROWSE_SQLALCHEMY_DATABASE_URI in production environment") + SQLALCHEMY_TRACK_MODIFICATIONS = False SQLALCHEMY_ECHO = False SQLALCHEMY_RECORD_QUERIES = False -# SQLALCHEMY_POOL_SIZE and SQLALCHEMY_MAX_OVERFLOW are set without defaults -# because they will not work with sqlite -# SQLALCHEMY_POOL_SIZE = int(os.environ.get('BROWSE_SQLALCHEMY_POOL_SIZE')) -# SQLALCHEMY_MAX_OVERFLOW = int(os.environ.get('BROWSE_SQLALCHEMY_MAX_OVERFLOW')) - -# Disable DB queries even if other SQLAlchemy config are defined -# This, for example, could be used in conjunction with the `no-write` runlevel -# in the legacy infrastructure, which is a case where we know the DB is -# unavailable and thus intentionally bypass any DB access. 
-BROWSE_DISABLE_DATABASE = os.environ.get('BROWSE_DISABLE_DATABASE', False) -# Enable/disable Piwik (Matomo) web analytics -BROWSE_PIWIK_ENABLED = os.environ.get('BROWSE_PIWIK_ENABLED', False) +SQLALCHEMY_POOL_SIZE = int(os.environ.get('BROWSE_SQLALCHEMY_POOL_SIZE', '10')) +"""SQLALCHEMY_POOL_SIZE is set from BROWSE_SQLALCHEMY_POOL_SIZE. + +Ignored under sqlite.""" + +SQLALCHEMY_MAX_OVERFLOW = int(os.environ.get('BROWSE_SQLALCHEMY_MAX_OVERFLOW', '0')) +"""SQLALCHEMY_MAX_OVERFLOW is set from BROWSE_SQLALCHEMY_MAX_OVERFLOW. + +Ignored under sqlite.""" + +# SQLALCHEMY_POOL_SIZE and SQLALCHEMY_MAX_OVERFLOW will not work with sqlite +if 'sqlite' in SQLALCHEMY_DATABASE_URI: + SQLALCHEMY_POOL_SIZE = None + SQLALCHEMY_MAX_OVERFLOW = None + +BROWSE_DAILY_STATS_PATH = os.environ.get( + 'BROWSE_DAILY_STATS_PATH', 'tests/data/daily_stats') +"""The classic home page uses this file to get the total paper count +The file contains one line, with key "total_papers" and an integer, e.g. +total_papers 1456755.""" + +BROWSE_DISABLE_DATABASE = os.environ.get('BROWSE_DISABLE_DATABASE', False) +"""Disable DB queries even if other SQLAlchemy config are defined +This, for example, could be used in conjunction with the `no-write` runlevel +in the legacy infrastructure, which is a case where we know the DB is +unavailable and thus intentionally bypass any DB access.""" + +BROWSE_SITE_LABEL = os.environ.get('BROWSE_SITE_LABEL', 'arXiv.org') +BROWSE_SITE_HOST = os.environ.get('BROWSE_SITE_HOST', None) +"""This is similar to, but decoupled from SERVER_NAME.""" + +BROWSE_ANALYTICS_ENABLED = os.environ.get('BROWSE_ANALYTICS_ENABLED', False) +"""Enable/disable Matomo web analytics.""" +BROWSE_ANALYTICS_BASE_URL = os.environ.get('BROWSE_ANALYTICS_BASE_URL', + 'https://webstats.arxiv.org/') +"""Base URL for tracker. 
Should include trailing slash.""" +BROWSE_ANALYTICS_COOKIE_DOMAIN = \ + os.environ.get('BROWSE_ANALYTICS_COOKIE_DOMAIN', '*.arxiv.org') +"""Cookie domain for tracker.""" +BROWSE_ANALYTICS_SITE_ID = os.environ.get('BROWSE_ANALYTICS_SITE_ID', '1') +"""Tracker site ID.""" -# Enable/disable user banner BROWSE_USER_BANNER_ENABLED = os.environ.get( 'BROWSE_USER_BANNER_ENABLED', False) +"""Enable/disable user banner.""" +try: + BROWSE_USER_BANNER_START_DATE = dateutil.parser.parse( + os.environ.get('BROWSE_USER_BANNER_START_DATE') + ).replace(hour=0, minute=0, second=0) +except Exception: + warnings.warn("Bad value for BROWSE_USER_BANNER_START_DATE") + BROWSE_USER_BANNER_START_DATE = datetime.now() - timedelta(days=1) + +try: + BROWSE_USER_BANNER_END_DATE = dateutil.parser.parse( + os.environ.get('BROWSE_USER_BANNER_END_DATE') + ).replace(hour=23, minute=59, second=59) +except Exception: + warnings.warn("Bad value for BROWSE_USER_BANNER_END_DATE") + BROWSE_USER_BANNER_END_DATE = datetime.now() + timedelta(days=1) -# Paths to .abs and source files DOCUMENT_LATEST_VERSIONS_PATH = os.environ.get( 'DOCUMENT_LATEST_VERSIONS_PATH', 'tests/data/abs_files/ftp') +"""Paths to .abs and source files.""" DOCUMENT_ORIGNAL_VERSIONS_PATH = os.environ.get( 'DOCUMENT_ORIGNAL_VERSIONS_PATH', 'tests/data/abs_files/orig') +"""Paths to .abs and source files.""" -# Path to cache directory DOCUMENT_CACHE_PATH = os.environ.get( - 'DOCUMENT_CACHE_PATH', 'tests/data/cache' -) + 'DOCUMENT_CACHE_PATH', 'tests/data/cache') +"""Path to cache directory""" -# Used in linking to /show-email SHOW_EMAIL_SECRET = os.environ.get('SHOW_EMAIL_SECRET', 'foo') +"""Used in linking to /show-email.""" -# Used in linking to /ct CLICKTHROUGH_SECRET = os.environ.get('CLICKTHROUGH_SECRET', 'bar') +"""Used in linking to /ct.""" + +TRACKBACK_SECRET = os.environ.get('TRACKBACK_SECRET', 'baz') +"""Used in linking to trackbacks in /tb pages.""" -# arXiv Labs options LABS_BIBEXPLORER_ENABLED = 
os.environ.get('LABS_BIBEXPLORER_ENABLED', True) +"""arXiv Labs bibex enabled/disabled.""" + +# Auth settings +AUTH_SESSION_COOKIE_NAME = 'ARXIVNG_SESSION_ID' +AUTH_SESSION_COOKIE_DOMAIN = os.environ.get( + 'AUTH_SESSION_COOKIE_DOMAIN', '.arxiv.org') +AUTH_SESSION_COOKIE_SECURE = bool( + int(os.environ.get('AUTH_SESSION_COOKIE_SECURE', '1'))) +AUTH_UPDATED_SESSION_REF = True + +CLASSIC_COOKIE_NAME = os.environ.get('CLASSIC_COOKIE_NAME', 'tapir_session') +CLASSIC_PERMANENT_COOKIE_NAME = os.environ.get( + 'CLASSIC_PERMANENT_COOKIE_NAME', + 'tapir_permanent' +) +CLASSIC_TRACKING_COOKIE = os.environ.get('CLASSIC_TRACKING_COOKIE', 'browser') +CLASSIC_DATABASE_URI = os.environ.get('CLASSIC_DATABASE_URI', os.environ.get( + 'BROWSE_SQLALCHEMY_DATABASE_URI', default=None)) +"""If not set, legacy database integrations for auth will not be available.""" +if not CLASSIC_DATABASE_URI: + warnings.warn("No value set for CLASSIC_DATABASE_URI") +elif 'sqlite' in CLASSIC_DATABASE_URI: + warnings.warn("Using sqlite in CLASSIC_DATABASE_URI") + +CLASSIC_SESSION_HASH = os.environ.get('CLASSIC_SESSION_HASH', 'foosecret') +SESSION_DURATION = os.environ.get( + 'SESSION_DURATION', + '36000' +) + +URLS = [ + ('ui.login', '/login', os.environ.get('SERVER_NAME', 'arxiv.org')) + # This is a temporary workaround for ARXIVNG-2063 +] +"""External URLs.""" diff --git a/browse/controllers/__init__.py b/browse/controllers/__init__.py index c309f5128..e327b9c57 100644 --- a/browse/controllers/__init__.py +++ b/browse/controllers/__init__.py @@ -1,6 +1,37 @@ -""" -Houses controllers for browse. +"""Houses controllers for browse. -Each controller corresponds to a distinct browse feature with its own request -handling logic. +Each controller corresponds to a distinct browse feature with its own +request handling logic. 
""" + +from flask import url_for +from typing import Any, Dict, Optional, Tuple + +from arxiv import status +from browse.domain.identifier import Identifier + +Response = Tuple[Dict[str, Any], int, Dict[str, Any]] + + +def check_supplied_identifier(id: Identifier, route: str) -> Optional[Response]: + """Provide redirect URL if supplied ID does not match parsed ID. + + Parameters + ---------- + arxiv_identifier : :class:`Identifier` + route : str + The route to use in creating the redirect response with arxiv_id + + Returns + ------- + redirect_url: str + A redirect URL that uses a canonical arXiv identifier. + """ + if not id or id.ids == id.id or id.ids == id.idv: + return None + + arxiv_id = id.idv if id.has_version else id.id + redirect_url: str = url_for(route, arxiv_id=arxiv_id) + return {},\ + status.HTTP_301_MOVED_PERMANENTLY,\ + {'Location': redirect_url} diff --git a/browse/controllers/abs_page/__init__.py b/browse/controllers/abs_page/__init__.py index 836f09b53..d05607dcf 100644 --- a/browse/controllers/abs_page/__init__.py +++ b/browse/controllers/abs_page/__init__.py @@ -1,8 +1,7 @@ -""" -Handle requests to support the abs feature. +"""Handle requests to support the abs feature. -The primary entrypoint to this module is :func:`.get_abs_page`, which handles -GET requests to the abs endpoint. +The primary entrypoint to this module is :func:`.get_abs_page`, which +handles GET requests to the abs endpoint. """ import re @@ -18,6 +17,7 @@ from arxiv import status, taxonomy from arxiv.base import logging +from browse.controllers import check_supplied_identifier from browse.domain.metadata import DocMetadata from browse.domain.category import Category from browse.exceptions import AbsNotFound @@ -47,8 +47,7 @@ def get_abs_page(arxiv_id: str) -> Response: - """ - Get abs page data from the document metadata service. + """Get abs page data from the document metadata service. 
Parameters ---------- @@ -70,7 +69,6 @@ def get_abs_page(arxiv_id: str) -> Response: ------ :class:`.InternalServerError` Raised when there was an unexpected problem executing the query. - """ response_data: Dict[str, Any] = {} response_headers: Dict[str, Any] = {} @@ -78,7 +76,8 @@ def get_abs_page(arxiv_id: str) -> Response: arxiv_id = _check_legacy_id_params(arxiv_id) arxiv_identifier = Identifier(arxiv_id=arxiv_id) - redirect = _check_supplied_identifier(arxiv_identifier) + redirect = check_supplied_identifier(arxiv_identifier, + 'browse.abstract') if redirect: return redirect @@ -149,32 +148,6 @@ def get_abs_page(arxiv_id: str) -> Response: return response_data, response_status, response_headers -def _check_supplied_identifier(id: Identifier) -> Optional[Response]: - """ - Provide redirect URL if supplied ID does not match parsed ID. - - Parameters - ---------- - arxiv_identifier : :class:`Identifier` - - Returns - ------- - redirect_url: str - A `browse.abstract` redirect URL that uses the canonical - arXiv identifier. - - """ - if not id or id.ids == id.id or id.ids == id.idv: - return None - - arxiv_id = id.idv if id.has_version else id.id - redirect_url: str = url_for('browse.abstract', - arxiv_id=arxiv_id) - return {},\ - status.HTTP_301_MOVED_PERMANENTLY,\ - {'Location': redirect_url} - - def _non_critical_abs_data(abs_meta: DocMetadata, arxiv_identifier: Identifier, response_data: Dict)->None: @@ -249,7 +222,7 @@ def _time_header_parse(headers: Dict[str, Any], header: str) \ if (header in request.headers and request.headers[header] is not None): try: - dt = parser.parse(request.headers.get(header)) + dt = parser.parse(str(request.headers.get(header))) if not dt.tzinfo: dt = dt.replace(tzinfo=tzutc()) return dt @@ -261,8 +234,7 @@ def _time_header_parse(headers: Dict[str, Any], header: str) \ def _check_legacy_id_params(arxiv_id: str) -> str: - """ - Check for legacy request parameters related to old arXiv identifiers. 
+ """Check for legacy request parameters related to old arXiv identifiers. Parameters ---------- @@ -272,7 +244,6 @@ def _check_legacy_id_params(arxiv_id: str) -> str: ------- arxiv_id: str A possibly modified version of the input arxiv_id string. - """ if request.args and '/' not in arxiv_id: # To support old references to /abs/?papernum=\d{7} @@ -291,8 +262,7 @@ def _check_legacy_id_params(arxiv_id: str) -> str: def _check_context(arxiv_identifier: Identifier, primary_category: Optional[Category], response_data: Dict[str, Any]) -> None: - """ - Check context in request parameters and update response accordingly. + """Check context in request parameters and update response accordingly. Parameters ---------- @@ -302,7 +272,6 @@ def _check_context(arxiv_identifier: Identifier, Returns ------- Dict of values to add to response_data - """ # Set up the context context = None @@ -325,37 +294,30 @@ def _check_context(arxiv_identifier: Identifier, response_data['browse_context'] = context + next_url = None + prev_url = None if arxiv_identifier.is_old_id or context == 'arxiv': + # Revert to hybrid approach per ARXIVNG-2080 next_id = metadata.get_next_id(arxiv_identifier) - # TODO: might have to pass non-arxiv context to url_for becuase - # of examples like physics/9707012 if next_id: next_url = url_for('browse.abstract', arxiv_id=next_id.id, context='arxiv' if context == 'arxiv' else None) - else: - next_url = None - previous_id = metadata.get_previous_id(arxiv_identifier) if previous_id: prev_url = url_for('browse.abstract', arxiv_id=previous_id.id, context='arxiv' if context == 'arxiv' else None) - else: - prev_url = None - else: - # This is the case where the context is not in 'arxiv' or an archive, - # so just let the prevnext controller figure it out. 
- - # TODO do url_for() here - next_url = '/prevnext?site=arxiv.org&id=' + \ - arxiv_identifier.id + '&function=next' - prev_url = '/prevnext?site=arxiv.org&id=' + \ - arxiv_identifier.id + '&function=prev' - if context: - next_url = next_url + '&context=' + context - prev_url = prev_url + '&context=' + context + # Use prevnext controller to determine what the previous or next ID is. + next_url = url_for('browse.previous_next', + id=arxiv_identifier.id, + function='next', + context=context if context else None) + prev_url = url_for('browse.previous_next', + id=arxiv_identifier.id, + function='prev', + context=context if context else None) response_data['browse_context_previous_url'] = prev_url response_data['browse_context_next_url'] = next_url diff --git a/browse/controllers/archive_page/__init__.py b/browse/controllers/archive_page/__init__.py new file mode 100644 index 000000000..6f15b247b --- /dev/null +++ b/browse/controllers/archive_page/__init__.py @@ -0,0 +1,133 @@ +"""Archive landing page.""" + +import datetime +from typing import Dict, Any, Tuple, List, no_type_check + +from flask import Response, url_for + +from arxiv import status +from arxiv.taxonomy.definitions import ARCHIVES, CATEGORIES, ARCHIVES_SUBSUMED + +from browse.controllers.archive_page.by_month_form import ByMonthForm +from browse.controllers.years_operating import years_operating, stats_by_year +from browse.services.util.response_headers import abs_expires_header + + + +def get_archive(archive_id: str) -> Response: + """Gets archive page.""" + data: Dict[str, Any] = {} + response_headers: Dict[str, Any] = {} + + if archive_id == "list": + return archive_index(archive_id, status=status.HTTP_200_OK) + + archive = ARCHIVES.get(archive_id, None) + if not archive: + cat_id = CATEGORIES.get(archive_id, {}).get("in_archive", None) + archive = ARCHIVES.get(cat_id, None) + if not archive: + return archive_index(archive_id, + status=status.HTTP_404_NOT_FOUND) + else: + archive_id = cat_id + + 
_write_expires_header(response_headers) + + subsumed_by = ARCHIVES_SUBSUMED.get(archive_id, None) + if subsumed_by: + data["subsumed_id"] = archive_id + data["subsumed_category"] = CATEGORIES.get(archive_id, {}) + data["subsumed_by"] = subsumed_by + subsuming_category = CATEGORIES.get(subsumed_by, {}) + data["subsuming_category"] = subsuming_category + archive_id = subsuming_category.get("in_archive", None) + archive = ARCHIVES.get(archive_id, None) + + years = years_operating(archive) + + data["years"] = years + data["months"] = MONTHS + data["days"] = DAYS + + data["archive_id"] = archive_id + data["archive"] = archive + data["list_form"] = ByMonthForm(archive_id, archive, years) + data["stats_by_year"] = stats_by_year(archive_id, archive, years) + data["category_list"] = category_list(archive_id) + + data["catchup_to"] = datetime.date.today() - datetime.timedelta(days=7) + + data["template"] = "archive/single_archive.html" + return data, status.HTTP_200_OK, response_headers # type: ignore + + +def archive_index(archive_id: str, status: int) -> Response: + """Landing page for when there is no archive specified.""" + data: Dict[str, Any] = {} + data["bad_archive"] = archive_id + + archives = [ + (id, ARCHIVES[id]["name"]) + for id in ARCHIVES.keys() + if id not in ARCHIVES_SUBSUMED and not id.startswith("test") + ] + archives.sort(key=lambda tpl: tpl[0]) + data["archives"] = archives + + defunct = [ + (id, ARCHIVES[id]["name"], ARCHIVES_SUBSUMED.get(id, "")) + for id in ARCHIVES.keys() + if "end_date" in ARCHIVES[id] + ] + defunct.sort(key=lambda tpl: tpl[0]) + data["defunct"] = defunct + + data["template"] = "archive/archive_list_all.html" + return data, status, {} # type: ignore + + +def subsumed_msg(archive: Dict[str, str], subsumed_by: str) -> Dict[str, str]: + """Adds information about subsuming categories and archives.""" + sb = CATEGORIES.get(subsumed_by, {"name": "unknown category"}) + sa = ARCHIVES.get(sb.get("in_archive", None), {"name": "unknown 
archive"}) + + return {"subsumed_by_cat": sb, "subsumed_by_arch": sa} + + +def category_list(archive_id: str) -> List[Dict[str, str]]: + """Retunrs categories for archive.""" + cats = [] + for cat_id in CATEGORIES: + cat = CATEGORIES[cat_id] + if(cat.get("in_archive", "yuck") == archive_id + and cat.get("is_active", True)): + cats.append({"id": cat_id, + "name": cat.get("name", ""), + "description": cat.get("description", "")}) + + cats.sort(key=lambda x: x["name"]) + return cats + + +def _write_expires_header(response_headers: Dict[str, Any]) -> None: + """Writes an expires header for the response.""" + response_headers["Expires"] = abs_expires_header()[1] + + +DAYS = ["{:0>2d}".format(i) for i in range(1, 32)] + +MONTHS = [ + ("01", "01 (Jan)"), + ("02", "02 (Feb)"), + ("03", "03 (Mar)"), + ("04", "04 (Apr)"), + ("05", "05 (May)"), + ("06", "06 (Jun)"), + ("07", "07 (Jul)"), + ("08", "08 (Aug)"), + ("09", "09 (Sep)"), + ("10", "10 (Oct)"), + ("11", "11 (Nov)"), + ("12", "12 (Dec)"), +] diff --git a/browse/controllers/archive_page/by_month_form.py b/browse/controllers/archive_page/by_month_form.py new file mode 100644 index 000000000..ddad072a4 --- /dev/null +++ b/browse/controllers/archive_page/by_month_form.py @@ -0,0 +1,49 @@ +"""Form for month selection of list controller.""" +from typing import List, Any, Dict + +from flask_wtf import FlaskForm +from wtforms import SelectField, SubmitField, HiddenField +from wtforms.validators import DataRequired + +MONTHS = [ + ('all', 'all months'), + ('01', '01 (Jan)'), + ('02', '02 (Feb)'), + ('03', '03 (Mar)'), + ('04', '04 (Apr)'), + ('05', '05 (May)'), + ('06', '06 (Jun)'), + ('07', '07 (Jul)'), + ('08', '08 (Aug)'), + ('09', '09 (Sep)'), + ('10', '10 (Oct)'), + ('11', '11 (Nov)'), + ('12', '12 (Dec)'), +] + + +class ByMonthForm(FlaskForm): + """Form for browse by month input on archive pages. + + This doesn't try to account for the start date of the + archive, end date of the archive or dates in the future. 
+ It just accepts these, and expects the /list controller + to deal with dates for which there are no articles. + """ + + year = SelectField('year', + validators=[DataRequired()], + choices=[]) + month = SelectField('month', + validators=[DataRequired()], + choices=MONTHS) + archive = HiddenField('archive', validators=[DataRequired()]) + submit = SubmitField('Go') + + def __init__(self, + archive_id: str, + archive: Dict[str, Any], + years: List[int]): + super(ByMonthForm, self).__init__() + self.year.choices = [(str(ye)[-2:], str(ye)) for ye in years] + self.archive.data = archive_id diff --git a/browse/controllers/cookies.py b/browse/controllers/cookies.py new file mode 100644 index 000000000..5c0d39a2b --- /dev/null +++ b/browse/controllers/cookies.py @@ -0,0 +1,117 @@ +"""Handle requests to set cookies""" + +import re +from typing import Any, Dict, List, Optional, Tuple +import copy + +import flask +from flask import url_for, request, make_response +from werkzeug.exceptions import InternalServerError + +from arxiv import status + +# Taken from legacy /users/e-prints/httpd/bin/Databases/mirrors +mirrors = [ + 'de.arxiv.org', + 'es.arxiv.org', + 'in.arxiv.org', + 'cn.arxiv.org', + 'lanl.arxiv.org', + 'vanguard.math.ucdavis.edu:81', +] + +mirror_config = { + 'id': 'mirror', + 'name': 'xxx-mirror', + 'label': 'Select download site:', + 'options': [['default', 'always local site (default)', 1]] +} + +for mirror in mirrors: + mirror_config['options'].append([mirror, mirror, 0]) # type: ignore + +cookies_config = [ + {'id': 'ps', + 'name': 'xxx-ps-defaults', + 'label': 'Select preferred download format:', + 'options': [ + ['default', 'PostScript (600 dpi), PDF (default)', 1], + ['dpi=300%26font=bitmapped', 'PostScript (300 dpi)', 0], + ['fname=cm%26font=TypeI', 'PostScript (Type I cm)', 0], + ['pdf', 'PDF', 0], + ['dvi', 'DVI', 0], + ['src', 'Source', 0] + ], + }, + mirror_config, + {'id': 'mj', + 'name': 'arxiv_mathjax', + 'label': 'Select MathJax configuration: ', + 
'options': [['enabled', 'enabled', 1], + ['disabled', 'disabled', 0]] + } +] + + +# TODO implement debug parameter + +def get_cookies_page(is_debug: bool) -> Any: + """Render the cookies page. + + Parameters + ---------- + + Returns + ------- + dict + Search result response data. + int + HTTP status code. + dict + Headers to add to the response. + + Raises + ------ + :class:`.InternalServerError` + Raised when there was an unexpected problem executing the query. + """ + debug = {'debug': '1'} if is_debug else { + } # want to propogate debug to form URL + response_data = { + 'form_url': url_for('browse.cookies', set='set', **debug), + # Note deep copy + 'cookies_config': selected_options_from_request(copy.deepcopy(cookies_config)), + 'debug': is_debug, + 'controlled_cookies': [cc['name'] for cc in cookies_config], + } + response_headers = {'Expires': '0', + 'Pragma': 'no-cache'} + return response_data, status.HTTP_200_OK, response_headers + + +def selected_options_from_request(configs: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + """Sets the selected value on the options for the request cookies.""" + cookies = request.cookies + for cc in configs: + request_value = cookies.get(cc['name'], None) + matching_opt = next((opt for opt in cc['options'] + if opt[0] == request_value), None) + if(matching_opt is not None): + matching_opt[2] = 1 + return configs + + +def cookies_to_set(request: flask.Request) -> List[Dict[str, object]]: + """Get cookies from the form and return them as a list of tuples.""" + cts = [] + for (id, value) in request.form.items(): + matching_conf = next( + (conf for conf in cookies_config if conf['id'] == id), None) + if matching_conf is not None: + ctoset = {'key': matching_conf['name']} + cts.append(ctoset) + if value is None or value == '' or value == 'default': + ctoset['max_age'] = 0 + else: + ctoset['value'] = value + return cts diff --git a/browse/controllers/home_page/__init__.py b/browse/controllers/home_page/__init__.py new file mode 
# Matches a ``total_papers <number>`` line anywhere in the daily stats file.
# The named group is read back as ``group('count')`` by _get_document_count.
RE_TOTAL_PAPERS = re.compile(r'^total_papers\s+(?P<count>[0-9]+)',
                             re.MULTILINE)


def _get_document_count() -> Optional[int]:
    """Return the total document count, or ``None`` when it is unavailable.

    The database is asked first.  If that call raises, the error is logged
    and the count is read from the file configured under
    ``BROWSE_DAILY_STATS_PATH``, using the first line that matches
    :data:`RE_TOTAL_PAPERS`.

    Returns
    -------
    int or None
        The document count; ``None`` when the database call failed and the
        stats file is not configured, is missing, or has no matching line.
    """
    try:
        # check DB for document count first
        return get_document_count()  # type: ignore
    except Exception as ex:
        # Deliberate best effort: fall through to the static file.
        logger.warning(f'Error getting document count from DB: {ex}')

    try:
        # if DB is unavailable, fall back to legacy static file method
        daily_stats_path = current_app.config['BROWSE_DAILY_STATS_PATH']
        if daily_stats_path and os.path.isfile(daily_stats_path):
            with open(daily_stats_path, mode='r') as statsf:
                stats = statsf.read()
            # search(), not match(): with re.MULTILINE the ``^`` anchor is
            # meant to find the line wherever it sits in the file, whereas
            # match() would only ever test the very first line.
            stats_match = RE_TOTAL_PAPERS.search(stats)
            if stats_match:
                return int(stats_match.group('count'))
        else:
            raise FileNotFoundError
    except (KeyError, FileNotFoundError):
        logger.warning('Daily stats file not found')

    return None
logger = logging.getLogger(__name__)

show_values = [5, 10, 25, 50, 100, 250, 500, 1000, 2000]
"""Values of $show for more/fewer/all."""

max_show = show_values[-1]
"""Max value for show that controller respects."""

default_show = show_values[2]
"""Default value for show."""

Response = Tuple[Dict[str, Any], int, Dict[str, Any]]

type_to_template = {
    'new': 'list/new.html',
    'recent': 'list/recent.html',
    'current': 'list/month.html',
    'month': 'list/month.html',
    'year': 'list/year.html'
}


def get_listing(subject_or_category: str,
                time_period: str,
                skip: str = '',
                show: str = '') -> Response:
    """Handle requests to list articles.

    Query-string values take part as follows: ``skip`` and ``show`` are
    used when the corresponding arguments are empty; ``archive`` replaces
    ``subject_or_category``; ``year`` (followed by ``month`` unless that
    is missing or ``all``) replaces ``time_period``.

    Parameters
    ----------
    subject_or_category
        Subject or category to get listing for.
    time_period
        YY or YYMM or 'recent' or 'pastweek' or 'new' or 'current'.
        recent and pastweek mean the last 5 listings,
        new means the most recent listing,
        current means the listings for the current month.
    skip
        Number of articles to skip for this subject and time_period.
        Anything that is not a plain decimal number is treated as 0.
    show
        Number of articles to show.  Values above ``max_show`` are capped.
        Missing, non-numeric or zero values fall back to ``max_show`` for
        'new' listings and to ``default_show`` otherwise.

    Returns
    -------
    dict
        Data for the template named under its ``template`` key; empty for
        a not-modified response.
    int
        HTTP status code (200 or 304).
    dict
        Headers to add to the response.

    Raises
    ------
    BadRequest
        When the subject/category or the time period is missing or is not
        recognised.
    ServiceUnavailable
        When no listing service is available.
    """
    # TODO make sure to handle POST too
    args = request.args
    skip = skip or args.get('skip', None)
    show = show or args.get('show', None)
    if args.get('archive', None) is not None:
        subject_or_category = args.get('archive')  # type: ignore
    if args.get('year', None):
        time_period = args.get('year')  # type: ignore
        month = args.get('month', None)
        if month and month != 'all':
            time_period = time_period + month  # type: ignore

    # isdecimal() rather than isdigit(): isdigit() also accepts characters
    # such as superscript digits that int() cannot convert, which used to
    # surface as an unhandled ValueError instead of a 400.
    named_periods = ['new', 'current', 'pastweek', 'recent']
    if (not subject_or_category or
            not (time_period and
                 (time_period.isdecimal() or time_period in named_periods))):
        raise BadRequest

    if subject_or_category in taxonomy.CATEGORIES:
        list_type = 'category'
        list_ctx_name = taxonomy.CATEGORIES[subject_or_category]['name']
        list_ctx_id = subject_or_category
        list_ctx_in_archive = \
            taxonomy.CATEGORIES[subject_or_category]['in_archive']
    elif subject_or_category in taxonomy.ARCHIVES:
        list_type = 'archive'
        list_ctx_id = subject_or_category
        list_ctx_name = taxonomy.ARCHIVES[subject_or_category]['name']
        # NOTE(review): this stores the archive *name*, whereas the
        # category branch stores 'in_archive' -- confirm which the
        # templates expect.
        list_ctx_in_archive = list_ctx_name
    else:
        raise BadRequest

    listing_service = get_listing_service()
    if not listing_service:
        raise ServiceUnavailable

    skipn = int(skip) if skip and skip.isdecimal() else 0

    fallback_show = max_show if time_period == 'new' else default_show
    if show and show.isdecimal():
        shown = min(int(show), max_show)
        if shown < 1:
            # A page size of 0 would divide by zero in index_for_types and
            # make range() raise in paging; treat it like a missing value.
            shown = fallback_show
    else:
        shown = fallback_show

    if_mod_since = request.headers.get('If-Modified-Since', None)

    response_data: Dict[str, Any] = {}
    response_headers: Dict[str, Any] = {}

    if time_period == 'new':
        list_type = 'new'
        new_resp = listing_service.list_new_articles(
            subject_or_category, skipn, shown, if_mod_since)
        response_headers.update(_expires_headers(new_resp))
        if _not_modified(new_resp):
            return {}, status.HTTP_304_NOT_MODIFIED, response_headers
        listings = new_resp['listings']
        count = new_resp['new_count'] + \
            new_resp['rep_count'] + new_resp['cross_count']
        response_data['announced'] = new_resp['announced']
        response_data['submitted'] = new_resp['submitted']
        response_data.update(
            index_for_types(new_resp, subject_or_category,
                            time_period, skipn, shown))
        response_data.update(sub_sections_for_types(new_resp, skipn, shown))

    elif time_period in ['pastweek', 'recent']:
        list_type = 'recent'
        rec_resp = listing_service.list_pastweek_articles(
            subject_or_category, skipn, shown, if_mod_since)
        response_headers.update(_expires_headers(rec_resp))
        if _not_modified(rec_resp):
            return {}, status.HTTP_304_NOT_MODIFIED, response_headers
        listings = rec_resp['listings']
        count = rec_resp['count']
        response_data['pubdates'] = rec_resp['pubdates']

    elif time_period == 'current':
        list_type = 'current'
        # NOTE(review): year 1999 / month 12 are hard-coded here although
        # 'current' is documented as the current month -- looks like a
        # placeholder; confirm against the listing service before changing.
        cur_resp = listing_service.list_articles_by_month(
            subject_or_category, 1999, 12, skipn, shown, if_mod_since)
        response_headers.update(_expires_headers(cur_resp))
        if _not_modified(cur_resp):
            return {}, status.HTTP_304_NOT_MODIFIED, response_headers
        listings = cur_resp['listings']
        count = cur_resp['count']
        response_data['pubmonth'] = cur_resp['pubdates'][0][0]

    else:  # YYMM or YYYYMM?
        yandm = year_month(time_period)
        if yandm is None:
            raise BadRequest
        list_year, list_month = yandm
        response_data['list_time'] = time_period
        response_data['list_year'] = str(list_year)
        if list_month is not None:
            if list_month < 1 or list_month > 12:
                raise BadRequest
            list_type = 'month'
            response_data['list_month'] = str(list_month)
            response_data['list_month_name'] = calendar.month_abbr[list_month]
            month_reps = listing_service.list_articles_by_month(
                subject_or_category, list_year, list_month,
                skipn, shown, if_mod_since)
            response_headers.update(_expires_headers(month_reps))
            if _not_modified(month_reps):
                return {}, status.HTTP_304_NOT_MODIFIED, response_headers
            listings = month_reps['listings']
            count = month_reps['count']
            response_data['pubmonth'] = month_reps['pubdates'][0][0]
        else:
            list_type = 'year'
            year_resp = listing_service.list_articles_by_year(
                subject_or_category, list_year, skipn, shown, if_mod_since)
            response_headers.update(_expires_headers(year_resp))
            if _not_modified(year_resp):
                return {}, status.HTTP_304_NOT_MODIFIED, response_headers
            listings = year_resp['listings']
            count = year_resp['count']
            response_data['pubmonth'] = year_resp['pubdates'][0][0]

    # TODO if it is a HEAD, and nothing has changed, send not modified

    # list_index is the 1-based position of the item in the whole listing.
    for list_index, item in enumerate(listings, start=skipn + 1):
        item['article'] = metadata.get_abs(item['id'])  # type: ignore
        item['list_index'] = list_index  # type: ignore

    response_data['listings'] = listings
    response_data['author_links'] = authors_for_articles(listings)
    response_data['downloads'] = dl_for_articles(listings)

    viewing_all = shown >= count
    response_data.update({
        'context': subject_or_category,
        'count': count,
        'subcontext': time_period,
        'shown': shown,
        'skipn': skipn,
        'list_type': list_type,
        'list_ctx_name': list_ctx_name,
        'list_ctx_id': list_ctx_id,
        'list_ctx_in_archive': list_ctx_in_archive,
        'paging': paging(count, skipn, shown,
                         subject_or_category, time_period),
        'viewing_all': viewing_all,
        'template': type_to_template[list_type]
    })

    response_data.update(more_fewer(shown, count, viewing_all))

    def author_query(article: DocMetadata, query: str) -> str:
        """Build the author-search URL for *query* in the article's archive."""
        return str(url_for('search_archive',
                           searchtype='author',
                           archive=article.primary_archive.id,
                           query=query))
    response_data['url_for_author_search'] = author_query

    return response_data, status.HTTP_200_OK, response_headers


def year_month(tp: str) -> Optional[Tuple[int, Optional[int]]]:
    """Get the year and month from the time_period parameter.

    Parameters
    ----------
    tp
        ``YY``, ``YYMM`` or ``YYYYMM``, decimal digits only.

    Returns
    -------
    tuple or None
        ``(year, month)``.  For ``YY`` the year is returned as given (two
        digits) and the month is ``None``.  For ``YYMM`` years 91-99 map
        to 19YY and all others to 20YY.  The month is not range-checked
        here.  ``None`` is returned for empty input, for any other length,
        and for input containing anything but decimal digits.
    """
    # Reject signs, whitespace, underscores etc. that int() would accept
    # (or choke on) so callers get None rather than a ValueError.
    if not tp or not tp.isdecimal():
        return None

    # NOTE(review): the two-digit form is returned without a century while
    # the four-digit form is expanded -- confirm what the listing service
    # expects before normalising.
    if len(tp) == 2:  # 2 digit year
        return int(tp), None

    if len(tp) == 4:  # 2 digit year, 2 digit month
        yy_part = int(tp[:2])
        mm_part = int(tp[2:4])
        century = 1900 if 91 <= yy_part <= 99 else 2000
        return century + yy_part, mm_part

    if len(tp) == 6:  # 4 digit year, 2 digit month
        return int(tp[:4]), int(tp[4:])

    return None
def index_for_types(resp: 'NewResponse',
                    context: str, subcontext: str,
                    skipn: int, shown: int) -> Dict[str, Any]:
    """Create the index of section types for a listing of new papers.

    One entry is produced for each of new submissions, cross-lists and
    replacements whose count in *resp* is greater than zero, in that order.

    Parameters
    ----------
    resp
        Mapping providing ``new_count``, ``cross_count`` and ``rep_count``.
    context, subcontext
        Passed through to ``url_for('.list_articles', ...)``.
    skipn
        Number of items skipped on the page being rendered.
    shown
        Page size.  Values below 1 are treated as 1 when working out which
        page a section starts on.

    Returns
    -------
    dict
        ``{'index_for_types': [(label, url, index), ...]}``.  ``url`` is
        the empty string when the section's first item falls on the page
        being rendered, otherwise the URL of the page it starts on.
        ``index`` is 0 for new submissions and the 1-based position of the
        section's first item for the other two.
    """
    new_count = resp['new_count']
    cross_count = resp['cross_count']
    rep_count = resp['rep_count']

    # Guards the page arithmetic below against division by zero.
    page_size = max(shown, 1)

    def page_link(items_before: int) -> str:
        """Return '' if item number *items_before* is on this page, else a URL."""
        if skipn <= items_before < skipn + page_size:
            return ''
        return str(url_for('.list_articles',
                           context=context, subcontext=subcontext,
                           skip=(items_before // page_size) * page_size,
                           show=shown))

    ift = []
    if new_count > 0:
        ift.append(('New submissions', page_link(0), 0))

    if cross_count > 0:
        ift.append(('Cross-lists', page_link(new_count), new_count + 1))

    if rep_count > 0:
        before_reps = new_count + cross_count
        ift.append(('Replacements', page_link(before_reps), before_reps + 1))

    return {'index_for_types': ift}
def paging(count: int, skipn: int, shown: int, context: str, subcontext: str) \
        -> List[Dict[str, Union[str, int]]]:
    """Get paging links.

    Parameters
    ----------
    count
        Total number of items in the listing.
    skipn
        Number of items skipped before the current page.
    shown
        Page size.  Values below 1 are treated as 1.
    context, subcontext
        Passed through to ``url_for('.list_articles', ...)``.

    Returns
    -------
    list of dict
        One dict per slot.  Linked pages have ``skip``, ``txt`` (an item
        range such as ``26-50``) and ``url``.  The current page and the
        ``...`` gap markers have only ``nolink``.  The list is empty when
        *count* is not positive.
    """
    if count <= 0:
        return []

    bumper_pages = 3  # num of buffer pages on each side of current
    page_size = max(shown, 1)  # range() below rejects a zero step
    # Ceiling division.  The previous expression,
    # ``count-1 / (skipn+1)``, divided before subtracting and ignored the
    # page size, so it evaluated to roughly ``count``.
    total_pages = (count + page_size - 1) // page_size

    slots_in_paging = 2 * bumper_pages + 5
    # Maximum number of slots for elements in the pages sections:
    # 2*bumper_pages + start + end + 2*dots + current

    def page_dict(n: int, nolink: bool = False) -> Dict[str, Union[str, int]]:
        txt = f'{n + 1}-{min(count, n + page_size)}'
        if nolink:
            return {'nolink': txt}
        return {'skip': n,
                'txt': txt,
                'url': url_for('.list_articles',
                               context=context,
                               subcontext=subcontext,
                               skip=n,
                               show=page_size)}

    page_starts = range(0, count, page_size)  # Paper index of each page start

    if total_pages < slots_in_paging:
        # Few enough pages to show every one.  The current page gets the
        # same range label as in the long form below, not the raw skip.
        return [page_dict(n) for n in page_starts if n < skipn] + \
            [page_dict(skipn, True)] + \
            [page_dict(n) for n in page_starts if n > skipn]

    page_links: List[Dict[str, Any]] = []
    if skipn >= page_size:  # Not on first page?
        page_links = [page_dict(0)]

    prebumper = [n for n in page_starts
                 if skipn - page_size * bumper_pages <= n < skipn and n > 0]

    if prebumper:
        if prebumper[0] > page_size * bumper_pages:
            # Dots between the first page and the prebumper pages
            page_links.append({'nolink': '...'})
        page_links.extend(page_dict(n) for n in prebumper)

    page_links.append(page_dict(skipn, True))  # non-link for current page

    postbumper = [n for n in page_starts
                  if skipn < n <= skipn + page_size * bumper_pages]
    if postbumper:
        page_links.extend(page_dict(n) for n in postbumper)
        if postbumper[-1] < page_starts[-1]:
            # Dots between the postbumper pages and the last page
            page_links.append({'nolink': '...'})
            page_links.append(page_dict(page_starts[-1]))  # last

    return page_links
The + 'site' parameter from the classic prevnext is no longer supported. + + Parameters + ---------- + request_params : dict + + Returns + ------- + dict + Search result response data. + int + HTTP status code. + dict + Headers to add to the response. + + Raises + ------ + InternalServerError + Raised when there was an unexpected problem executing the query. + BadRequest + Raised when request parameters are missing, invalid, or when an ID + redirect cannot be returned even when the request parameters are valid. + + """ + if 'id' not in request_params: + raise BadRequest('Missing article identifier') + try: + arxiv_id = Identifier(request_params['id']) + except IdentifierException: + raise BadRequest(f"Invalid article identifier {request_params['id']}") + + if not ('function' in request_params + and request_params['function'] in ['prev', 'next']): + raise BadRequest('Missing or invalid function request') + + if 'context' not in request_params: + raise BadRequest('Missing context') + context = request_params['context'] + + if not (context in CATEGORIES_ACTIVE + or context in ARCHIVES or context == 'all'): + raise BadRequest('Invalid context') + + is_next = request_params['function'] == 'next' + try: + seq_id = get_sequential_id(paper_id=arxiv_id, + is_next=is_next, + context=context) + except Exception as ex: + logger.warning(f'Error getting sequential ID: {ex}') + raise InternalServerError from ex + + if not seq_id: + raise BadRequest( + f'No {"next" if is_next else "previous"} article found for ' + f'{arxiv_id.id} in {context}' + ) + + redirect_url = url_for('browse.abstract', + arxiv_id=seq_id, + context=context) + return {}, status.HTTP_301_MOVED_PERMANENTLY, {'Location': redirect_url} diff --git a/browse/controllers/prevnext/tests.py b/browse/controllers/prevnext/tests.py new file mode 100644 index 000000000..c79e37f0f --- /dev/null +++ b/browse/controllers/prevnext/tests.py @@ -0,0 +1,103 @@ +"""Tests for prevnext controller, 
:mod:`browse.controllers.prevnext`.""" + +from unittest import TestCase, mock +from werkzeug import MultiDict +from werkzeug.exceptions import BadRequest +from browse.controllers import prevnext + + +class TestPrevNextController(TestCase): + """Tests for :func:`.get_prevnext`.""" + + def test_missing_parameters(self) -> None: + """Test request with missing parameters.""" + request_data = MultiDict() + with self.assertRaises(BadRequest): + prevnext.get_prevnext(request_data) + + request_data = MultiDict({ + 'id': '1801.00001' + }) + with self.assertRaises(BadRequest): + prevnext.get_prevnext(request_data) + + request_data = MultiDict({ + 'id': '1801.00001', + 'function': 'next' + }) + with self.assertRaises(BadRequest): + prevnext.get_prevnext(request_data) + + request_data = MultiDict({ + 'id': '1801.00001', + 'context': 'cs' + }) + with self.assertRaises(BadRequest): + prevnext.get_prevnext(request_data) + + request_data = MultiDict({ + 'function': 'prev', + 'context': 'cs' + }) + with self.assertRaises(BadRequest): + prevnext.get_prevnext(request_data) + + def test_bad_parameters(self) -> None: + """Test parameters with bad values.""" + request_data = MultiDict({ + 'id': 'foo', # invalid + 'function': 'prev', # valid + 'context': 'cs.AI' # valid + }) + with self.assertRaises(BadRequest): + prevnext.get_prevnext(request_data) + + request_data = MultiDict({ + 'id': 'cs/0001001', # valid + 'function': 'bar', # invalid + 'context': 'cs' # valid + }) + with self.assertRaises(BadRequest): + prevnext.get_prevnext(request_data) + + request_data = MultiDict({ + 'id': 'cs/0001001', # valid + 'function': 'next', # valid + 'context': 'baz' # invalid + }) + with self.assertRaises(BadRequest): + prevnext.get_prevnext(request_data) + + @mock.patch('browse.controllers.prevnext.get_sequential_id') + @mock.patch('browse.controllers.prevnext.url_for') + def test_good_parameters(self, mock_url_for, mock_get_sequential_id) -> None: # type: ignore + """Test parameters with good 
def get_hourly_stats_page(requested_date_str: Optional[str] = None) -> Response:
    """Get data for the /stats/today page.

    Parameters
    ----------
    requested_date_str
        Date to report on, in any format ``dateutil.parser.parse`` accepts.
        When empty or ``None``, one hour before the current time is used.

    Returns
    -------
    dict
        ``current_dt``, ``requested_dt``, ``normal_count``, ``admin_count``
        and ``num_nodes``.
    int
        HTTP status code.
    dict
        Headers to add to the response (always empty).

    Raises
    ------
    BadRequest
        When ``requested_date_str`` cannot be parsed as a date.
    InternalServerError
        When the hourly counts cannot be retrieved.
    """
    current_dt = datetime.now()
    requested_dt = current_dt - timedelta(hours=1)

    if requested_date_str:
        try:
            requested_dt = dateutil.parser.parse(requested_date_str)
        except (TypeError, ValueError, OverflowError) as ex:
            # OverflowError: dateutil raises it for out-of-range numeric
            # input; that is a bad request, not a server fault.
            raise BadRequest from ex

    try:
        normal_count, admin_count, num_nodes = \
            get_hourly_stats_count(stats_date=requested_dt.date())
    except Exception as ex:
        logger.warning(f'Error getting hourly stats page data: {ex}')
        raise InternalServerError from ex

    response_data: Dict[str, Any] = {
        'current_dt': current_dt,
        'requested_dt': requested_dt,
        'normal_count': normal_count,
        'admin_count': admin_count,
        'num_nodes': num_nodes,
    }
    return response_data, status.HTTP_200_OK, {}
def get_monthly_submissions_page() -> Response:
    """Get the data from the monthly submissions page.

    Returns
    -------
    dict
        ``current_dt``, ``arxiv_start_dt`` (1 August 1991),
        ``arxiv_age_years`` (elapsed days divided by 365), ``num_deleted``
        (size of ``DELETED_PAPERS``), ``num_migrated`` (absolute value of
        the historical delta), ``num_submissions`` and
        ``num_submissions_adjusted`` (submissions minus deleted plus
        migrated).
    int
        HTTP status code.
    dict
        Headers to add to the response (always empty).

    Raises
    ------
    InternalServerError
        When the submission counts cannot be retrieved.
    """
    current_dt = datetime.now()
    arxiv_start_dt = datetime(year=1991, month=8, day=1)
    arxiv_age = current_dt - arxiv_start_dt
    num_deleted = len(DELETED_PAPERS)
    try:
        num_submissions, historical_delta = \
            get_monthly_submission_count()
    except Exception as ex:
        logger.warning(f'Error getting monthly submissions stats data: {ex}')
        # Chain the cause, as the other stats controllers do.
        raise InternalServerError from ex

    num_migrated = abs(historical_delta)
    response_data: Dict[str, Any] = {
        'current_dt': current_dt,
        'arxiv_age_years': arxiv_age.days / 365,
        'arxiv_start_dt': arxiv_start_dt,
        'num_migrated': num_migrated,
        'num_deleted': num_deleted,
        'num_submissions': num_submissions,
        'num_submissions_adjusted':
            num_submissions - num_deleted + num_migrated,
    }
    return response_data, status.HTTP_200_OK, {}
-> Response: + """Get submission stats in CSV format.""" + csv_head = "month,submissions,historical_delta\n" + try: + csv_data = "".join([ + f"{r.ym.strftime('%Y-%m')},{r.num_submissions},{r.historical_delta}\n" + for r in get_monthly_submission_stats() + ]) + return {'csv': csv_head + csv_data}, status.HTTP_200_OK, {'Content-Type': 'text/csv'} + except Exception as ex: + logger.warning(f'Error getting monthly submission stats csv: {ex}') + raise InternalServerError from ex diff --git a/browse/controllers/stats_page/tests.py b/browse/controllers/stats_page/tests.py new file mode 100644 index 000000000..8575e8120 --- /dev/null +++ b/browse/controllers/stats_page/tests.py @@ -0,0 +1,204 @@ +"""Tests for stats page controllers, :mod:`browse.controllers.stats_page`.""" + +from unittest import TestCase, mock +from datetime import date, datetime +from werkzeug.exceptions import BadRequest + +from arxiv import status +from browse.controllers import stats_page + + +class TestStatsPageControllers(TestCase): + """Tests for :mod:`browse.controllers.stats_page` controllers.""" + + @mock.patch('browse.controllers.stats_page.get_hourly_stats_count') + def test_get_hourly_stats_page(self, mock_get_hourly_stats_count) -> None: # type: ignore + """Tests for :func:`.get_hourly_stats_page`.""" + # test bad requested_date_str + with self.assertRaises(BadRequest): + stats_page.get_hourly_stats_page(requested_date_str='foo') + with self.assertRaises(BadRequest): + stats_page.get_hourly_stats_page(requested_date_str='201901') + + # test response for good or no date option + mock_get_hourly_stats_count.return_value = (0, 0, 0) + for date_str in ['2019', '2019-01-01', '20180202', None]: + response_data, code, _ = stats_page.get_hourly_stats_page( + requested_date_str=date_str) + mock_get_hourly_stats_count.assert_called_once() + mock_get_hourly_stats_count.reset_mock() + self.assertEqual(code, status.HTTP_200_OK, + 'Response should be OK.') + for key in ['current_dt', 'requested_dt', + 
'normal_count', 'admin_count']: + self.assertIn(key, response_data, f'{key} is in response_data') + + @mock.patch('browse.controllers.stats_page.get_hourly_stats') + def test_get_hourly_stats_csv(self, mock_get_hourly_stats) -> None: # type: ignore + """Tests for :func:`.get_hourly_stats_csv`.""" + # test bad requested_date_str + with self.assertRaises(BadRequest): + stats_page.get_hourly_stats_csv(requested_date_str='bar') + with self.assertRaises(BadRequest): + stats_page.get_hourly_stats_csv(requested_date_str='2017-021') + + # test basic response when no date option is provided + mock_get_hourly_stats.return_value = list() + response_data, code, headers = stats_page.get_hourly_stats_csv() + mock_get_hourly_stats.assert_called_once() + mock_get_hourly_stats.reset_mock() + self.assertEqual(code, status.HTTP_200_OK, 'Response should be OK.') + self.assertEqual(headers['Content-Type'], 'text/csv') + self.assertIn('csv', response_data, 'csv is in response data') + self.assertEqual(response_data['csv'], + "hour,node1\n") + + # test response with mock data, when no date option is provided + test_td = datetime(2019, 3, 19) + mock_get_hourly_stats.return_value = [ + mock.Mock(ymd=test_td, hour=0, node_num=4, + access_type='N', connections=4123), + mock.Mock(ymd=test_td, hour=0, node_num=3, + access_type='N', connections=3124), + mock.Mock(ymd=test_td, hour=0, node_num=2, + access_type='N', connections=2124), + mock.Mock(ymd=test_td, hour=0, node_num=1, + access_type='N', connections=1234), + ] + expected_response = "hour,node1,node2,node3,node4\n"\ + "2019-03-19T00:00:00Z,1234,2124,3124,4123\n" + + # test response with mock data, when date option is provided + response_data, code, headers = stats_page.get_hourly_stats_csv(requested_date_str='2019-03-19') + mock_get_hourly_stats.assert_called_once_with(stats_date=date(2019, 3, 19)) + self.assertEqual(code, status.HTTP_200_OK, 'Response should be OK.') + self.assertEqual(response_data['csv'], expected_response) + + 
mock_get_hourly_stats.return_value = [ + mock.Mock(ymd=test_td, hour=0, node_num=2, + access_type='N', connections=2120), + mock.Mock(ymd=test_td, hour=0, node_num=4, + access_type='N', connections=4120), + mock.Mock(ymd=test_td, hour=0, node_num=1, + access_type='N', connections=1230), + mock.Mock(ymd=test_td, hour=0, node_num=3, + access_type='N', connections=3120), + mock.Mock(ymd=test_td, hour=1, node_num=1, + access_type='N', connections=1241), + mock.Mock(ymd=test_td, hour=1, node_num=4, + access_type='N', connections=4121), + mock.Mock(ymd=test_td, hour=1, node_num=3, + access_type='N', connections=3231), + ] + expected_response = "hour,node1,node2,node3,node4\n"\ + "2019-03-19T00:00:00Z,1230,2120,3120,4120\n"\ + "2019-03-19T01:00:00Z,1241,0,3231,4121\n" + + response_data, code, headers = stats_page.get_hourly_stats_csv() + self.assertEqual(code, status.HTTP_200_OK, 'Response should be OK.') + self.assertEqual(response_data['csv'], expected_response) + + @mock.patch('browse.controllers.stats_page.get_max_download_stats_dt') + @mock.patch('browse.controllers.stats_page.get_monthly_download_count') + def test_get_monthly_downloads_page(self, # type: ignore + mock_get_monthly_download_count, + mock_get_max_download_stats_dt) -> None: + """Tests for :func:`.get_monthly_downloads_page`.""" + # test basic response + mock_get_monthly_download_count.return_value = 1 + mock_get_max_download_stats_dt.return_value = datetime(2019, 3, 1) + response_data, code, headers = stats_page.get_monthly_downloads_page() + + mock_get_monthly_download_count.assert_called_once() + mock_get_max_download_stats_dt.assert_called_once() + self.assertEqual(code, status.HTTP_200_OK, 'Response should be OK.') + self.assertIn('total_downloads', response_data) + self.assertIn('most_recent_dt', response_data) + + @mock.patch('browse.controllers.stats_page.get_monthly_download_stats') + def test_get_download_stats_csv(self, mock_get_monthly_download_stats) -> None: # type: ignore + """Tests for 
:func:`.get_monthly_download_stats_csv`.""" + # test basic response + mock_get_monthly_download_stats.return_value = list() + response_data, code, headers = stats_page.get_download_stats_csv() + mock_get_monthly_download_stats.assert_called_once() + mock_get_monthly_download_stats.reset_mock() + self.assertEqual(code, status.HTTP_200_OK, 'Response should be OK.') + self.assertEqual(headers['Content-Type'], 'text/csv') + self.assertIn('csv', response_data, 'csv is in response data') + self.assertEqual(response_data['csv'], + "month,downloads\n") + + # test response with mock data + mock_get_monthly_download_stats.return_value = [ + mock.Mock(ym=datetime(2017, 1, 1), downloads=1234567), + mock.Mock(ym=datetime(2017, 2, 1), downloads=2345678) + ] + expected_response = "month,downloads\n"\ + "2017-01,1234567\n"\ + "2017-02,2345678\n" + response_data, code, headers = stats_page.get_download_stats_csv() + self.assertEqual(code, status.HTTP_200_OK, 'Response should be OK.') + self.assertEqual(response_data['csv'], expected_response) + + @mock.patch('browse.controllers.stats_page.get_monthly_submission_count') + def test_get_monthly_submissions_page(self, mock_get_monthly_submission_count) -> None: # type: ignore + """Tests for :func:`.get_monthly_submissions_page`.""" + # test basic response + mock_get_monthly_submission_count.return_value = (0, 0) + response_data, code, headers = \ + stats_page.get_monthly_submissions_page() + mock_get_monthly_submission_count.assert_called_once() + mock_get_monthly_submission_count.reset_mock() + self.assertEqual(code, status.HTTP_200_OK, 'Response should be OK.') + + for key in ['num_migrated', 'num_deleted', 'num_submissions', + 'current_dt', 'arxiv_start_dt', 'arxiv_age_years', + 'num_submissions_adjusted']: + self.assertIn(key, response_data, f'{key} is in response_data') + self.assertIsNotNone(response_data[key], + f'response_data[{key}] is not None') + + # test response with mock data + 
mock_get_monthly_submission_count.return_value = (1123456, -501) + response_data, code, headers = \ + stats_page.get_monthly_submissions_page() + mock_get_monthly_submission_count.assert_called_once() + self.assertEqual(code, status.HTTP_200_OK, 'Response should be OK.') + self.assertEqual(response_data['num_migrated'], 501) + self.assertEqual(response_data['num_submissions'], 1123456) + expected_submissions_adjusted = \ + response_data['num_submissions'] + response_data['num_migrated'] - response_data['num_deleted'] + self.assertEqual(response_data['num_submissions_adjusted'], + expected_submissions_adjusted) + self.assertGreaterEqual(response_data['num_deleted'], 155) + self.assertGreater(response_data['arxiv_age_years'], 25, + 'arXiv may rent a car') + self.assertIsInstance(response_data['current_dt'], datetime) + self.assertIsInstance(response_data['arxiv_start_dt'], datetime) + + @mock.patch('browse.controllers.stats_page.get_monthly_submission_stats') + def test_get_submission_stats_csv(self, mock_get_monthly_submission_stats) -> None: # type: ignore + """Tests for :func:`.get_submission_stats_csv`.""" + # test basic response + mock_get_monthly_submission_stats.return_value = list() + response_data, code, headers = stats_page.get_submission_stats_csv() + self.assertEqual(code, status.HTTP_200_OK, 'Response should be OK.') + self.assertEqual(headers['Content-Type'], 'text/csv') + self.assertIn('csv', response_data, 'csv is in response data') + self.assertEqual(response_data['csv'], + "month,submissions,historical_delta\n") + + # test response with mock data + mock_get_monthly_submission_stats.return_value = [ + mock.Mock(ym=datetime(2019, 2, 1), + num_submissions=9999, historical_delta=-42), + mock.Mock(ym=datetime(2019, 3, 1), + num_submissions=10101, historical_delta=0) + ] + expected_response = "month,submissions,historical_delta\n"\ + "2019-02,9999,-42\n"\ + "2019-03,10101,0\n" + response_data, code, headers = stats_page.get_submission_stats_csv() + 
self.assertEqual(code, status.HTTP_200_OK, 'Response should be OK.') + self.assertEqual(response_data['csv'], expected_response) diff --git a/browse/controllers/tb_page/__init__.py b/browse/controllers/tb_page/__init__.py new file mode 100644 index 000000000..328909dc3 --- /dev/null +++ b/browse/controllers/tb_page/__init__.py @@ -0,0 +1,195 @@ +"""Handle requests to display the trackbacks for arXiv articles.""" + +import re +from typing import Any, Dict, List, Tuple +from werkzeug.exceptions import InternalServerError, BadRequest +from werkzeug.datastructures import MultiDict + +from arxiv import status +from arxiv.base import logging +from arxiv.base.globals import get_application_config +from browse.exceptions import TrackbackNotFound +from browse.services.database import get_paper_trackback_pings, \ + get_recent_trackback_pings, \ + get_trackback_ping +from browse.controllers import check_supplied_identifier +from browse.domain.identifier import Identifier, IdentifierException +from browse.services.document import metadata +from browse.services.document.metadata import AbsException, \ + AbsNotFoundException +from browse.services.search.search_authors import queries_for_authors, \ + split_long_author_list + +app_config = get_application_config() +logger = logging.getLogger(__name__) + +Response = Tuple[Dict[str, Any], int, Dict[str, Any]] +truncate_author_list_size = 10 +trackback_count_options = [25, 50, 100, 200] + + +def get_tb_page(arxiv_id: str) -> Response: + """Get the data needed to display the trackback page for an arXiv article. + + Parameters + ---------- + arxiv_id : str + + Returns + ------- + dict + Response data. + int + HTTP status code. + dict + Headers to add to the response. + + Raises + ------ + InternalServerError + Raised when there was an unexpected problem executing the query. + TrackbackNotFound + Raised when trackbacks for an article cannot be found, either because + the identifier is invalid or the article metadata is not available. 
+ + """ + response_data: Dict[str, Any] = {} + response_headers: Dict[str, Any] = {} + if not arxiv_id: + raise TrackbackNotFound(data={'missing_id': True}) + try: + arxiv_identifier = Identifier(arxiv_id=arxiv_id) + redirect = check_supplied_identifier(arxiv_identifier, + 'browse.tb') + if redirect: + return redirect + response_data['arxiv_identifier'] = arxiv_identifier + abs_meta = metadata.get_abs(arxiv_identifier.id) + if abs_meta: + response_data['abs_meta'] = abs_meta + trackback_pings = get_paper_trackback_pings(arxiv_identifier.id) + response_data['trackback_pings'] = trackback_pings + if len(trackback_pings) > 0: + response_data['author_links'] = \ + split_long_author_list(queries_for_authors( + abs_meta.authors.raw), truncate_author_list_size) + response_status = status.HTTP_200_OK + + except AbsNotFoundException: + raise TrackbackNotFound(data={'arxiv_id': arxiv_id, 'not_found': True}) + except (AbsException, IdentifierException): + raise TrackbackNotFound(data={'arxiv_id': arxiv_id}) + except Exception as ex: + logger.warning(f'Error getting trackbacks: {ex}') + raise InternalServerError from ex + + return response_data, response_status, response_headers + + +def get_recent_tb_page(request_params: MultiDict) -> Response: + """Get the data needed to display the recent trackbacks page. + + Parameters + ---------- + request_params : dict + + Returns + ------- + dict + Response data. + int + HTTP status code. + dict + Headers to add to the response. + + Raises + ------ + BadRequest + Raised when form option is invalid + InternalServerError + Raised when there was an unexpected problem executing the query. 
+ + """ + response_data: Dict[str, Any] = {} + response_headers: Dict[str, Any] = {} + max_trackbacks = trackback_count_options[0] + + views = '' + if request_params: + if 'views' in request_params: + views = request_params['views'] + else: + raise BadRequest + + try: + if views: + max_trackbacks = int(views) + recent_trackback_pings = get_recent_trackback_pings(max_trackbacks) + response_data['max_trackbacks'] = max_trackbacks + response_data['recent_trackback_pings'] = recent_trackback_pings + response_data['article_map'] = _get_article_map(recent_trackback_pings) + response_data['trackback_count_options'] = trackback_count_options + response_status = status.HTTP_200_OK + except ValueError: + raise BadRequest + except Exception as ex: + logger.warning(f'Error getting recent trackbacks: {ex}') + raise InternalServerError from ex + + return response_data, response_status, response_headers + + +def get_tb_redirect(trackback_id: str, hashed_document_id: str) -> Response: + """Get the redirect location for a trackback ID and hashed_document_id. + + Parameters + ---------- + trackback_id : str + trackback Identifier + hashed_document_id : str + MD5 hex digest of the document_id + trackback_id + secret + + Returns + ------- + dict + Response data. + int + HTTP status code. + dict + Headers to add to the response. + + Raises + ------ + TrackbackNotFound + Raised when trackback_id is not an integer, hashed_document_id is not a hex string, or the hash does not match the trackback. + InternalServerError + Raised when there was an unexpected problem executing the query. 
+ + """ + try: + tb_id = int(trackback_id) + if not re.match(r'^[\da-f]+$', hashed_document_id): + raise ValueError + trackback = get_trackback_ping(trackback_id=tb_id) + if trackback.hashed_document_id == hashed_document_id: + response_status = status.HTTP_301_MOVED_PERMANENTLY + return {}, response_status, {'Location': trackback.url} + except ValueError: + raise TrackbackNotFound() + except Exception as ex: + raise InternalServerError from ex + + raise TrackbackNotFound() + + +def _get_article_map(recent_trackbacks: List[Tuple]) -> Dict[str, List[tuple]]: + """Get a mapping of trackback URLs to articles to simplify display.""" + article_map: Dict[str, List[tuple]] = {} + for rtb in recent_trackbacks: + url = rtb[0].url + article = (rtb[1], rtb[2]) + if url not in article_map: + article_map[url] = [] + if article not in article_map[url]: + article_map[url].append(article) + return article_map diff --git a/browse/controllers/tb_page/tests.py b/browse/controllers/tb_page/tests.py new file mode 100644 index 000000000..1906d5282 --- /dev/null +++ b/browse/controllers/tb_page/tests.py @@ -0,0 +1,111 @@ +"""Tests for tb controllers, :mod:`browse.controllers.tb_page`.""" + +from unittest import TestCase, mock +from werkzeug import MultiDict +from werkzeug.exceptions import BadRequest +from arxiv import status +from browse.exceptions import TrackbackNotFound +from browse.controllers import tb_page + + +class TestTbPageController(TestCase): + """Tests for :func:`.get_tb_page`.""" + + @mock.patch('browse.controllers.tb_page.metadata') + @mock.patch('browse.controllers.tb_page.get_paper_trackback_pings') + # type: ignore + def test_good_id_with_trackbacks(self, mock_get_paper_trackback_pings, mock_metadata) -> None: + """Test requests with good arXiv identifiers known to the corpus.""" + mock_get_paper_trackback_pings.return_value = list() + mock_metadata.get_abs.return_value = {} + response_data, code, _ = tb_page.get_tb_page(arxiv_id='1901.99999') + self.assertEqual(code, 
status.HTTP_200_OK, 'Response should be OK.') + for key in ('arxiv_identifier', 'trackback_pings'): + self.assertIn(key, response_data, + f"Response data should include '{key}'") + for key in ('abs_meta', 'author_links'): + self.assertNotIn(key, response_data, + f"Response data should not include '{key}'") + + def test_bad_or_unknown_id(self) -> None: + """Test requests with bad arXiv identifiers.""" + with self.assertRaises(TrackbackNotFound): + for bad_or_unknown_id in ('foo', '1901.99999'): + tb_page.get_tb_page(arxiv_id=bad_or_unknown_id) + + +class TestRecentTbPageController(TestCase): + """Tests for :func:`.get_recent_tb_page`.""" + + @mock.patch('browse.controllers.tb_page.get_recent_trackback_pings') + def test_form_data(self, mock_get_recent_trackback_pings) -> None: # type: ignore + """Test /tb/recent form data.""" + mock_get_recent_trackback_pings.return_value = list() + + form_data = MultiDict({ + 'foo': 'bar' + }) + with self.assertRaises(BadRequest): + tb_page.get_recent_tb_page(form_data) + + form_data = MultiDict({ + 'views': 'baz' + }) + with self.assertRaises(BadRequest): + tb_page.get_recent_tb_page(form_data) + + form_data = MultiDict({ + 'views': '25' + }) + response_data, code, headers = tb_page.get_recent_tb_page(form_data) + self.assertEqual(code, status.HTTP_200_OK, 'Response should be OK.') + self.assertIn('max_trackbacks', response_data, + "Response data should include 'max_trackbacks'") + self.assertEqual(response_data['max_trackbacks'], 25, + "'max_trackbacks' should equal value from form") + self.assertIn('recent_trackback_pings', response_data, + "Response data should include 'recent_trackback_pings'") + self.assertIn('article_map', response_data, + "Response data should include 'article_map'") + + +class TestTbRedirect(TestCase): + """Tests for :func:`.get_tb_redirect`.""" + + @mock.patch('browse.controllers.tb_page.get_trackback_ping') + def test_arguments(self, mock_trackback_ping) -> None: # type: ignore + """Test /tb/redirect 
+ arguments.""" + with self.assertRaises(TrackbackNotFound): + # 'foo' is not an integer + tb_page.get_tb_redirect( + trackback_id='foo', hashed_document_id='feedface') + + with self.assertRaises(TrackbackNotFound): + # 'baz' is not a hex string + tb_page.get_tb_redirect( + trackback_id='1', hashed_document_id='baz') + + mtb = mock.Mock( + trackback_id=1, + hashed_document_id='feaedface', + url='https://example.org' + ) + mock_trackback_ping.return_value = mtb + with self.assertRaises(TrackbackNotFound): + # parameters are OK, but hashed_document_id does not match + response_data, code, headers = tb_page.get_tb_redirect( + trackback_id='1', hashed_document_id='feedface') + + mtb = mock.Mock( + trackback_id=2, + hashed_document_id='f005ba11', + url='https://example.com' + ) + mock_trackback_ping.return_value = mtb + + _, code, headers = tb_page.get_tb_redirect( + trackback_id='2', hashed_document_id='f005ba11') + self.assertEqual( + code, 301, 'Expect redirect for matching hashed_document_id') + self.assertEqual(headers['Location'], mtb.url, + 'Redirect location header matches trackback URL') diff --git a/browse/controllers/year.py b/browse/controllers/year.py new file mode 100644 index 000000000..5883905b7 --- /dev/null +++ b/browse/controllers/year.py @@ -0,0 +1,105 @@ +"""Handle requests for info about one year of archive activity.""" + +from datetime import date +from typing import Any, Dict, List, Optional, Tuple, Union + +from werkzeug.exceptions import BadRequest +from flask import current_app, url_for + +from arxiv import status, taxonomy +from browse.domain.listing import MonthCount, ListingCountResponse +from browse.controllers.list_page import get_listing_service +from browse.controllers.years_operating import years_operating, stats_by_year + + +def year_page(archive_id: str, year: int) -> Any: + """Get year page for archive. + + Parameters + ---------- + archive_id : str + Must be an arXiv archive identifier. + + year : int + Must be a two or four digit year. 
+ + Returns + ------- + dict + Search result response data. + int + HTTP status code. + dict + Headers to add to the response. + """ + thisYear = date.today().year + + if year is None: + year = thisYear + + if year > thisYear: + # 307 because year might be valid in the future + return {}, status.HTTP_307_TEMPORARY_REDIRECT, {'Location': '/'} + + if year < 100: + if year >= 91: + year = 1900 + year + else: + year = 2000 + year + + if archive_id not in taxonomy.ARCHIVES: + raise BadRequest("Unknown archive.") + else: + archive = taxonomy.ARCHIVES[archive_id] + + listing_service = get_listing_service() + month_listing = listing_service.monthly_counts(archive_id, year) + + for month in month_listing['month_counts']: + month['art'] = ascii_art_month(archive_id, month) # type: ignore + month['yymm'] =f"{month['year']}-{month['month']:02}" #type: ignore + month['url'] = url_for('browse.list_articles', #type: ignore + context=archive_id, + subcontext=f"{month['year']}{month['month']:02}") + + response_data: Dict[str, Any] = { + 'archive_id': archive_id, + 'archive': archive, + 'months': month_listing['month_counts'], + 'listing': month_listing, + 'year': str(year), + 'stats_by_year': stats_by_year(archive_id, archive, years_operating(archive), year) + } + response_headers: Dict[str, Any] = {} + + response_status = status.HTTP_200_OK + + return response_data, response_status, response_headers + + +ASCII_ART_STEP = 20 +ASCII_ART_CHR = '|' +ASCII_ART_URL_STEP = 100 + + +def ascii_art_month(archive_id: str, month: MonthCount) -> List[Tuple[str, Optional[str]]]: + """Make ascii art for a MonthCount.""" + tot = month['new'] + month['cross'] + yyyymm = f"{month['year']}{month['month']:02}" + + def _makestep(idx:int) -> Tuple[str, Optional[str]]: + if idx % ASCII_ART_URL_STEP == 0: + return (ASCII_ART_CHR, + url_for('browse.list_articles', + context=archive_id, + subcontext=yyyymm, + skip=idx)) + else: + return (ASCII_ART_CHR, None) + + art = [_makestep(idx) for idx in range(0, 
tot, ASCII_ART_STEP)] + + if tot % ASCII_ART_STEP >= ASCII_ART_STEP/2: + art.append(('!', None)) + + return art diff --git a/browse/controllers/years_operating.py b/browse/controllers/years_operating.py new file mode 100644 index 000000000..61c034385 --- /dev/null +++ b/browse/controllers/years_operating.py @@ -0,0 +1,42 @@ +"""Year link functions.""" + +from typing import Dict, Any, Tuple, List, no_type_check +from datetime import date + +from flask import url_for + + +def years_operating(archive: Dict[str, Any]) -> List[int]: + """Returns list of years operating in desc order. ex [1993,1992,1991].""" + if ( + not archive + or "start_date" not in archive + or not isinstance(archive["start_date"], date) + ): + return [] + start = archive["start_date"].year + end = archive.get("end_date", None) or date.today().year + return list(reversed(range(start, end + 1))) + + +def stats_by_year( + archive_id: str, + archive: Dict[str, Any], + years: List[int], + page_year: int=0) -> List[Tuple[str, str]]: + """Returns links to year pages.""" + if not archive or not archive_id or not years: + return [("bogusURL", "NODATA")] + else: + return [(_year_stats_link(archive_id, year, page_year), str(year)) + for year in years] + + +def _year_stats_link(archive_id: str, year: int, page_year: int = 0) -> str: + if year == page_year: + return '' + else: + return url_for( # type: ignore + "browse.year", + year=str(year)[-2:], # danger: 2 digit year, NG can accept 4 digit + archive=archive_id) diff --git a/browse/domain/__init__.py b/browse/domain/__init__.py index a17014496..4f2eb27bf 100644 --- a/browse/domain/__init__.py +++ b/browse/domain/__init__.py @@ -1,10 +1,10 @@ -""" -Domain classes for browse service. +"""Domain classes for browse service. -The domain provides a description of the main data objects used in module APIs. -Specifically, the :mod:`browse.controllers` and :mod:`browse.services` modules -should use the domain as their primary "language". 
This is intended to make -static checking easier and enhance overall intelligibility of the codebase. +The domain provides a description of the main data objects used in +module APIs. Specifically, the :mod:`browse.controllers` and +:mod:`browse.services` modules should use the domain as their primary +"language". This is intended to make static checking easier and enhance +overall intelligibility of the codebase. """ # pylint: disable=wildcard-import diff --git a/browse/domain/category.py b/browse/domain/category.py index 30de22ac4..71866c9a2 100644 --- a/browse/domain/category.py +++ b/browse/domain/category.py @@ -1,81 +1,7 @@ """Class that represents a single category.""" -from typing import Union, List -from dataclasses import dataclass, field - from arxiv import taxonomy -@dataclass(eq=True, order=True) -class Category: - """Represents an arXiv category. - - arXiv categories are arranged in a hierarchy where there are archives - (astro-ph, cs, math, etc.) that contain subject classes (astro-ph has - subject classes CO, GA, etc.). We now use the term category to refer - to any archive or archive.subject_class that one can submit to (so - hep-th and math.IT are both categories). No subject class can be in - more than one archive. However, our scientific advisors identify some - categories that should appear in more than one archive because they - bridge major subject areas. Examples include math.MP == math-ph and - stat.TH = math.ST. These are called category aliases and the idea is - that any article classified in one of the aliases categories also appears - in the other (canonical), but that most of the arXiv code for display, - search, etc. does not need to understand the break with hierarchy. - """ - - id: str = field(compare=True) - """The category identifier (e.g. cs.DL).""" - - name: str = field(init=False, compare=False) - """The name of the category (e.g. 
Digital Libraries).""" - - #TODO should probably be changed to get_canonical to avoid confusion - canonical: Union['Category', None] = field(init=False, compare=False) - - def __hash__(self)->int: - """Hash.""" - return id.__hash__() - - def __post_init__(self) -> None: - """Get the full category name.""" - if self.id in taxonomy.definitions.CATEGORIES: - self.name = taxonomy.definitions.CATEGORIES[self.id]['name'] - - if self.id in taxonomy.definitions.ARCHIVES_SUBSUMED: - self.canonical = Category(id=taxonomy.definitions.ARCHIVES_SUBSUMED[self.id]) - else: - self.canonical = None - - def unalias(self) -> 'Category': - """Follow any EQUIV or SUBSUMED to get the current category.""" - if self.id in taxonomy.definitions.CATEGORY_ALIASES: - return Category(taxonomy.definitions.CATEGORY_ALIASES[self.id]) - if self.id in taxonomy.definitions.ARCHIVES_SUBSUMED: - return Category(taxonomy.definitions.ARCHIVES_SUBSUMED[self.id]) - return self - - def display_str(self)->str: - """String to use in display of a category. 
- - Ex: - Earth and Planetary Astrophysics (astro-ph.EP) - """ - if self.id in taxonomy.definitions.CATEGORIES: - catname = taxonomy.definitions.CATEGORIES[self.id]['name'] - return f'{catname} ({self.id})' - sp = _split_cat_str(self.id) - hassub = len(sp) == 2 - if hassub: - (arc, _) = sp - if arc in taxonomy.definitions.ARCHIVES: - arcname = taxonomy.definitions.ARCHIVES[arc]['name'] - return f'{arcname} ({self.id})' - else: - return self.id - else: - return self.id - - -def _split_cat_str(cat: str)-> List[str]: - return cat.split('.', 2) +class Category(taxonomy.Category): + """Represents an arXiv category.""" diff --git a/browse/domain/identifier.py b/browse/domain/identifier.py index 50c15db16..08e315db7 100644 --- a/browse/domain/identifier.py +++ b/browse/domain/identifier.py @@ -4,7 +4,6 @@ from re import RegexFlag from typing import Match, Optional, Union, Tuple, Callable, List from arxiv import taxonomy -from arxiv.base.config import BASE_SERVER, EXTERNAL_URL_SCHEME # arXiv ID format used from 1991 to 2007-03 RE_ARXIV_OLD_ID = re.compile( @@ -211,24 +210,3 @@ def __eq__(self, other: object) -> bool: return self.__dict__ == other.__dict__ except AttributeError: return False - - -def canonical_url(id: str, version: int = 0)->str: - """ - Return canonical URL for this ID. - - This can be done from just the ID because the - category is only needed if it is in the ID. - id can be just the id or idv or cat/id or cat/idv - """ - # TODO: This should be better. - # There should probably be something like INTERNAL_URL_SCHEMA - # Also, /abs should probably be specified somewhere else - # like arxiv.base.canonical - # There should be a MAIN_HOSTNAME to decouple the canonical URLs - # from the hostname of the server they are being generated on. - # We might want hostnames like search.arxiv.org etc. 
- if version: - return f'{EXTERNAL_URL_SCHEME}://{BASE_SERVER}/abs/{id}v{version}' - else: - return f'{EXTERNAL_URL_SCHEME}://{BASE_SERVER}/abs/{id}' diff --git a/browse/domain/license.py b/browse/domain/license.py index 123e66a74..0dcf706bb 100644 --- a/browse/domain/license.py +++ b/browse/domain/license.py @@ -32,7 +32,6 @@ def license_for_recorded_license(recorded_uri: Optional[str]) -> str: failed to select a license. The classic submission system was explicitly written to not permit submitters to submit without selecting a license. - """ if recorded_uri is None: return str(ASSUMED_LICENSE_URI) diff --git a/browse/domain/listing.py b/browse/domain/listing.py new file mode 100644 index 000000000..81ac55e08 --- /dev/null +++ b/browse/domain/listing.py @@ -0,0 +1,125 @@ +"""Return types for listing service.""" + +from datetime import date +from typing import List, Tuple + +from mypy_extensions import TypedDict + + +ListingItem = TypedDict('ListingItem', + {'id': str, + 'listingType': str, + 'primary': str}) +"""A single item for a listing. + +The id is the arXiv ID and may be an idv. + +The listing type is one of 'new','rep','cross','wdr','jref'. These +would be extended with any new types of actions/events that can happen +in the arXiv system. + +primary is the primary category of the article. + +""" + + +ListingResponse = TypedDict('ListingResponse', + {'listings': List[ListingItem], + 'pubdates': List[Tuple[date, int]], + 'count': int, + 'expires': str}) +"""listings is the list of items for a time period. + +pubdates are the dates of publications. The int is the number of items +published on the associated date. + +count is the count of all the items in the listing for the query. + +expires is the time at which this data may no longer be cached. It +should be the sort of datetime that could go in an HTTP Expires response +header. It must be in rfc-1123 format ex. 
Wed, 22 Oct 2008 10:55:46 GMT +The timezone for this expires should be when the cache expires and does not need +to be the timezone of the listing service, listing client or web client. + +Why not just do listing: List[Tuple[date, List[ListingItem]]]? +Because pastweek needs to support counts for the days and needs to be +able to support skip/show. +""" + +NewResponse = TypedDict('NewResponse', + {'listings': List[ListingItem], + 'new_count': int, + 'cross_count': int, + 'rep_count': int, + 'announced': date, + 'submitted': Tuple[date, date], + 'expires': str}) +""" +listings is the list of items for the most recent publish cycle. + +announced is the date of the most recent publish cycle. + +new_count is the count of the new items in the listing for the query. +rep_count is the count of the rep items in the listing for the query. +cross_count is the count of the cross items in the listing for the query. + +submitted is the start date and end date of the period in which these items were submitted. + +expires is the time at which this data may no longer be cached. It +should be the sort of datetime that could go in an HTTP Expires response +header. It must be in rfc-1123 format ex. Wed, 22 Oct 2008 10:55:46 GMT +The timezone for this expires should be when the cache expires and does not need +to be the timezone of the listing service, listing client or web client. + +""" + + +NotModifiedResponse = TypedDict('NotModifiedResponse', + {'not_modified': bool, + 'expires': str}) +""" +Listing response that indicates that the listing has not been modified since +the date in the if-modified-since parameter. + +expires must be in rfc-1123 format ex. Wed, 22 Oct 2008 10:55:46 GMT +The timezone for this expires should be when the cache expires and does not need +to be the timezone of the listing service, listing client or web client. + +""" + + + +MonthCount = TypedDict('MonthCount', + {'year': str, + 'month': str, + 'new': int, + 'cross': int}) +"""A single month's count for an archive. 
+ +year is the year the listing is for. + +month is the month the listing is for. + +new is the count of new listings for that month. + +cross is the count of crosses for that month. + +rep is the count of replaced for that month. + +""" + + +ListingCountResponse = TypedDict('ListingCountResponse', + {'month_counts': List[MonthCount], + 'new_count': int, + 'cross_count': int}) +"""Response with the counts for an archive for a given year. + +month_counts are counts for individual months. + +new_count is the count of new articles for the year. + +cross_count is the count of cross articles for the year. + +rep_count is the count of replaced articles for the year. +""" diff --git a/browse/domain/metadata.py b/browse/domain/metadata.py index a4d050f2a..b17a1dadc 100644 --- a/browse/domain/metadata.py +++ b/browse/domain/metadata.py @@ -1,11 +1,12 @@ """Representations of arXiv document metadata.""" import collections -from typing import List, Optional, Iterator +from typing import List, Optional, Iterator, Set from datetime import datetime from dataclasses import dataclass, field from arxiv import taxonomy -from browse.domain.identifier import Identifier, canonical_url +from arxiv.base.urls import canonical_url +from browse.domain.identifier import Identifier from browse.domain.license import License from browse.domain.category import Category @@ -66,27 +67,13 @@ def __str__(self) -> str: return self.raw -@dataclass -class Archive(Category): - """Represents an arXiv archive--the middle level of the taxonomy.definitions.""" - - def __post_init__(self) -> None: - """Get the full archive name.""" - super().__post_init__() - if self.id in taxonomy.definitions.ARCHIVES: - self.name = taxonomy.definitions.ARCHIVES[self.id]['name'] +class Archive(taxonomy.Archive): + """Represents an arXiv archive--the middle level of the taxonomy.""" -@dataclass -class Group(Category): +class Group(taxonomy.Group): """Represents an arXiv group--the highest (most general) taxonomy level.""" - 
def __post_init__(self) -> None: - """Get the full group name.""" - super().__post_init__() - if self.id in taxonomy.definitions.GROUPS: - self.name = taxonomy.definitions.GROUPS[self.id]['name'] - @dataclass(frozen=True) class DocMetadata: @@ -212,9 +199,9 @@ def get_browse_context_list(self) -> List[str]: def highest_version(self) -> int: """Return highest version number from metadata. - This is determined by counting the entries in the - {history}. Return 1 if the metadata is private. Returns undef - if this object is not initialized. + This is determined by counting the entries in the {history}. + Return 1 if the metadata is private. Returns undef if this + object is not initialized. """ if self.private: return 1 @@ -229,7 +216,6 @@ def get_datetime_of_version( version: Version to get datetime of. Must be in range 1..highest_version. Uses highest_version if not specified. - """ if not version: version = self.highest_version() @@ -245,13 +231,13 @@ def get_datetime_of_version( else: return versions[0].submitted_date - def display_secondaries(self) -> List[str]: - """Unalias, dedup and sort secondaries for display.""" + def get_secondaries(self) -> Set[Category]: + """Unalias and deduplicate secondary categories.""" if not self.secondary_categories or not self.primary_category: - return [] + return set() def unalias(secs: Iterator[Category])->Iterator[Category]: - return map(lambda c: c.unalias(), secs) + return map(lambda c: Category(c.unalias()), secs) prim = self.primary_category.unalias() def de_prim(secs: Iterator[Category])->Iterator[Category]: @@ -259,15 +245,22 @@ def de_prim(secs: Iterator[Category])->Iterator[Category]: de_primaried = set(de_prim(unalias(iter(self.secondary_categories)))) if not de_primaried: - return [] + return set() + return de_primaried + + def display_secondaries(self) -> List[str]: + """Unalias, dedup and sort secondaries for display.""" + de_primaried = self.get_secondaries() def to_display(secs: List[Category]) -> List[str]: - 
return list(map(lambda c: c.display_str(), secs)) + return list(map(lambda c: str(c.display), secs)) return to_display(sorted(de_primaried)) def canonical_url(self, no_version: bool = False) -> str: - """Returns canonical URL for this ID and version.""" + """Return canonical URL for this ID and version.""" + url: str if no_version: - return canonical_url(self.arxiv_identifier.id) + url = canonical_url(self.arxiv_identifier.id) else: - return canonical_url(self.arxiv_identifier.idv) + url = canonical_url(self.arxiv_identifier.idv) + return url diff --git a/browse/exceptions.py b/browse/exceptions.py index cac4b96fc..f2c685b0d 100644 --- a/browse/exceptions.py +++ b/browse/exceptions.py @@ -26,7 +26,30 @@ def __init__(self, description: Optional[str] = None, def handle_abs_not_found(error: AbsNotFound) -> Response: """Render the base 404 error page for abs.""" rendered = render_template('abs/404.html', **error.data) - response = make_response(rendered) + response: Response = make_response(rendered) + response.status_code = status.HTTP_404_NOT_FOUND + return response + + +class TrackbackNotFound(HTTPException): + """Trackback not found HTTPException.""" + + code = 404 + description = 'Article does not exist' + + def __init__(self, description: Optional[str] = None, + response: Optional[Response] = None, + data: dict = {}) -> None: + """Override default to support data dict.""" + self.data = data + super(TrackbackNotFound, self).__init__(description, response) + + +@handler(TrackbackNotFound) +def handle_trackback_not_found(error: TrackbackNotFound) -> Response: + """Render the base 404 error page for tb.""" + rendered = render_template('tb/404.html', **error.data) + response: Response = make_response(rendered) response.status_code = status.HTTP_404_NOT_FOUND return response @@ -35,6 +58,6 @@ def handle_abs_not_found(error: AbsNotFound) -> Response: def handle_bad_request(error: BadRequest) -> Response: """Render the 400 error page for browse.""" rendered = 
render_template('400.html', error=error) - response = make_response(rendered) + response: Response = make_response(rendered) response.status_code = status.HTTP_400_BAD_REQUEST return response diff --git a/browse/factory.py b/browse/factory.py index 74f8021e4..1c897de54 100644 --- a/browse/factory.py +++ b/browse/factory.py @@ -1,25 +1,23 @@ """Application factory for browse service components.""" from functools import partial -from typing import Any -from flask import Flask, url_for -from browse.domain.identifier import canonical_url -from browse.util.clickthrough import create_ct_url -from browse.util.id_patterns import do_dois_id_urls_to_tags, do_id_to_tags, \ - do_dois_arxiv_ids_to_tags +from flask import Flask, g + +from arxiv.base.urls import canonical_url, clickthrough_url, urlizer from browse.routes import ui from browse.services.database import models from browse.services.util.email import generate_show_email_hash -from browse.filters import line_feed_to_br, tex_to_utf, entity_to_utf, \ - single_doi_url +from browse.filters import entity_to_utf +from browse.services.listing.fake_listings import FakeListingFilesService from arxiv.base.config import BASE_SERVER from arxiv.base import Base +from arxiv.users.auth import Auth def create_web_app() -> Flask: """Initialize an instance of the browse web application.""" app = Flask('browse', static_folder='static', template_folder='templates') - app.config.from_pyfile('config.py') + app.config.from_pyfile('config.py') # type: ignore # TODO Only needed until this route is added to arxiv-base if 'URLS' not in app.config: @@ -27,53 +25,29 @@ def create_web_app() -> Flask: app.config['URLS'].append( ('search_archive', '/search/', BASE_SERVER)) - models.init_app(app) + models.init_app(app) # type: ignore Base(app) + Auth(app) app.register_blueprint(ui.blueprint) - - ct_url_for = partial(create_ct_url, app.config.get( - 'CLICKTHROUGH_SECRET'), url_for) - + if not app.jinja_env.globals: app.jinja_env.globals = {} 
app.jinja_env.globals['canonical_url'] = canonical_url - def ct_single_doi_filter(doi: str)->str: - return single_doi_url(ct_url_for, doi) - - def _id_to_url(id: str)->Any: - return url_for('browse.abstract', arxiv_id=id) - - def contextualized_id_filter(text: str)->str: - return do_id_to_tags(_id_to_url, text) - - def contextualized_doi_id_url_filter(text: str)->str: - return do_dois_id_urls_to_tags(_id_to_url, ct_url_for, text) - - def ct_doi_filter(text: str)->str: - return do_dois_arxiv_ids_to_tags(_id_to_url, - ct_url_for, - text) - if not app.jinja_env.filters: app.jinja_env.filters = {} - app.jinja_env.filters['line_feed_to_br'] = line_feed_to_br - app.jinja_env.filters['tex_to_utf'] = partial( tex_to_utf, letters=True ) - app.jinja_env.filters['tex_to_utf_no_symb'] = partial( tex_to_utf, letters=False) - app.jinja_env.filters['entity_to_utf'] = entity_to_utf - app.jinja_env.filters['clickthrough_url_for'] = ct_url_for + app.jinja_env.filters['clickthrough_url_for'] = clickthrough_url app.jinja_env.filters['show_email_hash'] = \ partial(generate_show_email_hash, secret=app.config.get('SHOW_EMAIL_SECRET')) - app.jinja_env.filters['single_doi_url'] = ct_single_doi_filter - app.jinja_env.filters['arxiv_id_urls'] = contextualized_id_filter - app.jinja_env.filters['arxiv_urlize'] = contextualized_doi_id_url_filter - app.jinja_env.filters['arxiv_id_doi_filter'] = ct_doi_filter + app.jinja_env.filters['arxiv_id_urls'] = urlizer(['arxiv_id']) + app.jinja_env.filters['arxiv_urlize'] = urlizer(['arxiv_id', 'doi', 'url']) + app.jinja_env.filters['arxiv_id_doi_filter'] = urlizer(['arxiv_id', 'doi']) return app diff --git a/browse/filters.py b/browse/filters.py index 7c0851c3c..4503c422a 100644 --- a/browse/filters.py +++ b/browse/filters.py @@ -1,12 +1,10 @@ """Browse jinja filters.""" import re -from urllib import parse -from typing import Callable, Union +from typing import Union -from jinja2 import Markup, escape +from jinja2 import Markup import html -from 
browse.services.util.tex2utf import tex2utf JinjaFilterInput = Union[Markup, str] """ @@ -19,40 +17,8 @@ """ -def single_doi_url(clickthrough_url_for: Callable[[str], str], - doi: JinjaFilterInput) -> Markup: - """ - DOI is made into a link. - - This expects a DOI ONLY. It should not be used on general text. - - This link is not through clickthrough. Use an additional filter in - the template to get that. - - How does this ensure escaping? It expects just a DOI, The result - is created as a properly escaped Markup. - """ - doi_url = f'https://dx.doi.org/{parse.quote_plus(doi)}' - ct_url = clickthrough_url_for(doi_url) - return Markup(f'{escape(doi)}') - - -def line_feed_to_br(text: JinjaFilterInput) -> Markup: - """Lines that start with two spaces should be broken.""" - if hasattr(text, '__html__'): - etxt = text - else: - etxt = Markup(escape(text)) - - # if line starts with spaces, replace the white space with - br = re.sub(r'((?', etxt) - dedup = re.sub(r'\n\n', '\n', br) # skip if blank - return Markup(dedup) - - def entity_to_utf(text: str) -> str: - """ - Convert HTML entities to unicode. + """Convert HTML entities to unicode. For example '&amp;' becomes '&'. @@ -78,23 +44,3 @@ def entity_to_utf(text: str) -> str: with_lt_gt = re.sub('XXX_GREATER_THAN_XXX', '>', with_lt) return Markup(with_lt_gt) - - -def tex_to_utf(text: JinjaFilterInput, letters: bool=True) -> Markup: - """ - Convert some TeX accents and symbols to UTF-8 characters. - - :param text: Text to filter. - - :param letters: If False, do not convert greek symbols. Greek - symbols can cause problems. Ex \phi is not suppose to look like φ. - φ looks like \varphi to someone use to TeX. 
- See ARXIVNG-1612 - - :returns: Jinja Markup of filtered text - """ - if hasattr(text, '__html__'): - # Need to unescape so nothing that is tex is escaped - return Markup(escape(tex2utf(text.unescape(), letters=letters))) # type: ignore - else: - return Markup(escape(tex2utf(text, letters=letters))) diff --git a/browse/routes/ui.py b/browse/routes/ui.py index 6b38f960c..237eb0eeb 100644 --- a/browse/routes/ui.py +++ b/browse/routes/ui.py @@ -1,22 +1,36 @@ """Provides the user intefaces for browse.""" import re -from typing import Union +from datetime import datetime +from typing import Callable, Dict, Mapping, Union from flask import Blueprint, render_template, request, Response, session, \ - redirect, current_app + current_app, url_for, redirect from werkzeug.exceptions import InternalServerError, BadRequest, NotFound + from arxiv import status -from browse.controllers import abs_page +from arxiv.base import logging +from arxiv.base.urls.clickthrough import is_hash_valid +from browse.controllers import abs_page, archive_page, home_page, list_page, \ + prevnext, tb_page, stats_page +from browse.controllers.cookies import get_cookies_page, cookies_to_set from browse.exceptions import AbsNotFound -from browse.util.clickthrough import is_hash_valid from browse.services.database import get_institution +from browse.controllers.year import year_page + +logger = logging.getLogger(__name__) + +blueprint = Blueprint('browse', __name__, url_prefix='/') -blueprint = Blueprint('browse', __name__, url_prefix='') + +@blueprint.context_processor +def inject_now() -> Dict[str, datetime]: + return dict(request_datetime=datetime.now()) @blueprint.before_request def before_request() -> None: """Get instituional affiliation from session.""" if 'institution' not in session: + logger.debug('Adding institution to session') session['institution'] = get_institution(request.remote_addr) @@ -30,14 +44,25 @@ def apply_response_headers(response: Response) -> Response: return response 
-@blueprint.route('/abs', methods=['GET']) +@blueprint.route('index', methods=['GET']) +@blueprint.route('/', methods=['GET']) +def home() -> Response: + """Home page view.""" + response, code, headers = home_page.get_home_page() + if code == status.HTTP_200_OK: + return render_template('home/home.html', **response), code, headers # type: ignore + + raise InternalServerError('Unexpected error') + + +@blueprint.route('abs', methods=['GET']) def bare_abs() -> Response: """Check several legacy request parameters.""" if request.args: if 'id' in request.args: - return abstract(request.args['id']) + return abstract(request.args['id']) # type: ignore elif 'archive' in request.args and 'papernum' in request.args: - return abstract( + return abstract( # type: ignore f"{request.args['archive']}/{request.args['papernum']}") else: for param in request.args: @@ -45,14 +70,14 @@ def bare_abs() -> Response: # e.g. /abs?/\d{7} if not request.args[param] \ and re.match(r'^[a-z\-]+(\.[A-Z]{2})?\/\d{7}$', param): - return abstract(param) + return abstract(param) # type: ignore """Return abs-specific 404.""" raise AbsNotFound -@blueprint.route('/abs/', methods=['GET'], defaults={'arxiv_id': ''}) -@blueprint.route('/abs/', methods=['GET']) +@blueprint.route('abs/', methods=['GET'], defaults={'arxiv_id': ''}) +@blueprint.route('abs/', methods=['GET']) def abstract(arxiv_id: str) -> Response: """Abstract (abs) page view.""" response, code, headers = abs_page.get_abs_page(arxiv_id) @@ -64,97 +89,193 @@ def abstract(arxiv_id: str) -> Response: return Response( response['abs_meta'].raw_safe, mimetype='text/plain') - return render_template('abs/abs.html', **response), code, headers + return render_template('abs/abs.html', **response), code, headers # type: ignore elif code == status.HTTP_301_MOVED_PERMANENTLY: - return redirect(headers['Location'], code=code) + return redirect(headers['Location'], code=code) # type: ignore elif code == status.HTTP_304_NOT_MODIFIED: - return '', code, headers 
+ return '', code, headers # type: ignore + + raise InternalServerError('Unexpected error') + + +@blueprint.route('tb/', defaults={'arxiv_id': ''}, methods=['GET']) +@blueprint.route('tb/', methods=['GET']) +def tb(arxiv_id: str) -> Response: + """Get trackbacks associated with an article.""" + response, code, headers = tb_page.get_tb_page(arxiv_id) + + if code == status.HTTP_200_OK: + return render_template('tb/tb.html', **response), code, headers # type: ignore + elif code == status.HTTP_301_MOVED_PERMANENTLY: + return redirect(headers['Location'], code=code) # type: ignore + raise InternalServerError('Unexpected error') + + +@blueprint.route('tb/recent', methods=['GET', 'POST']) +def tb_recent() -> Response: + """Get the recent trackbacks that have been posted across the site.""" + response, code, headers = tb_page.get_recent_tb_page(request.form) + + if code == status.HTTP_200_OK: + return render_template('tb/recent.html', **response), code, headers # type: ignore + raise InternalServerError('Unexpected error') + + +@blueprint.route('tb/redirect/', + methods=['GET'], + defaults={'trackback_id': '', 'hashed_document_id': ''}) +@blueprint.route('tb/redirect//', + methods=['GET']) +def tb_redirect(trackback_id: str, hashed_document_id: str) -> Response: + """Get the trackback redirect link.""" + response, code, headers = tb_page.get_tb_redirect(trackback_id, + hashed_document_id) + if code == status.HTTP_301_MOVED_PERMANENTLY: + return redirect(headers['Location'], code=code) # type: ignore + raise InternalServerError('Unexpected error') + +@blueprint.route('prevnext', methods=['GET', 'POST']) +def previous_next() -> Union[str, Response]: + """Previous/Next navigation used on /abs page.""" + if not request.args: + raise BadRequest + response, code, headers = prevnext.get_prevnext(request.args) + if code == status.HTTP_301_MOVED_PERMANENTLY: + return redirect(headers['Location'], code=code) # type: ignore raise InternalServerError('Unexpected error') 
-@blueprint.route('/trackback/', methods=['GET'], defaults={'arxiv_id': ''}) -@blueprint.route('/trackback/', methods=['GET', 'POST']) +@blueprint.route('trackback/', methods=['GET'], defaults={'arxiv_id': ''}) +@blueprint.route('trackback/', methods=['GET', 'POST']) def trackback(arxiv_id: str) -> Union[str, Response]: """Route to define new trackbacks for papers.""" raise InternalServerError(f'Not Yet Implemented {arxiv_id}') -@blueprint.route('/ct') +@blueprint.route('ct') def clickthrough() -> Response: """Generate redirect for clickthrough links.""" if 'url' in request.args and 'v' in request.args: if is_hash_valid(current_app.config['CLICKTHROUGH_SECRET'], request.args.get('url'), request.args.get('v')): - return redirect(request.args.get('url')) + return redirect(request.args.get('url')) # type: ignore else: raise BadRequest('Bad click-through redirect') raise NotFound -@blueprint.route('/list//') -def list_articles(current_context: str, yymm: str) -> Response: +@blueprint.route('list', defaults={'context': '', 'subcontext': ''}, + methods=['GET', 'POST']) +@blueprint.route('list/', defaults={'context': '', 'subcontext': ''}, + methods=['GET', 'POST']) +@blueprint.route('list//', methods=['GET', 'POST']) +def list_articles(context: str, subcontext: str) -> Response: """ List articles by context, month etc. - Context might be a context or an archive - Subcontext should be 'recent' 'new' or a string of format yymm + Context might be a context or an archive; Subcontext should be + 'recent', 'new' or a string of format YYMM. 
""" - raise InternalServerError(f'Not yet implemented {current_context} {yymm}') + response, code, headers = \ + list_page.get_listing(context, subcontext) # type: ignore + if code == status.HTTP_200_OK: + # TODO if it is a HEAD request we don't want to render the template + return render_template(response['template'], **response), code, headers # type: ignore + elif code == status.HTTP_301_MOVED_PERMANENTLY: + return redirect(headers['Location'], code=code) # type: ignore + elif code == status.HTTP_304_NOT_MODIFIED: + return '', code, headers # type: ignore + return response, code, headers # type: ignore + + +@blueprint.route('stats/', + methods=['GET']) +def stats(command: str) -> Response: + """Display various statistics about the service.""" + params: Dict = {} + if request.args and 'date' in request.args: + params['requested_date_str'] = str(request.args['date']) + + getters: Mapping[str, Mapping[str, Union[Callable, Union[Dict, Mapping]]]] = { + 'today': {'func': stats_page.get_hourly_stats_page, 'params': params}, + 'monthly_submissions': + {'func': stats_page.get_monthly_submissions_page, 'params': {}}, + 'monthly_downloads': + {'func': stats_page.get_monthly_downloads_page, 'params': {}} + } + csv_getters: Mapping[str, Mapping[str, Union[Callable, Union[Dict, Mapping]]]] = { + 'get_hourly': + {'func': stats_page.get_hourly_stats_csv, 'params': params}, + 'get_monthly_downloads': + {'func': stats_page.get_download_stats_csv, 'params': {}}, + 'get_monthly_submissions': + {'func': stats_page.get_submission_stats_csv, 'params': {}} + } + if not command: + raise NotFound + if command in csv_getters: + csv_getter_params: Mapping = csv_getters[command]['params'] # type: ignore + [response, code, headers] = csv_getters[command]['func']( # type: ignore + **csv_getter_params) + if code == status.HTTP_200_OK: + return response['csv'], code, headers # type: ignore + elif command in getters: + getter_params: Mapping = getters[command]['params'] # type: ignore + 
[response, code, headers] = getters[command]['func'](**getter_params) # type: ignore + if code == status.HTTP_200_OK: + return render_template(f'stats/{command}.html', **response), code, headers # type: ignore + else: + raise NotFound + raise InternalServerError('Unexpected error') -@blueprint.route('/format/') +@blueprint.route('format/') def format(arxiv_id: str) -> Response: """Get formats article.""" raise InternalServerError(f'Not yet implemented {arxiv_id}') -@blueprint.route('/pdf/') +@blueprint.route('pdf/') def pdf(arxiv_id: str) -> Response: """Get PDF for article.""" raise InternalServerError(f'Not yet implemented {arxiv_id}') -@blueprint.route('/div/') +@blueprint.route('div/') def div(arxiv_id: str) -> Response: """Get div for article.""" raise InternalServerError(f'Not yet implemented {arxiv_id}') -@blueprint.route('/e-print/') +@blueprint.route('e-print/') def eprint(arxiv_id: str) -> Response: """Get e-print (source) for article.""" raise InternalServerError(f'Not yet implemented {arxiv_id}') -@blueprint.route('/html/') +@blueprint.route('html/') def html(arxiv_id: str) -> Response: """Get html for article.""" raise InternalServerError(f'Not yet implemented {arxiv_id}') -@blueprint.route('/ps/') +@blueprint.route('ps/') def ps(arxiv_id: str) -> Response: """Get ps for article.""" raise InternalServerError(f'Not yet implemented {arxiv_id}') -@blueprint.route('/src//anc', defaults={'file_name': None}) -@blueprint.route('/src//anc/') +@blueprint.route('src//anc', defaults={'file_name': None}) +@blueprint.route('src//anc/') def src(arxiv_id: str, file_name: str) -> Response: """Get src for article.""" raise InternalServerError(f'Not Yet Implemented {arxiv_id} {file_name}') -@blueprint.route('/tb/') -def tb(arxiv_id: str) -> Response: - """Get trackbacks for article.""" - raise InternalServerError(f'Not yet implemented {arxiv_id}') - - -@blueprint.route('/show-email//') +@blueprint.route('show-email//') def show_email(show_email_hash: str, arxiv_id: str) 
-> Response: """Show the email for the submitter for an article.""" raise InternalServerError( @@ -164,25 +285,66 @@ def show_email(show_email_hash: str, arxiv_id: str) -> Response: # Maybe auth protected URL in arxiv-browse? # ('will the auth service allow paths not defined in it's # repo to be protected?') -@blueprint.route('/auth/show-endorsers/') +@blueprint.route('auth/show-endorsers/') def show_endorsers(arxiv_id: str) -> Response: """Show endorsers for an article.""" raise InternalServerError(f'Not yet implemented {arxiv_id}') -@blueprint.route('/refs/') +@blueprint.route('refs/') def refs(arxiv_id: str) -> Response: """Show the references for an article.""" raise InternalServerError(f'Not yet implemented {arxiv_id}') -@blueprint.route('/cits/') +@blueprint.route('cits/') def cits(arxiv_id: str) -> Response: """Show the citations for an artcile.""" raise InternalServerError(f'Not yet implemented {arxiv_id}') -@blueprint.route('/form') +@blueprint.route('form') def form(arxiv_id: str) -> Response: """Old form interface to lists of articles.""" raise InternalServerError(f'Not yet implemented {arxiv_id}') + + +@blueprint.route('archive/', defaults={'archive': None}) +@blueprint.route('archive/') +def archive(archive: str): # type: ignore + """Landing page for an archive.""" + response, code, headers = archive_page.get_archive(archive) # type: ignore + if code == status.HTTP_200_OK or code == status.HTTP_404_NOT_FOUND: + return render_template(response['template'], **response), code, headers + elif code == status.HTTP_301_MOVED_PERMANENTLY: + return redirect(headers['Location'], code=code) + elif code == status.HTTP_304_NOT_MODIFIED: + return '', code, headers + return response, code, headers + + +@blueprint.route('year/', defaults={'year': None}) +@blueprint.route('year//', defaults={'year': None}, strict_slashes=False) +@blueprint.route('year///') +@blueprint.route('year//') +def year(archive: str, year: int): # type: ignore + """Year's stats for an 
archive.""" + response, code, headers = year_page(archive, year) + if code == status.HTTP_307_TEMPORARY_REDIRECT: + return '', code, headers + return render_template('year.html', **response), code, headers + + +@blueprint.route('cookies', defaults={'set': ''}) +@blueprint.route('cookies/', methods=['POST', 'GET']) +def cookies(set): # type: ignore + """Cookies landing page and setter.""" + is_debug = request.args.get('debug', None) is not None + if request.method == 'POST': + debug = {'debug': '1'} if is_debug else {} + resp = redirect(url_for('browse.cookies', **debug)) + for ctoset in cookies_to_set(request): + resp.set_cookie(**ctoset) # type: ignore + return resp + response, code, headers = get_cookies_page(is_debug) + return render_template('cookies.html', **response), code, headers diff --git a/browse/services/database/__init__.py b/browse/services/database/__init__.py index 8ef7347ac..ea746fa62 100644 --- a/browse/services/database/__init__.py +++ b/browse/services/database/__init__.py @@ -1,9 +1,10 @@ """Import db instance and define utility functions.""" import ipaddress -from datetime import datetime +from datetime import date, datetime from dateutil.tz import tzutc, gettz -from typing import List, Optional, Any, Callable +from typing import List, Optional, Any, Callable, Tuple +from sqlalchemy import not_, desc, asc from sqlalchemy.sql import func from sqlalchemy.orm import Query from sqlalchemy.orm.exc import NoResultFound @@ -12,12 +13,16 @@ from browse.services.database.models import db, Document, \ MemberInstitution, MemberInstitutionIP, TrackbackPing, SciencewisePing, \ - DBLP, DBLPAuthor, DBLPDocumentAuthor + DBLP, DBLPAuthor, DBLPDocumentAuthor, StatsMonthlySubmission, \ + StatsMonthlyDownload +from browse.services.database.models import in_category, stats_hourly +from browse.domain.identifier import Identifier from arxiv.base import logging from logging import Logger logger = logging.getLogger(__name__) app_config = get_application_config() +tz = 
gettz(app_config.get('ARXIV_BUSINESS_TZ', 'US/Eastern')) def db_handle_error(logger: Logger, default_return_val: Any) \ @@ -96,9 +101,52 @@ def get_all_trackback_pings() -> List[TrackbackPing]: @db_handle_error(logger=logger, default_return_val=[]) -def get_trackback_pings(paper_id: str) -> List[TrackbackPing]: +def get_paper_trackback_pings(paper_id: str) -> List[TrackbackPing]: """Get trackback pings for a particular document (paper_id).""" - return list(__paper_trackbacks_query(paper_id).all()) + return list(__paper_trackbacks_query(paper_id) + .distinct(TrackbackPing.url) + .group_by(TrackbackPing.url) + .order_by(TrackbackPing.posted_date.desc()).all()) + + +@db_handle_error(logger=logger, default_return_val=None) +def get_trackback_ping(trackback_id: int) -> Optional[TrackbackPing]: + """Get an individual trackback ping by its id (trackback_id).""" + trackback: TrackbackPing = db.session.query(TrackbackPing).\ + filter(TrackbackPing.trackback_id == trackback_id).first() + return trackback + + +@db_handle_error(logger=logger, default_return_val=list()) +def get_recent_trackback_pings(max_trackbacks: int = 25) \ + -> List[Tuple[TrackbackPing, str, str]]: + """Get recent trackback pings across all of arXiv.""" + max_trackbacks = min(max(max_trackbacks, 0), 500) + if max_trackbacks == 0: + return list() + + # subquery to get the specified number of distinct trackback URLs + stmt = ( + db.session.query(TrackbackPing.url). + filter(TrackbackPing.status == 'accepted'). + distinct(TrackbackPing.url). + group_by(TrackbackPing.url). + order_by(TrackbackPing.posted_date.desc()). + limit(max_trackbacks). 
+ subquery() + ) + tb_doc_tup = db.session.query( + TrackbackPing, + Document.paper_id, + Document.title + ).\ + join(Document, TrackbackPing.document_id == Document.document_id).\ + filter(TrackbackPing.status == 'accepted').\ + filter(TrackbackPing.url == stmt.c.url).\ + order_by(TrackbackPing.posted_date.desc()).\ + all() + + return list(tb_doc_tup) @db_handle_error(logger=logger, default_return_val=None) @@ -155,3 +203,139 @@ def get_dblp_authors(paper_id: str) -> List[str]: order_by(DBLPDocumentAuthor.position).all() authors = [a for (a,) in authors_t] return authors + + +@db_handle_error(logger=logger, default_return_val=None) +def get_document_count() -> Optional[int]: + """Get the number of documents.""" + # func.count is used here because .count() forces a subquery which + # is inefficient + row = db.session.query( + func.count(Document.document_id).label('num_documents') + ).filter(not_(Document.paper_id.like('test%'))).first() + return row.num_documents + + +@db_handle_error(logger=logger, default_return_val=None) +def get_sequential_id(paper_id: Identifier, + context: str = 'all', + is_next: bool = True) -> Optional[str]: + """Get the next or previous paper ID in sequence.""" + query = db.session.query(Document.paper_id) + if paper_id.is_old_id: + # NB: classic did not support old identifiers in prevnext + if context == 'all': + like_id = f'{paper_id.archive}/{paper_id.yymm}%' + else: + like_id = f'%/{paper_id.yymm}%' + else: + like_id = f'{paper_id.yymm}.%' + query = query.filter(Document.paper_id.like(like_id)) + + if is_next: + query = query.filter(Document.paper_id > paper_id.id). \ + order_by(asc(Document.paper_id)) + else: + query = query.filter(Document.paper_id < paper_id.id). \ + order_by(desc(Document.paper_id)) + if context != 'all': + archive: str = context + subject_class: str = '' + if '.' 
in archive: + (archive, subject_class) = archive.split('.', 1) + query = query.join(in_category).filter( + in_category.c.archive == archive) + if subject_class: + query = query.filter( + in_category.c.subject_class == subject_class) + + result = query.first() + + if result: + return f'{result.paper_id}' + return None + + +def __all_hourly_stats_query() -> Query: + return db.session.query(stats_hourly) + + +@db_handle_error(logger=logger, default_return_val=(0, 0, 0)) +def get_hourly_stats_count(stats_date: Optional[date]) -> Tuple[int, int, int]: + """Get normal and admin connection totals and the max node number for a given date.""" + stats_date = date.today() if not isinstance(stats_date, date) \ + else stats_date + normal_count = 0 + admin_count = 0 + num_nodes = 0 + rows = db.session.query( + func.sum(stats_hourly.c.connections).label('num_connections'), + stats_hourly.c.access_type, + func.max(stats_hourly.c.node_num).label('num_nodes')).\ + filter(stats_hourly.c.ymd == stats_date.isoformat()).\ + group_by(stats_hourly.c.access_type).all() + for r in rows: + if r.access_type == 'A': + admin_count = r.num_connections + else: + normal_count = r.num_connections + num_nodes = r.num_nodes + return (normal_count, admin_count, num_nodes) + + +@db_handle_error(logger=logger, default_return_val=[]) +def get_hourly_stats(stats_date: Optional[date] = None) -> List: + """Get the hourly stats for a given date.""" + stats_date = date.today() if not isinstance(stats_date, date) \ + else stats_date + + return list(__all_hourly_stats_query(). + filter(stats_hourly.c.access_type == 'N', + stats_hourly.c.ymd == stats_date.isoformat()). + order_by(asc(stats_hourly.c.hour), stats_hourly.c.node_num). + all()) + + +@db_handle_error(logger=logger, default_return_val=[]) +def get_monthly_submission_stats() -> List: + """Get the monthly submission stats.""" + return list(db.session.query(StatsMonthlySubmission). 
+ order_by(asc(StatsMonthlySubmission.ym)).all()) + + +@db_handle_error(logger=logger, default_return_val=(0, 0)) +def get_monthly_submission_count() -> Tuple[int, int]: + """Get submission totals: number of submissions and number migrated.""" + row = db.session.query( + func.sum( + StatsMonthlySubmission.num_submissions).label('num_submissions'), + func.sum( + StatsMonthlySubmission.historical_delta).label('num_migrated') + ).first() + return (row.num_submissions, row.num_migrated) + + +@db_handle_error(logger=logger, default_return_val=[]) +def get_monthly_download_stats() -> List: + """Get all the monthly download stats.""" + return list(db.session.query(StatsMonthlyDownload). + order_by(asc(StatsMonthlyDownload.ym)).all()) + + +@db_handle_error(logger=logger, default_return_val=0) +def get_monthly_download_count() -> int: + """Get the sum of monthly downloads for all time.""" + row = db.session.query( + func.sum(StatsMonthlyDownload.downloads).label('total_downloads') + ).first() + total_downloads: int = row.total_downloads if row else 0 + return total_downloads + + +@db_handle_error(logger=logger, default_return_val=None) +def get_max_download_stats_dt() -> Optional[datetime]: + """Get the datetime of the most recent download stats.""" + row = db.session.query( + func.max(StatsMonthlyDownload.ym).label('max_ym') + ).first() + return row.max_ym if row else None diff --git a/browse/services/database/models.py b/browse/services/database/models.py index 89acb9d42..3d91f78d4 100644 --- a/browse/services/database/models.py +++ b/browse/services/database/models.py @@ -1,14 +1,26 @@ """arXiv browse database models.""" +import re +import hashlib from typing import Optional +from validators import url as is_valid_url +from datetime import datetime +from dateutil.tz import tzutc, gettz from flask_sqlalchemy import SQLAlchemy -from sqlalchemy import BigInteger, Column, DateTime, Enum, ForeignKey, Index, \ - Integer, SmallInteger, String, text, Text +from sqlalchemy import 
BigInteger, Column, Date, DateTime, Enum, ForeignKey, \ + ForeignKeyConstraint, Index, \ + Integer, SmallInteger, String, Table, text, Text from sqlalchemy.orm import relationship from werkzeug.local import LocalProxy +from arxiv.base.globals import get_application_config db: SQLAlchemy = SQLAlchemy() +app_config = get_application_config() +tz = gettz(app_config.get('ARXIV_BUSINESS_TZ', 'US/Eastern')) +tb_secret = app_config.get('TRACKBACK_SECRET', 'baz') +metadata = db.metadata + class Document(db.Model): """Model for documents stored as part of the arXiv repository.""" @@ -30,6 +42,9 @@ class Document(db.Model): created = Column(DateTime) submitter = relationship('User') + trackback_ping = relationship('TrackbackPing', + primaryjoin="foreign(TrackbackPing.document_id)==Document.document_id") + class License(db.Model): """Model for arXiv licenses.""" @@ -245,6 +260,32 @@ class TrackbackPing(db.Model): server_default=text("'pending'")) site_id = Column(Integer) + document = relationship('Document', + primaryjoin="foreign(Document.document_id)==TrackbackPing.document_id") + + @property + def posted_datetime(self) -> DateTime: + """Get posted_date as UTC datetime.""" + dt = datetime.fromtimestamp(self.posted_date, tz=tz) + return dt.astimezone(tz=tzutc()) + + @property + def display_url(self) -> str: + """Get the URL without the protocol, for display.""" + return re.sub(r'^[a-z]+:\/\/', '', # type: ignore + self.url.strip(), flags=re.IGNORECASE) + + @property + def has_valid_url(self) -> bool: + """Determine whether the trackback URL is valid.""" + return bool(is_valid_url(self.url, public=False)) + + @property + def hashed_document_id(self) -> str: + """Get the hashed document_id.""" + s = f'{self.document_id}{self.trackback_id}{tb_secret}' + return hashlib.md5(s.encode()).hexdigest()[0:9] + class TrackbackSite(db.Model): """Model for sites that submit trackbacks to arXiv.""" @@ -293,6 +334,124 @@ class DBLPDocumentAuthor(db.Model): document = 
relationship('Document') +class Category(db.Model): + """Model for category in taxonomy.""" + + __tablename__ = 'arXiv_categories' + + archive = Column(ForeignKey('arXiv_archives.archive_id'), + primary_key=True, + nullable=False, server_default=text("''")) + subject_class = Column(String(16), primary_key=True, + nullable=False, server_default=text("''")) + definitive = Column(Integer, nullable=False, server_default=text("'0'")) + active = Column(Integer, nullable=False, server_default=text("'0'")) + category_name = Column(String(255)) + endorse_all = Column(Enum('y', 'n', 'd'), nullable=False, + server_default=text("'d'")) + endorse_email = Column(Enum('y', 'n', 'd'), + nullable=False, server_default=text("'d'")) + papers_to_endorse = Column( + SmallInteger, nullable=False, server_default=text("'0'")) + endorsement_domain = Column(ForeignKey( + 'arXiv_endorsement_domains.endorsement_domain'), index=True) + + arXiv_archive = relationship('Archive') + arXiv_endorsement_domain = relationship('EndorsementDomain') + + +class Archive(db.Model): + """Model for archive in taxonomy.""" + + __tablename__ = 'arXiv_archives' + + archive_id = Column(String(16), primary_key=True, + server_default=text("''")) + in_group = Column(ForeignKey('arXiv_groups.group_id'), + nullable=False, index=True, server_default=text("''")) + archive_name = Column(String(255), nullable=False, + server_default=text("''")) + start_date = Column(String(4), nullable=False, server_default=text("''")) + end_date = Column(String(4), nullable=False, server_default=text("''")) + subdivided = Column(Integer, nullable=False, server_default=text("'0'")) + + arXiv_group = relationship('Group') + + +class Group(db.Model): + """Model for group in taxonomy.""" + + __tablename__ = 'arXiv_groups' + + group_id = Column(String(16), primary_key=True, server_default=text("''")) + group_name = Column(String(255), nullable=False, server_default=text("''")) + start_year = Column(String(4), nullable=False, 
server_default=text("''")) + + +class EndorsementDomain(db.Model): + """Model for endorsement domain.""" + + __tablename__ = 'arXiv_endorsement_domains' + + endorsement_domain = Column( + String(32), primary_key=True, server_default=text("''")) + endorse_all = Column(Enum('y', 'n'), nullable=False, + server_default=text("'n'")) + mods_endorse_all = Column( + Enum('y', 'n'), nullable=False, server_default=text("'n'")) + endorse_email = Column(Enum('y', 'n'), nullable=False, + server_default=text("'y'")) + papers_to_endorse = Column( + SmallInteger, nullable=False, server_default=text("'4'")) + + +in_category = Table( + 'arXiv_in_category', metadata, + Column('document_id', ForeignKey('arXiv_documents.document_id'), + nullable=False, index=True, server_default=text("'0'")), + Column('archive', String(16), nullable=False, server_default=text("''")), + Column('subject_class', String(16), + nullable=False, server_default=text("''")), + Column('is_primary', Integer, nullable=False, server_default=text("'0'")), + ForeignKeyConstraint(['archive', 'subject_class'], [ + 'arXiv_categories.archive', + 'arXiv_categories.subject_class']), + Index('archive', 'archive', 'subject_class', 'document_id', unique=True), + Index('arXiv_in_category_mp', 'archive', 'subject_class') +) + + +class StatsMonthlyDownload(db.Model): + """Model for monthly article download statistics.""" + + __tablename__ = 'arXiv_stats_monthly_downloads' + + ym = Column(Date, primary_key=True) + downloads = Column(Integer, nullable=False) + + +class StatsMonthlySubmission(db.Model): + """Model for monthly submission statistics.""" + + __tablename__ = 'arXiv_stats_monthly_submissions' + + ym = Column(Date, primary_key=True, + server_default=text("'0000-00-00'")) + num_submissions = Column(SmallInteger, nullable=False) + historical_delta = Column(Integer, nullable=False, + server_default=text("'0'")) + + +stats_hourly = Table( + 'arXiv_stats_hourly', metadata, + Column('ymd', Date, nullable=False, index=True), + 
Column('hour', Integer, nullable=False, index=True), + Column('node_num', Integer, nullable=False, index=True), + Column('access_type', String(1), nullable=False, index=True), + Column('connections', Integer, nullable=False) +) + + def init_app(app: Optional[LocalProxy]) -> None: """Set configuration defaults and attach session to the application.""" db.init_app(app) diff --git a/browse/services/document/author_affil.py b/browse/services/document/author_affil.py deleted file mode 100644 index 356d4c135..000000000 --- a/browse/services/document/author_affil.py +++ /dev/null @@ -1,363 +0,0 @@ -"""Parse Authors lines to extract author and affiliation data.""" -import re -from itertools import dropwhile -from typing import Dict, Iterator, List, Tuple - -from browse.services.util.tex2utf import tex2utf - -PREFIX_MATCH = 'van|der|de|la|von|del|della|da|mac|ter|dem|di|vaziri' - -""" -Takes data from an Author: line in the current arXiv abstract -file and returns a structured set of data: - - author_list_ptr = [ - [ author1_keyname, author1_firstnames, author1_suffix, affil1, affil2 ] , - [ author2_keyname, author2_firstnames, author1_suffix, affil1 ] , - [ author3_keyname, author3_firstnames, author1_suffix ] - ] - -Abstracted from Dienst software for OAI1 and other uses. This -routine should just go away when a better metadata structure is -adopted that deals with names and affiliations properly. - -Must remember that there is at least one person one the archive -who has only one name, this should clearly be considered the key name. - -Code originally written by Christina Scovel, Simeon Warner Dec99/Jan00 - 2000-10-16 - separated. - 2000-12-07 - added support for suffix - 2003-02-14 - get surname prefixes from arXiv::Filters::Index [Simeon] - 2007-10-01 - created test script, some tidying [Simeon] - 2018-05-25 - Translated from Perl to Python [Brian C.] 
-""" - - -def parse_author_affil(authors: str) -> List[List[str]]: - """Parse author line and returns an list of author and affiliation data. - - The list for each author will have at least three elements for - keyname, firstname(s) and suffix. The keyname will always have content - but the other strings might be empty strings if there is no firstname - or suffix. Any additional elements after the first three are affiliations, - there may be zero or more. - - Handling of prefix "XX collaboration" etc. is duplicated here and in - arXiv::HTML::AuthorLink -- it shouldn't be. Likely should just be here. - - This routine is just a wrapper around the two parts that first split - the authors line into parts, and then back propagate the affiliations. - The first part is to be used along for display where we do not want - to back propagate affiliation information. - - :param authors: string of authors from abs file or similar - :return: - Returns a structured set of data: - author_list_ptr = [ - [ author1_keyname, author1_firstnames, author1_suffix, affil1, affil2 ], - [ author2_keyname, author2_firstnames, author1_suffix, affil1 ] , - [ author3_keyname, author3_firstnames, author1_suffix ] - ] - """ - return _parse_author_affil_back_propagate( - **_parse_author_affil_split(authors)) - - -def _parse_author_affil_split(author_line: str) -> Dict: - """ - Split author line into author and affiliation data. - - Take author line, tidy spacing and punctuation, and then split up into - individual author an affiliation data. Has special cases to avoid splitting - an initial collaboration name and records in $back_propagate_affiliation_to - the fact that affiliations should not be back propagated to collaboration - names. - - Does not handle multiple collaboration names. 
- """ - if not author_line: - return {'author_list': [], 'back_prop': 0} - - names: List[str] = split_authors(author_line) - if not names: - return {'author_list': [], 'back_prop': 0} - - names = _remove_double_commas(names) - # get rid of commas at back - namesIter: Iterator[str] = reversed( - list(dropwhile(lambda x: x == ',', reversed(names)))) - # get rid of commas at front - names = list(dropwhile(lambda x: x == ',', namesIter)) - - # Extract all names (all parts not starting with comma or paren) - names = list(map(_tidy_name, filter( - lambda x: re.match('^[^](,]', x), names))) - names = list(filter(lambda n: not re.match( - r'^\s*et\.?\s+al\.?\s*', n, flags=re.IGNORECASE), names)) - - (names, author_list, - back_propagate_affiliations_to) = _collaboration_at_start(names) - - (enumaffils) = _enum_collaboration_at_end(author_line) - - # Split name into keyname and firstnames/initials. - # Deal with different patterns in turn: prefixes, suffixes, plain - # and single name. - patterns = [('double-prefix', - r'^(.*)\s+(' + PREFIX_MATCH + r')\s(' + - PREFIX_MATCH + r')\s(\S+)$'), - ('name-prefix-name', - r'^(.*)\s+(' + PREFIX_MATCH + r')\s(\S+)$'), - ('name-name-prefix', - r'^(.*)\s+(\S+)\s(I|II|III|IV|V|Sr|Jr|Sr\.|Jr\.)$'), - ('name-name', - r'^(.*)\s+(\S+)$'), ] - - # Now go through names in turn and try to get affiliations - # to go with them - for name in names: - pattern_matches = ((mtype, re.match(m, name, flags=re.IGNORECASE)) - for (mtype, m) in patterns) - - (mtype, match) = next(((mtype, m) - for (mtype, m) in pattern_matches - if m is not None), ('default', None)) - if match is None: - author_entry = [name, '', ''] - elif mtype == 'double-prefix': - s = '{} {} {}'.format(match.group( - 2), match.group(3), match.group(4)) - author_entry = [s, match.group(1), ''] - elif mtype == 'name-prefix-name': - s = '{} {}'.format(match.group(2), match.group(3)) - author_entry = [s, match.group(1), ''] - elif mtype == 'name-name-prefix': - author_entry = 
[match.group(2), match.group(1), match.group(3)] - elif mtype == 'name-name': - author_entry = [match.group(2), match.group(1), ''] - else: - author_entry = [name, '', ''] - - # search back in author_line for affiliation - author_entry = _add_affiliation( - author_line, enumaffils, author_entry, name) - author_list.append(author_entry) - - return {'author_list': author_list, - 'back_prop': back_propagate_affiliations_to} - - -def parse_author_affil_utf(authors: str) -> List: - """ - Call parse_author_affil() and do TeX to UTF conversion. - - Output structure is the same but should be in UTF and not TeX. - """ - if not authors: - return [] - return list(map(lambda author: list(map(tex2utf, author)), - parse_author_affil(authors))) - - -def _remove_double_commas(items: List[str]) -> List[str]: - - parts: List[str] = [] - last = '' - for pt in items: - if pt == ',' and last == ',': - continue - else: - parts.append(pt) - last = pt - return parts - - -def _tidy_name(name: str) -> str: - name = re.sub(r'\s\s+', ' ', name) # also gets rid of CR - # add space after dot (except in TeX) - name = re.sub(r'(?', name) - return name - - -def _collaboration_at_start(names: List[str]) \ - -> Tuple[List[str], List[List[str]], int]: - """Perform special handling of collaboration at start.""" - author_list = [] - - back_propagate_affiliations_to = 0 - while len(names) > 0: - m = re.search(r'([a-z0-9\s]+\s+(collaboration|group|team))', - names[0], flags=re.IGNORECASE) - if not m: - break - - # Add to author list - author_list.append([m.group(1), '', '']) - back_propagate_affiliations_to += 1 - # Remove from names - names.pop(0) - # Also swallow and following comma or colon - if names and (names[0] == ',' or names[0] == ':'): - names.pop(0) - - return names, author_list, back_propagate_affiliations_to - - -def _enum_collaboration_at_end(author_line: str)->Dict: - """Get separate set of enumerated affiliations from end of author_line.""" - # Now see if we have a separate set of 
enumerated affiliations - # This is indicated by finding '(\s*(' - line_m = re.search(r'\(\s*\((.*)$', author_line) - if not line_m: - return {} - - enumaffils = {} - affils = re.sub(r'\s*\)\s*$', '', line_m.group(1)) - - # Now expect to have '1) affil1 (2) affil2 (3) affil3' - for affil in affils.split('('): - # Now expect `1) affil1 ', discard if no match - m = re.match(r'^(\d+)\)\s*(\S.*\S)\s*$', affil) - if m: - enumaffils[m.group(1)] = re.sub(r'[\.,\s]*$', '', m.group(2)) - - return enumaffils - - -def _add_affiliation(author_line: str, - enumaffils: Dict, - author_entry: List[str], - name: str) -> List: - """ - Add author affiliation to author_entry if one is found in author_line. - - This should deal with these cases - Smith B(labX) Smith B(1) Smith B(1, 2) Smith B(1 & 2) Smith B(1 and 2) - """ - en = re.escape(name) - namerex = r'{}\s*\(([^\(\)]+)'.format(en.replace(' ', 's*')) - m = re.search(namerex, author_line, flags=re.IGNORECASE) - if not m: - return author_entry - - # Now see if we have enumerated references (just commas, digits, &, and) - affils = m.group(1).rstrip().lstrip() - affils = re.sub(r'(&|and)/,', ',', affils, flags=re.IGNORECASE) - - if re.match(r'^[\d,\s]+$', affils): - for affil in affils.split(','): - if affil in enumaffils: - author_entry.append(enumaffils[affil]) - else: - author_entry.append(affils) - - return author_entry - - -def _parse_author_affil_back_propagate(author_list: List[List[str]], - back_prop: int) -> List[List[str]]: - """Back propagate author affiliation. - - Take the author list structure generated by parse_author_affil_split(..) - and propagate affiliation information backwards to preceeding author - entries where none was give. Stop before entry $back_prop to avoid - adding affiliation information to collaboration names. 
- - given, eg: - a.b.first, c.d.second (affil) - implies - a.b.first (affil), c.d.second (affil) - and in more complex cases: - a.b.first, c.d.second (1), e.f.third, g.h.forth (2,3) - implies - a.b.first (1), c.d.second (1), e.f.third (2,3), g.h.forth (2,3) - """ - last_affil: List[str] = [] - for x in range(len(author_list) - 1, max(back_prop - 1, -1), -1): - author_entry = author_list[x] - if len(author_entry) > 3: # author has affiliation,store - last_affil = author_entry - elif last_affil: - # author doesn't have affil but later one did => copy - author_entry.extend(last_affil[3:]) - - return author_list - - -def split_authors(authors: str) -> List: - """ - Split author string into authors entity lists. - - Take an author line as a string and return a reference to a list of the - different name and affiliation blocks. While this does normalize spacing - and 'and', it is a key feature that the set of strings returned can be - concatenated to reproduce the original authors line. This code thus - provides a very graceful degredation for badly formatted authors lines, as - the text at least shows up. 
- """ - # split authors field into blocks with boundaries of ( and ) - if not authors: - return [] - aus = re.split(r'(\(|\))', authors) - aus = list(filter(lambda x: x != '', aus)) - - blocks = [] - if len(aus) == 1: - blocks.append(authors) - else: - c = '' - depth = 0 - for bit in aus: - if bit == '': - continue - if bit == '(': # track open parentheses - depth += 1 - if depth == 1: - blocks.append(c) - c = '(' - else: - c = c + bit - elif bit == ')': # track close parentheses - depth -= 1 - c = c + bit - if depth == 0: - blocks.append(c) - c = '' - else: # haven't closed, so keep accumulating - continue - else: - c = c + bit - if c: - blocks.append(c) - - listx = [] - - for block in blocks: - block = re.sub(r'\s+', ' ', block) - if re.match(r'^\(', block): # it is a comment - listx.append(block) - else: # it is a name - block = re.sub(r',?\s+(and|\&)\s', ',', block) - names = re.split(r'(,|:)\s*', block) - for name in names: - if not name: - continue - name = name.rstrip().lstrip() - if name: - listx.append(name) - - # Recombine suffixes that were separated with a comma - parts: List[str] = [] - for p in listx: - if re.match(r'^(Jr\.?|Sr\.?\[IV]{2,})$', p) \ - and len(parts) >= 2 \ - and parts[-1] == ',' \ - and not re.match(r'\)$', parts[-2]): - separator = parts.pop() - last = parts.pop() - recomb = "{}{} {}".format(last, separator, p) - parts.append(recomb) - else: - parts.append(p) - - return parts diff --git a/browse/services/document/metadata.py b/browse/services/document/metadata.py index e7a2765a6..6477985ec 100644 --- a/browse/services/document/metadata.py +++ b/browse/services/document/metadata.py @@ -107,8 +107,7 @@ def __init__(self, latest_versions_path: str, self.original_versions_path = os.path.realpath(original_versions_path) def get_abs(self, arxiv_id: str) -> DocMetadata: - """ - Get the .abs metadata for the specified arXiv paper identifier. + """Get the .abs metadata for the specified arXiv paper identifier. 
Parameters ---------- @@ -118,7 +117,6 @@ def get_abs(self, arxiv_id: str) -> DocMetadata: Returns ------- :class:`DocMetadata` - """ paper_id = Identifier(arxiv_id=arxiv_id) @@ -156,8 +154,7 @@ def get_abs(self, arxiv_id: str) -> DocMetadata: return combined_version def _next_id(self, identifier: Identifier) -> Optional['Identifier']: - """ - Get next consecutive Identifier relative to the provided Identifier. + """Get next consecutive Identifier relative to the provided Identifier. Parameters ---------- @@ -167,7 +164,6 @@ def _next_id(self, identifier: Identifier) -> Optional['Identifier']: ------- :class:`Identifier` The next Indentifier in sequence - """ next_id = None if identifier.year is not None and \ @@ -234,8 +230,7 @@ def _next_yymm_id(self, identifier: Identifier) -> Optional[Identifier]: return None def get_next_id(self, identifier: Identifier) -> Optional['Identifier']: - """ - Get the next identifier in sequence if it exists in the repository. + """Get the next identifier in sequence if it exists in the repository. Under certain conditions this is called to generate the "next" link in the "browse context" portion of the abs page rendering. @@ -252,7 +247,6 @@ def get_next_id(self, identifier: Identifier) -> Optional['Identifier']: ------- :class:`Identifier` The next identifier in sequence that exists in the repository. - """ next_id = self._next_id(identifier) if not next_id: @@ -275,8 +269,7 @@ def get_next_id(self, identifier: Identifier) -> Optional['Identifier']: return None def _previous_id(self, identifier: Identifier) -> Optional['Identifier']: - """ - Get previous consecutive Identifier relative to provided Identifier. + """Get previous consecutive Identifier relative to provided Identifier. 
Parameters ---------- @@ -286,7 +279,6 @@ def _previous_id(self, identifier: Identifier) -> Optional['Identifier']: ------- :class:`Identifier` The previous Indentifier in sequence - """ previous_id = None if identifier.year is not None and \ @@ -325,8 +317,7 @@ def _previous_id(self, identifier: Identifier) -> Optional['Identifier']: return None def get_previous_id(self, identifier: Identifier) -> Optional[Identifier]: - """ - Get the previous identifier in sequence if it exists in the repository. + """Get previous identifier in sequence if it exists in repository. Under certain conditions this is called to generate the "previous" link in the "browse context" portion of the abs page rendering. @@ -343,7 +334,6 @@ def get_previous_id(self, identifier: Identifier) -> Optional[Identifier]: ------- :class:`Identifier` The previous identifier in sequence that exists in the repository. - """ previous_id = self._previous_id(identifier) if not previous_id: @@ -398,8 +388,7 @@ def get_dissemination_formats(self, docmeta: DocMetadata, format_pref: Optional[str] = None, add_sciencewise: bool = False) -> List[str]: - """ - Get a list of formats that can be disseminated for this DocMetadata. + """Get a list of formats that can be disseminated for this DocMetadata. Several checks are performed to determine available dissemination formats: @@ -424,7 +413,6 @@ def get_dissemination_formats(self, ------- List[str] A list of format strings. 
- """ formats: List[str] = [] @@ -556,17 +544,15 @@ def parse_abs_file(filename: str) -> DocMetadata: if 'categories' in fields and fields['categories']: category_list = fields['categories'].split() - if category_list[0] in taxonomy.definitions.CATEGORIES: - primary_category = Category(id=category_list[0]) + if category_list[0] in taxonomy.CATEGORIES: + primary_category = Category(category_list[0]) primary_archive = \ Archive( - id=taxonomy.definitions.CATEGORIES[primary_category.id]['in_archive']) + taxonomy.CATEGORIES[primary_category.id]['in_archive']) elif arxiv_identifier.is_old_id: - primary_archive = \ - Archive(id=arxiv_identifier.archive) # type: ignore + primary_archive = Archive(arxiv_identifier.archive) elif arxiv_identifier.is_old_id: - primary_archive = \ - Archive(id=arxiv_identifier.archive) # type: ignore + primary_archive = Archive(arxiv_identifier.archive) else: raise AbsException('Cannot infer archive from identifier.') @@ -588,9 +574,9 @@ def parse_abs_file(filename: str) -> DocMetadata: primary_category=primary_category, primary_archive=primary_archive, primary_group=Group( - id=taxonomy.definitions.ARCHIVES[primary_archive.id]['in_group']), + taxonomy.ARCHIVES[primary_archive.id]['in_group']), secondary_categories=[ - Category(id=x) for x in category_list[1:] + Category(x) for x in category_list[1:] if (category_list and len(category_list) > 1) ], journal_ref=None if 'journal_ref' not in fields diff --git a/browse/services/listing/__init__.py b/browse/services/listing/__init__.py new file mode 100644 index 000000000..b815b397e --- /dev/null +++ b/browse/services/listing/__init__.py @@ -0,0 +1,104 @@ +"""Serves lists of articles for categories and time periods. + +Currently (2018-10) getting everything for a listing from the DB is +not possible. There is no table that correctly records the publish +history in the legacy DB. + +The legacy listing files are used only for the IDs of the papers +announced. 
The rest of the metadata is not kept updated. An example of +this causing a problem is if an article published on 2018-01-01, then +crossed on 2018-01-02, then replaced with a differnt title on +2018-01-03. The cross on 2018-01-02 in the listing file will have the +old title. + +Why month granularity? The legacy listing files have only month +granularity for when a paper was announced. In the future there might +be better date granularity for new papers. +""" + +from typing import cast, Optional, Any + +from flask import g + +from browse.domain.listing import NewResponse, ListingResponse, ListingCountResponse + + +class ListingService: + """Class for arXiv document listings.""" + + @classmethod + def version(cls) -> str: + """Version.""" + return "0.2" + + def list_articles_by_year(self, + archiveOrCategory: str, + year: int, + skip: int, + show: int, + if_modified_since: Optional[str] = None) -> ListingResponse: + """Get listing items for a whole year. + + if_modified_since is the if_modified_since header value passed by the web client + It should be in RFC 1123 format. + """ + raise NotImplementedError + + def list_articles_by_month(self, + archiveOrCategory: str, + year: int, + month: int, + skip: int, + show: int, + if_modified_since: Optional[str] = None) -> ListingResponse: + """Get listings for a month. + + if_modified_since is the if_modified_since header value passed by the web client + It should be in RFC 1123 format. + """ + raise NotImplementedError + + def list_new_articles(self, + archiveOrCategory: str, + skip: int, + show: int, + if_modified_since: Optional[str] = None) -> NewResponse: + """Gets listings for the most recent announcement/publish. + + if_modified_since is the if_modified_since header value passed by the web client + It should be in RFC 1123 format. 
+ """ + raise NotImplementedError + + def list_pastweek_articles(self, + archiveOrCategory: str, + skip: int, + show: int, + if_modified_since: Optional[str] = None) -> ListingResponse: + """Gets listings for the 5 most recent announcement/publish. + + if_modified_since is the if_modified_since header value passed by the web client + It should be in RFC 1123 format. + """ + raise NotImplementedError + + def monthly_counts(self, + archive: str, + year: int) -> ListingCountResponse: + """Gets monthly listing counts for the year.""" + raise NotImplementedError + + + +def get_listing_service() -> ListingService: + """Get the listing service. + + There is probably a better way to do this. + """ + if 'listing_service' not in g: + # importing at runtime to avoid cyclic imports that kill python + import importlib + fl = importlib.import_module("browse.services.listing.fake_listings") + g.listing_service = fl.FakeListingFilesService() + + return cast(ListingService, g.listing_service) diff --git a/browse/services/listing/fake_listings.py b/browse/services/listing/fake_listings.py new file mode 100644 index 000000000..bb675ae8c --- /dev/null +++ b/browse/services/listing/fake_listings.py @@ -0,0 +1,304 @@ +"""Example of a listing service for testing.""" + +# pylint: skip-file + +from typing import List, Optional + +from browse.domain.listing import NewResponse, ListingResponse, ListingItem, \ + ListingCountResponse +from browse.services.listing import ListingService +import datetime + + +class FakeListingFilesService(ListingService): + """Listing service used for development and testing purposes. + + This is intended as an example of what the /listing controller + needs for methods from a listing service. + + This just returns examples that should be good enough. This makes + no attempt to return the correct articles for a date or them correct + primarys for articles. 
+ """ + + def monthly_counts(self, archive: str, year:int) -> ListingCountResponse: + """Example of monthly_counts.""" + counts = [ + {'year': year, 'month': 1, 'new': 1234, 'cross': 234}, + {'year': year, 'month': 2, 'new': 1224, 'cross': 134}, + {'year': year, 'month': 3, 'new': 1334, 'cross': 324}, + {'year': year, 'month': 4, 'new': 1534, 'cross': 134}, + {'year': year, 'month': 5, 'new': 1644, 'cross': 234}, + {'year': year, 'month': 6, 'new': 983, 'cross': 314}, + {'year': year, 'month': 7, 'new': 876, 'cross': 132}, + {'year': year, 'month': 8, 'new': 1233, 'cross': 294}, + {'year': year, 'month': 9, 'new': 1453, 'cross': 273}, + {'year': year, 'month': 10, 'new': 1502, 'cross': 120}, + {'year': year, 'month': 11, 'new': 1638, 'cross': 100}, + {'year': year, 'month': 12, 'new': 1601, 'cross': 233}, + ] + return {'month_counts': counts, #type: ignore + 'new_count': sum([mm['new'] for mm in counts]), + 'cross_count': sum([mm['cross'] for mm in counts])} + + def list_new_articles(self, + archiveOrCategory: str, + skip: int, + show: int, + if_modified_since: Optional[str] = None) -> NewResponse: + """Example of list_new_articles.""" + listings = [ + '0704.0526', '0704.0988', '0704.0182', '0704.0310', '0704.0616', '0704.0732', '0704.0042', + '0704.0615', '0704.0568', '0704.0319', '0704.0265', '0704.0133', '0704.0533', '0704.0453', + '0704.0276', '0704.0991', '0704.0740', '0704.0473', '0704.0083', '0704.0278', '0704.0006', + '0704.0735', '0704.0753', '0704.0324', '0704.0600', '0704.0737', '0704.0387', '0704.0659', + '0704.0432', '0704.0408', '0704.0895', '0704.0088', '0704.0719', '0704.0124', '0704.0508'] + + items2 = [{'id': id, 'listingType': 'new', 'primary': 'cs.DB'} + for id in listings] + new_count = len(items2) + + items3 = [ + {'id': '0704.0145', 'listingType': 'cross', 'primary': 'cs.DL'}, + {'id': '0704.0075', 'listingType': 'cross', 'primary': 'cs.GT'}, + {'id': '0704.0333', 'listingType': 'cross', 'primary': 'cs.NA'}, + {'id': '0704.0445', 
'listingType': 'cross', 'primary': 'cs.NE'}, + {'id': '0704.0226', 'listingType': 'cross', 'primary': 'cs.NA'}, + {'id': '0704.0266', 'listingType': 'cross', 'primary': 'cs.GT'}, + {'id': '0704.0368', 'listingType': 'cross', 'primary': 'cs.CV'}, + {'id': '0704.0716', 'listingType': 'cross', 'primary': 'cs.DL'}, + {'id': '0704.0373', 'listingType': 'cross', 'primary': 'cs.DL'}, + {'id': '0704.0378', 'listingType': 'cross', 'primary': 'cs.CV'}, + {'id': '0704.0536', 'listingType': 'cross', 'primary': 'cs.DL'}, + {'id': '0704.0239', 'listingType': 'cross', 'primary': 'cs.DL'}, + {'id': '0704.0209', 'listingType': 'cross', 'primary': 'cs.GT'}, + {'id': '0704.0916', 'listingType': 'cross', 'primary': 'cs.DL'}, + ] + + items4 = [ + {'id': '0704.0091', 'listingType': 'rep'}, { + 'id': '0704.0054', 'listingType': 'rep'}, + {'id': '0704.0225', 'listingType': 'rep'}, { + 'id': '0704.0186', 'listingType': 'rep'}, + {'id': '0704.0847', 'listingType': 'rep'}, { + 'id': '0704.0129', 'listingType': 'rep'}, + {'id': '0704.0257', 'listingType': 'rep'}, {'id': '0704.0481', 'listingType': 'rep'}] + + lstgs: List[ListingItem] = items2 + items3 + items4 # type: ignore + + return {'listings': lstgs[skip:skip + show], + 'announced': datetime.date(2007, 4, 1), + 'submitted': (datetime.date(2007, 3, 30), datetime.date(2007, 4, 1)), + 'new_count': len(items2), + 'cross_count': len(items3), + 'rep_count': len(items4), + 'expires': 'Wed, 21 Oct 2015 07:28:00 GMT' + } + + def list_pastweek_articles(self, + archiveOrCategory: str, + skip: int, + show: int, + if_modified_since: Optional[str] = None) -> ListingResponse: + """Examlpe of list_pastweek_articles.""" + listings = [ + '0704.0526', '0704.0988', '0704.0182', '0704.0310', '0704.0616', '0704.0732', '0704.0042', + '0704.0615', '0704.0568', '0704.0319', '0704.0265', '0704.0133', '0704.0533', '0704.0453', + '0704.0276', '0704.0991', '0704.0740', '0704.0473', '0704.0083', '0704.0278', '0704.0006', + '0704.0735', '0704.0753', '0704.0324', 
'0704.0600', '0704.0737', '0704.0387', '0704.0659', + '0704.0432', '0704.0408', '0704.0895', '0704.0088', '0704.0719', '0704.0124', '0704.0508'] + items2: List[ListingItem] = [ + {'id': id, 'listingType': 'new', 'primary': 'cs.DB'} for id in listings] + + # These dates are faked + daysize = 7 + pd1 = (datetime.date(2007, 4, 2), 0) + pd2 = (datetime.date(2007, 4, 3), daysize - 1) + pd3 = (datetime.date(2007, 4, 4), daysize * 2 - 1) + pd4 = (datetime.date(2007, 4, 5), daysize * 3 - 1) + pd5 = (datetime.date(2007, 4, 6), daysize * 4 - 1) + + return {'listings': items2, + 'pubdates': [pd1, pd2, pd3, pd4, pd5], + 'count': len(listings), + 'expires': 'Wed, 21 Oct 2015 07:28:00 GMT' + } + + def list_articles_by_year(self, + archiveOrCategory: str, + year: int, + skip: int, + show: int, + if_modified_since: Optional[str] = None) -> ListingResponse: + """Examlpe of list_articles_by_year.""" + return self.list_articles_by_month(archiveOrCategory, year, 1, skip, show, if_modified_since) + + def list_articles_by_month(self, + archiveOrCategory: str, + year: int, + month: int, + skip: int, + show: int, + if_modified_since: Optional[str] = None) -> ListingResponse: + """Example of list_articles_by_month.""" + if 'skip' not in vars(): + skip = 0 + if 'show' not in vars(): + show = 25 + + pd = datetime.date(2007, 4, 2) + + items2: List[ListingItem] = [{'id': id, 'listingType': 'new', 'primary': 'cs.DB'} + for id in k_listings[skip:skip + show]] + + return {'listings': items2, + 'pubdates': [(pd, len(k_listings))], + 'count': len(k_listings), + 'expires': 'Wed, 21 Oct 2015 07:28:00 GMT' + } + + +k_listings = [ + '0704.0526', '0704.0988', '0704.0182', '0704.0310', '0704.0616', '0704.0732', '0704.0042', + '0704.0615', '0704.0568', '0704.0319', '0704.0265', '0704.0133', '0704.0533', '0704.0453', + '0704.0276', '0704.0991', '0704.0740', '0704.0473', '0704.0083', '0704.0278', '0704.0006', + '0704.0735', '0704.0753', '0704.0324', '0704.0600', '0704.0737', '0704.0387', '0704.0659', + 
'0704.0432', '0704.0408', '0704.0895', '0704.0088', '0704.0719', '0704.0124', '0704.0508', + '0704.0145', '0704.0075', '0704.0333', '0704.0445', '0704.0226', '0704.0266', '0704.0368', + '0704.0716', '0704.0373', '0704.0378', '0704.0536', '0704.0239', '0704.0209', '0704.0916', + '0704.0091', '0704.0054', '0704.0225', '0704.0186', '0704.0847', '0704.0129', '0704.0257', + '0704.0388', '0704.0481', '0704.0156', '0704.0685', '0704.0694', '0704.0485', '0704.0682', + '0704.0200', '0704.0627', '0704.0722', '0704.0845', '0704.0815', '0704.0362', '0704.0143', + '0704.0381', '0704.0299', '0704.0205', '0704.0914', '0704.0640', '0704.0683', '0704.0238', + '0704.0939', '0704.0582', '0704.0019', '0704.0958', '0704.0150', '0704.0699', '0704.0306', + '0704.0418', '0704.0463', '0704.0002', '0704.0975', '0704.0787', '0704.0597', '0704.0154', + '0704.0178', '0704.0572', '0704.0576', '0704.0757', '0704.0457', '0704.0751', '0704.0414', + '0704.0355', '0704.0349', '0704.0161', '0704.0748', '0704.0980', '0704.0016', '0704.0905', + '0704.0596', '0704.0730', '0704.0543', '0704.0070', '0704.0898', '0704.0273', '0704.0480', + '0704.0810', '0704.0440', '0704.0025', '0704.0361', '0704.0936', '0704.0770', '0704.0612', + '0704.0255', '0704.0109', '0704.0625', '0704.0010', '0704.0015', '0704.0458', '0704.0856', + '0704.0359', '0704.0406', '0704.0462', '0704.0614', '0704.0983', '0704.0142', '0704.0013', + '0704.0578', '0704.0820', '0704.0477', '0704.0583', '0704.0111', '0704.0495', '0704.0282', + '0704.0986', '0704.0586', '0704.0311', '0704.0700', '0704.0638', '0704.0206', '0704.0808', + '0704.0389', '0704.0904', '0704.0105', '0704.0433', '0704.0434', '0704.0943', '0704.0422', + '0704.0180', '0704.0199', '0704.0766', '0704.0077', '0704.0788', '0704.0170', '0704.0603', + '0704.0844', '0704.0995', '0704.0948', '0704.0231', '0704.0103', '0704.0650', '0704.0944', + '0704.0591', '0704.0404', '0704.0858', '0704.0956', '0704.0454', '0704.0676', '0704.0144', + '0704.0190', '0704.0868', '0704.0363', 
'0704.0763', '0704.0026', '0704.0553', '0704.0104', + '0704.0185', '0704.0575', '0704.0017', '0704.0702', '0704.0169', '0704.0632', '0704.0671', + '0704.0177', '0704.0792', '0704.0849', '0704.0079', '0704.0559', '0704.0221', '0704.0896', + '0704.0677', '0704.0938', '0704.0356', '0704.0035', '0704.0585', '0704.0254', '0704.0880', + '0704.0537', '0704.0581', '0704.0646', '0704.0837', '0704.0681', '0704.0809', '0704.0761', + '0704.0642', '0704.0566', '0704.0323', '0704.0891', '0704.0775', '0704.0240', '0704.0475', + '0704.0262', '0704.0271', '0704.0023', '0704.0784', '0704.0528', '0704.0392', '0704.0869', + '0704.0797', '0704.0673', '0704.0078', '0704.0790', '0704.0300', '0704.0569', '0704.0466', + '0704.0555', '0704.0486', '0704.0365', '0704.0413', '0704.0181', '0704.0374', '0704.0018', + '0704.0184', '0704.0758', '0704.0256', '0704.0386', '0704.0217', '0704.0224', '0704.0045', + '0704.0573', '0704.0417', '0704.0663', '0704.0796', '0704.0253', '0704.0401', '0704.0126', + '0704.0014', '0704.0229', '0704.0996', '0704.0046', '0704.0747', '0704.0656', '0704.0653', + '0704.0274', '0704.0806', '0704.0216', '0704.0590', '0704.0309', '0704.0069', '0704.0873', + '0704.0826', '0704.0135', '0704.0438', '0704.0421', '0704.0709', '0704.0455', '0704.0931', + '0704.0328', '0704.0342', '0704.0967', '0704.0976', '0704.0604', '0704.0691', '0704.0020', + '0704.0981', '0704.0482', '0704.0889', '0704.0029', '0704.0689', '0704.0041', '0704.0538', + '0704.0860', '0704.0219', '0704.0963', '0704.0424', '0704.0918', '0704.0260', '0704.0613', + '0704.0106', '0704.0467', '0704.0658', '0704.0701', '0704.0123', '0704.0259', '0704.0153', + '0704.0379', '0704.0203', '0704.0158', '0704.0364', '0704.0520', '0704.0875', '0704.0913', + '0704.0094', '0704.0004', '0704.0977', '0704.0964', '0704.0280', '0704.0383', '0704.0817', + '0704.0426', '0704.0565', '0704.0439', '0704.0084', '0704.0675', '0704.0067', '0704.0704', + '0704.0119', '0704.0781', '0704.0049', '0704.0628', '0704.0074', '0704.0377', 
'0704.0588', + '0704.0196', '0704.0057', '0704.0416', '0704.0804', '0704.0828', '0704.0192', '0704.0649', + '0704.0452', '0704.0402', '0704.0657', '0704.0073', '0704.0972', '0704.0919', '0704.0786', + '0704.0420', '0704.0288', '0704.0080', '0704.0687', '0704.0448', '0704.0664', '0704.0742', + '0704.0236', '0704.0647', '0704.0881', '0704.0354', '0704.0295', '0704.0780', '0704.0052', + '0704.0794', '0704.0771', '0704.0497', '0704.0631', '0704.0863', '0704.0114', '0704.0030', + '0704.0987', '0704.0312', '0704.0971', '0704.0344', '0704.0556', '0704.0012', '0704.0955', + '0704.0782', '0704.0836', '0704.0261', '0704.0430', '0704.0304', '0704.0063', '0704.0696', + '0704.0593', '0704.0965', '0704.0643', '0704.0241', '0704.0928', '0704.0864', '0704.0053', + '0704.0773', '0704.0510', '0704.0879', '0704.0811', '0704.0038', '0704.0444', '0704.0194', + '0704.0731', '0704.0277', '0704.0036', '0704.0511', '0704.0252', '0704.0922', '0704.0530', + '0704.0945', '0704.0302', '0704.0580', '0704.0293', '0704.0227', '0704.0925', '0704.0842', + '0704.0768', '0704.0545', '0704.0097', '0704.0008', '0704.0570', '0704.0450', '0704.0007', + '0704.0684', '0704.0764', '0704.0370', '0704.0686', '0704.0552', '0704.0281', '0704.0279', + '0704.0746', '0704.0071', '0704.0831', '0704.0358', '0704.0637', '0704.0179', '0704.0957', + '0704.0244', '0704.0654', '0704.0841', '0704.0912', '0704.0315', '0704.0031', '0704.0729', + '0704.0391', '0704.0660', '0704.0117', '0704.0403', '0704.0202', '0704.0644', '0704.0819', + '0704.0855', '0704.0212', '0704.0474', '0704.0335', '0704.0110', '0704.0698', '0704.0610', + '0704.0220', '0704.0139', '0704.0973', '0704.0491', '0704.0534', '0704.0215', '0704.0321', + '0704.0409', '0704.0375', '0704.0267', '0704.0040', '0704.0648', '0704.0755', '0704.0507', + '0704.0883', '0704.0316', '0704.0608', '0704.0172', '0704.0776', '0704.0814', '0704.0527', + '0704.0242', '0704.0900', '0704.0039', '0704.0668', '0704.0589', '0704.0470', '0704.0283', + '0704.0483', '0704.0560', 
'0704.0284', '0704.0669', '0704.0033', '0704.0056', '0704.0713', + '0704.0307', '0704.0059', '0704.0412', '0704.0005', '0704.0937', '0704.0727', '0704.0690', + '0704.0577', '0704.0926', '0704.0887', '0704.0750', '0704.0871', '0704.0332', '0704.0222', + '0704.0385', '0704.0714', '0704.0992', '0704.0550', '0704.0602', '0704.0622', '0704.0756', + '0704.0861', '0704.0923', '0704.0840', '0704.0917', '0704.0061', '0704.0308', '0704.0390', + '0704.0852', '0704.0920', '0704.0011', '0704.0929', '0704.0739', '0704.0982', '0704.0606', + '0704.0952', '0704.0759', '0704.0134', '0704.0651', '0704.0818', '0704.0609', '0704.0272', + '0704.0120', '0704.0380', '0704.0197', '0704.0116', '0704.0962', '0704.0985', '0704.0515', + '0704.0546', '0704.0141', '0704.0058', '0704.0850', '0704.0665', '0704.0187', '0704.0151', + '0704.0661', '0704.0419', '0704.0851', '0704.0211', '0704.0953', '0704.0679', '0704.0720', + '0704.0998', '0704.0168', '0704.0296', '0704.0807', '0704.0264', '0704.0670', '0704.0127', + '0704.0825', '0704.0621', '0704.0791', '0704.0532', '0704.0496', '0704.0003', '0704.0247', + '0704.0708', '0704.0492', '0704.0574', '0704.0927', '0704.0778', '0704.0951', '0704.0128', + '0704.0372', '0704.0183', '0704.0949', '0704.0629', '0704.0915', '0704.0446', '0704.0544', + '0704.0269', '0704.0132', '0704.0218', '0704.0443', '0704.0564', '0704.0662', '0704.0490', + '0704.0326', '0704.0619', '0704.0394', '0704.0384', '0704.1001', '0704.0189', '0704.0032', + '0704.0246', '0704.0540', '0704.0099', '0704.0942', '0704.0314', '0704.0369', '0704.0460', + '0704.0655', '0704.0725', '0704.0353', '0704.0113', '0704.0890', '0704.0407', '0704.0340', + '0704.0594', '0704.0521', '0704.0947', '0704.0346', '0704.0371', '0704.0557', '0704.0865', + '0704.0138', '0704.0752', '0704.0697', '0704.0802', '0704.0095', '0704.0774', '0704.0037', + '0704.0405', '0704.0493', '0704.0399', '0704.0347', '0704.0605', '0704.0548', '0704.0624', + '0704.0089', '0704.0859', '0704.0712', '0704.0469', '0704.0044', 
'0704.0710', '0704.0395', + '0704.0498', '0704.0155', '0704.0693', '0704.0870', '0704.0195', '0704.0535', '0704.0234', + '0704.0513', '0704.0489', '0704.0055', '0704.0705', '0704.0472', '0704.0558', '0704.0567', + '0704.0909', '0704.0680', '0704.0946', '0704.0478', '0704.0779', '0704.0598', '0704.0886', + '0704.0519', '0704.0718', '0704.0494', '0704.0337', '0704.0345', '0704.0907', '0704.0098', + '0704.0503', '0704.0523', '0704.0249', '0704.0518', '0704.0131', '0704.0331', '0704.0882', + '0704.0862', '0704.0514', '0704.0248', '0704.0471', '0704.0799', '0704.0086', '0704.0911', + '0704.0932', '0704.0082', '0704.0745', '0704.0641', '0704.0157', '0704.0022', '0704.0504', + '0704.0198', '0704.0667', '0704.0562', '0704.0051', '0704.0213', '0704.0997', '0704.0695', + '0704.0721', '0704.0028', '0704.0979', '0704.0666', '0704.0715', '0704.0984', '0704.0592', + '0704.0313', '0704.0437', '0704.0093', '0704.0579', '0704.0343', '0704.0298', '0704.0250', + '0704.0848', '0704.0336', '0704.0885', '0704.0674', '0704.0672', '0704.0783', '0704.0549', + '0704.0633', '0704.0148', '0704.0969', '0704.0382', '0704.0339', '0704.0839', '0704.0176', + '0704.0894', '0704.0853', '0704.0048', '0704.0744', '0704.0954', '0704.0149', '0704.0101', + '0704.0163', '0704.0587', '0704.0941', '0704.0906', '0704.0723', '0704.0191', '0704.0411', + '0704.0174', '0704.0130', '0704.0329', '0704.0884', '0704.0152', '0704.0902', '0704.0769', + '0704.0160', '0704.0034', '0704.0484', '0704.0289', '0704.0122', '0704.0193', '0704.0888', + '0704.0171', '0704.0072', '0704.0789', '0704.0733', '0704.0096', '0704.0398', '0704.0827', + '0704.0275', '0704.0367', '0704.0500', '0704.0501', '0704.0066', '0704.0159', '0704.0531', + '0704.0341', '0704.0994', '0704.0136', '0704.0711', '0704.0024', '0704.0214', '0704.0966', + '0704.0021', '0704.0634', '0704.0924', '0704.0062', '0704.0232', '0704.0813', '0704.0400', + '0704.0270', '0704.0125', '0704.0547', '0704.0188', '0704.0060', '0704.0449', '0704.0167', + '0704.0734', 
'0704.0173', '0704.0352', '0704.0459', '0704.0903', '0704.0854', '0704.0652', + '0704.0601', '0704.0736', '0704.0561', '0704.0237', '0704.0468', '0704.0974', '0704.0506', + '0704.0857', '0704.1000', '0704.0618', '0704.0833', '0704.0290', '0704.0112', '0704.0935', + '0704.0910', '0704.0465', '0704.0940', '0704.0607', '0704.0243', '0704.0425', '0704.0047', + '0704.0517', '0704.0617', '0704.0305', '0704.0824', '0704.0741', '0704.0393', '0704.0798', + '0704.0085', '0704.0423', '0704.0456', '0704.0717', '0704.0263', '0704.0286', '0704.0505', + '0704.0001', '0704.0800', '0704.0726', '0704.0950', '0704.0027', '0704.0738', '0704.0464', + '0704.0303', '0704.0442', '0704.0933', '0704.0436', '0704.0121', '0704.0108', '0704.0294', + '0704.0223', '0704.0164', '0704.0846', '0704.0777', '0704.0301', '0704.0350', '0704.0563', + '0704.0930', '0704.0892', '0704.0692', '0704.0921', '0704.0816', '0704.0959', '0704.0623', + '0704.0050', '0704.0812', '0704.0678', '0704.0645', '0704.0318', '0704.0843', '0704.0410', + '0704.0571', '0704.0076', '0704.0897', '0704.0635', '0704.0043', '0704.0429', '0704.0706', + '0704.0901', '0704.0512', '0704.0765', '0704.0877', '0704.0584', '0704.0251', '0704.0829', + '0704.0990', '0704.0626', '0704.0525', '0704.0934', '0704.0830', '0704.0081', '0704.0874', + '0704.0823', '0704.0838', '0704.0524', '0704.0551', '0704.0834', '0704.0065', '0704.0805', + '0704.0728', '0704.0993', '0704.0207', '0704.0754', '0704.0760', '0704.0487', '0704.0291', + '0704.0140', '0704.0989', '0704.0832', '0704.0447', '0704.0327', '0704.0961', '0704.0529', + '0704.0878', '0704.0599', '0704.0115', '0704.0516', '0704.0201', '0704.0322', '0704.0165', + '0704.0688', '0704.0068', '0704.0268', '0704.0821', '0704.0908', '0704.0724', '0704.0999', + '0704.0228', '0704.0090', '0704.0107', '0704.0866', '0704.0803', '0704.0100', '0704.0461', + '0704.0509', '0704.0795', '0704.0801', '0704.0366', '0704.0317', '0704.0541', '0704.0630', + '0704.0539', '0704.0338', '0704.0175', '0704.0703', 
'0704.0488', '0704.0210', '0704.0707', + '0704.0431', '0704.0639', '0704.0785', '0704.0441', '0704.0204', '0704.0330', '0704.0357', + '0704.0334', '0704.0292', '0704.0064', '0704.0960', '0704.0762', '0704.0245', '0704.0872', + '0704.0970', '0704.0087', '0704.0867', '0704.0893', '0704.0620', '0704.0092', '0704.0351', + '0704.0502', '0704.0235', '0704.0978', '0704.0749', '0704.0396', '0704.0348', '0704.0772', + '0704.0137', '0704.0822', '0704.0258', '0704.0435', '0704.0233', '0704.0611', '0704.0499', + '0704.0451', '0704.0360', '0704.0636', '0704.0285', '0704.0554', '0704.0899', '0704.0147', + '0704.0427', '0704.0208', '0704.0476', '0704.0479', '0704.0230', '0704.0166', '0704.0118', + '0704.0595', '0704.0009', '0704.0743', '0704.0102', '0704.0835', '0704.0320', '0704.0162', + '0704.0428', '0704.0146', '0704.0876', '0704.0325', '0704.0542', '0704.0376', '0704.0297', + '0704.0397', '0704.0968', '0704.0415', '0704.0793', '0704.0287', '0704.0767', '0704.0522'] diff --git a/browse/services/search/search_authors.py b/browse/services/search/search_authors.py index 175a2538b..da2b60b74 100644 --- a/browse/services/search/search_authors.py +++ b/browse/services/search/search_authors.py @@ -2,8 +2,8 @@ import re from typing import List, Tuple, Union -from browse.services.util.tex2utf import tex2utf -from browse.services.document.author_affil import split_authors, PREFIX_MATCH +from arxiv.util.tex2utf import tex2utf +from arxiv.util.authors import split_authors, PREFIX_MATCH AuthorList = List[Union[str, Tuple[str, str]]] @@ -13,29 +13,29 @@ """ -def is_affiliation(item: str)-> bool: +def is_affiliation(item: str) -> bool: """Return true if a string contains an affiliation.""" return item.startswith('(') -def is_short(item: str)-> bool: +def is_short(item: str) -> bool: """Return true if the length of string is less than 4 characters long.""" return len(item) < 4 -def is_etal(item: str)-> bool: +def is_etal(item: str) -> bool: """Return true if the string contains et al.""" 
return re.match(r'et\.? al\.?$', item) is not None -def is_divider(item: str)-> bool: +def is_divider(item: str) -> bool: """Return true if the string contains a divider character.""" return re.match(r'^(,|:)', item) is not None def split_long_author_list( authors: AuthorList, size: int) -> Tuple[AuthorList, AuthorList, int]: - """Returns two lists, first is of size, second is the remaining authors. + """Return two lists: first is of size, second is the remaining authors. The author list has strings which are not part of the author names, but commas between them to preserve the formatting that the @@ -43,7 +43,6 @@ def split_long_author_list( This function is used to split the list base on name count, not just list element count. - """ front = [] back = [] @@ -62,8 +61,7 @@ def split_long_author_list( def queries_for_authors(authors: str) -> AuthorList: - """ - Make search service query strings for authors. + """Make search service query strings for authors. The main challenge here is that the HTML output of this should match as closely as possible the string input by the submitter. @@ -76,14 +74,15 @@ def queries_for_authors(authors: str) -> AuthorList: a colon. If a list item is a tuple, author_search_query_str will be something like - "Webb J E" which can be used to query the search service. + "Webb J E" which can be used to query the search service. name_text will be the text to put in side the tag. Such as "James E. Webb," + DO resolve tex to UTF8 in both the link and text. 
DON'T URL_encode, do that in template DON'T do entities, do that in template - DON'T escape utf8 for HTML, just return utf8 + DON'T escape utf8 for HTML, just return utf8 """ out: AuthorList = [] @@ -92,7 +91,7 @@ def queries_for_authors(authors: str) -> AuthorList: if is_divider(item): out.append(item + ' ') elif is_affiliation(item): - out.append(' ' + item ) + out.append(' ' + tex2utf(item)) elif is_short(item) or is_etal(item): out.append(item) else: diff --git a/browse/services/util/formats.py b/browse/services/util/formats.py index 989058802..c57009f4f 100644 --- a/browse/services/util/formats.py +++ b/browse/services/util/formats.py @@ -38,9 +38,8 @@ def formats_from_source_file_name(source_file_path: str) -> List[str]: def formats_from_source_type(source_type: str, format_pref: Optional[str] = None, cache_flag: bool = False, - add_sciencewise:bool = False) -> List[str]: - """ - Get the dissemination formats based on source type and format preference. + add_sciencewise: bool = False) -> List[str]: + """Get the dissemination formats based on source type and preference. Source file types are represented by single-character codes: I - ignore diff --git a/browse/services/util/metatags.py b/browse/services/util/metatags.py index 96b9b7325..fc7bb9b56 100644 --- a/browse/services/util/metatags.py +++ b/browse/services/util/metatags.py @@ -7,13 +7,12 @@ from flask import url_for -from browse.services.document.author_affil import parse_author_affil_utf +from arxiv.util.authors import parse_author_affil_utf from browse.domain.metadata import DocMetadata def meta_tag_metadata(metadata: DocMetadata) -> List: - """ - Return data for HTML tags as used by Google Scholar. + """Return data for HTML tags as used by Google Scholar. http://scholar.google.com/intl/en/scholar/inclusion.html. 
diff --git a/browse/services/util/response_headers.py b/browse/services/util/response_headers.py index 2593477a0..c6f0da8e8 100644 --- a/browse/services/util/response_headers.py +++ b/browse/services/util/response_headers.py @@ -14,8 +14,7 @@ def guess_next_update_utc(dt: datetime = datetime.now(timezone.utc)) \ -> Tuple[datetime, bool]: - """ - Make a sensible guess at earliest possible datetime of next update. + """Make a sensible guess at earliest possible datetime of next update. Guess is based on provided datetime. @@ -32,7 +31,6 @@ def guess_next_update_utc(dt: datetime = datetime.now(timezone.utc)) \ whether the provided dt is likely to coincide with a publish process, which is the APPROX_PUBLISH_DURATION window starting 20:00 on the normal publish days specified by PUBLISH_ISO_WEEKDAYS. - """ config = get_application_config() tz = gettz(config.get('ARXIV_BUSINESS_TZ', 'US/Eastern')) diff --git a/browse/services/util/tex2utf.py b/browse/services/util/tex2utf.py deleted file mode 100644 index 94475f4af..000000000 --- a/browse/services/util/tex2utf.py +++ /dev/null @@ -1,191 +0,0 @@ -"""Convert between TeX escapes and UTF8.""" -import re -from typing import Pattern, Dict, Match - -# Hash to lookup tex markup and convert to Unicode -# -# macron is line above character (overbar \= in Tex) -# caron is v-shape above (\v{ } in Tex) -# See: http://www.unicode.org/charts/ - -accents = { - # first accents with non-letter prefix, e.g. 
\'A - "'A": 0x00c1, "'C": 0x0106, "'E": 0x00c9, "'I": 0x00cd, - "'L": 0x0139, "'N": 0x0143, "'O": 0x00d3, "'R": 0x0154, - "'S": 0x015a, "'U": 0x00da, "'Y": 0x00dd, "'Z": 0x0179, - "'a": 0x00e1, "'c": 0x0107, "'e": 0x00e9, "'i": 0x00ed, - "'l": 0x013a, "'n": 0x0144, "'o": 0x00f3, "'r": 0x0155, - "'s": 0x015b, "'u": 0x00fa, "'y": 0x00fd, "'z": 0x017a, - '"A': 0x00c4, '"E': 0x00cb, '"I': 0x00cf, '"O': 0x00d6, - '"U': 0x00dc, '"Y': 0x0178, '"a': 0x00e4, '"e': 0x00eb, - '"i': 0x00ef, '"o': 0x00f6, '"u': 0x00fc, '"y': 0x00ff, - '.A': 0x0226, '.C': 0x010a, '.E': 0x0116, '.G': 0x0120, - '.I': 0x0130, '.O': 0x022e, '.Z': 0x017b, '.a': 0x0227, - '.c': 0x010b, '.e': 0x0117, '.g': 0x0121, '.o': 0x022f, - '.z': 0x017c, '=A': 0x0100, '=E': 0x0112, '=I': 0x012a, - '=O': 0x014c, '=U': 0x016a, '=Y': 0x0232, '=a': 0x0101, - '=e': 0x0113, '=i': 0x012b, '=o': 0x014d, '=u': 0x016b, - '=y': 0x0233, '^A': 0x00c2, '^C': 0x0108, '^E': 0x00ca, - '^G': 0x011c, '^H': 0x0124, '^I': 0x00ce, '^J': 0x0134, - '^O': 0x00d4, '^S': 0x015c, '^U': 0x00db, '^W': 0x0174, - '^Y': 0x0176, '^a': 0x00e2, '^c': 0x0109, '^e': 0x00ea, - '^g': 0x011d, '^h': 0x0125, '^i': 0x00ee, '^j': 0x0135, - '^o': 0x00f4, '^s': 0x015d, '^u': 0x00fb, '^w': 0x0175, - '^y': 0x0177, '`A': 0x00c0, '`E': 0x00c8, '`I': 0x00cc, - '`O': 0x00d2, '`U': 0x00d9, '`a': 0x00e0, '`e': 0x00e8, - '`i': 0x00ec, '`o': 0x00f2, '`u': 0x00f9, '~A': 0x00c3, - '~I': 0x0128, '~N': 0x00d1, '~O': 0x00d5, '~U': 0x0168, - '~a': 0x00e3, '~i': 0x0129, '~n': 0x00f1, '~o': 0x00f5, - '~u': 0x0169, - # and now ones with letter prefix \c{c} etc.. 
- 'HO': 0x0150, 'HU': 0x0170, 'Ho': 0x0151, 'Hu': 0x0171, - 'cC': 0x00c7, 'cE': 0x0228, - 'cG': 0x0122, 'cK': 0x0136, 'cL': 0x013b, 'cN': 0x0145, - 'cR': 0x0156, 'cS': 0x015e, 'cT': 0x0162, 'cc': 0x00e7, - 'ce': 0x0229, 'cg': 0x0123, 'ck': 0x0137, 'cl': 0x013c, - # Commented out due ARXIVDEV-2322 (bug reported by PG) - # 'ci' : 'i\x{0327}' = chr(0x69).ch(0x327) # i with combining cedilla - 'cn': 0x0146, 'cr': 0x0157, 'cs': 0x015f, 'ct': 0x0163, - 'kA': 0x0104, 'kE': 0x0118, 'kI': 0x012e, 'kO': 0x01ea, - 'kU': 0x0172, 'ka': 0x0105, 'ke': 0x0119, 'ki': 0x012f, - 'ko': 0x01eb, 'ku': 0x0173, 'rA': 0x00c5, 'rU': 0x016e, - 'ra': 0x00e5, 'ru': 0x016f, 'uA': 0x0102, 'uE': 0x0114, - 'uG': 0x011e, 'uI': 0x012c, 'uO': 0x014e, 'uU': 0x016c, - 'ua': 0x0103, 'ue': 0x0115, 'ug': 0x011f, - 'ui': 0x012d, 'uo': 0x014f, 'uu': 0x016d, - 'vA': 0x01cd, 'vC': 0x010c, 'vD': 0x010e, - 'vE': 0x011a, 'vG': 0x01e6, 'vH': 0x021e, 'vI': 0x01cf, - 'vK': 0x01e8, 'vL': 0x013d, 'vN': 0x0147, 'vO': 0x01d1, - 'vR': 0x0158, 'vS': 0x0160, 'vT': 0x0164, 'vU': 0x01d3, - 'vZ': 0x017d, 'va': 0x01ce, 'vc': 0x010d, 'vd': 0x010f, - 've': 0x011b, 'vg': 0x01e7, 'vh': 0x021f, 'vi': 0x01d0, - 'vk': 0x01e9, 'vl': 0x013e, 'vn': 0x0148, 'vo': 0x01d2, - 'vr': 0x0159, 'vs': 0x0161, 'vt': 0x0165, 'vu': 0x01d4, - 'vz': 0x017e -} - -textlet = { - 'AA': 0x00c5, 'AE': 0x00c6, 'DH': 0x00d0, 'DJ': 0x0110, - 'ETH': 0x00d0, 'L': 0x0141, 'NG': 0x014a, 'O': 0x00d8, - 'oe': 0x0153, 'OE': 0x0152, 'TH': 0x00de, 'aa': 0x00e5, - 'ae': 0x00e6, - 'dh': 0x00f0, 'dj': 0x0111, 'eth': 0x00f0, 'i': 0x0131, - 'l': 0x0142, 'ng': 0x014b, 'o': 0x00f8, 'ss': 0x00df, - 'th': 0x00fe, - # Greek (upper) - 'Gamma': 0x0393, 'Delta': 0x0394, 'Theta': 0x0398, - 'Lambda': 0x039b, 'Xi': 0x039E, 'Pi': 0x03a0, - 'Sigma': 0x03a3, 'Upsilon': 0x03a5, 'Phi': 0x03a6, - 'Psi': 0x03a8, 'Omega': 0x03a9, - # Greek (lower) - 'alpha': 0x03b1, 'beta': 0x03b2, 'gamma': 0x03b3, - 'delta': 0x03b4, 'epsilon': 0x03b5, 'zeta': 0x03b6, - 'eta': 0x03b7, 'theta': 0x03b8, 
'iota': 0x03b9, - 'kappa': 0x03ba, 'lambda': 0x03bb, 'mu': 0x03bc, - 'nu': 0x03bd, 'xi': 0x03be, 'omicron': 0x03bf, - 'pi': 0x03c0, 'rho': 0x03c1, 'varsigma': 0x03c2, - 'sigma': 0x03c3, 'tau': 0x03c4, 'upsion': 0x03c5, - 'varphi': 0x03C6, # φ - 'phi': 0x03D5, # ϕ - 'chi': 0x03c7, 'psi': 0x03c8, 'omega': 0x03c9, -} - - -def _p_to_match(tex_to_chr: Dict[str, int]) -> Pattern: - # textsym and textlet both use the same sort of regex pattern. - keys = r'\\(' + '|'.join(tex_to_chr.keys()) + ')' - pstr = r'({)?' + keys + r'(\b|(?=_))(?(1)}|(\\(?= )| |{}|)?)' - return re.compile(pstr) - - -textlet_pattern = _p_to_match(textlet) - -textsym = { - 'P': 0x00b6, 'S': 0x00a7, 'copyright': 0x00a9, - 'guillemotleft': 0x00ab, 'guillemotright': 0x00bb, - 'pounds': 0x00a3, 'dag': 0x2020, 'ddag': 0x2021, - 'div': 0x00f7, 'deg': 0x00b0} - -textsym_pattern = _p_to_match(textsym) - - -def _textlet_sub(match: Match) -> str: - return chr(textlet[match.group(2)]) - - -def _textsym_sub(match: Match) -> str: - return chr(textsym[match.group(2)]) - - -def texch2UTF(acc: str) -> str: - """ - Convert single character TeX accents to UTF-8. - - Strip non-whitepsace characters from any sequence not recognized (hence - could return an empty string if there are no word characters in the input - string). - - chr(num) will automatically create a UTF8 string for big num - """ - if acc in accents: - return chr(accents[acc]) - else: - return re.sub(r'[^\w]+', '', acc, flags=re.IGNORECASE) - - -# my ($acc)=@_; -# #warn "acc = $acc\n"; -# return(chr($accents{$acc})) if (defined($accents{$acc})); -# #warn "Unknown accent '$acc'\n"; -# $acc=~s/[^\w]+//ig; -# return($acc); -# } -# - -def tex2utf(tex: str, letters: bool=True) -> str: - """Convert some TeX accents and greek symbols to UTF-8 characters. - - :param tex: Text to filter. - - :param letters: If False, do not convert greek letters or - ligatures. Greek symbols can cause problems. Ex. \phi is not - suppose to look like φ. φ looks like \varphi. 
See ARXIVNG-1612 - - :returns: string, possibly with some TeX replaced with UTF8 - - """ - # Do dotless i,j -> plain i,j where they are part of an accented i or j - utf = re.sub(r"/(\\['`\^\"\~\=\.uvH])\{\\([ij])\}", r"\g<1>\{\g<2>\}", tex) - - # Now work on the Tex sequences, first those with letters only match - if letters: - utf = textlet_pattern.sub(_textlet_sub, utf) - - utf = textsym_pattern.sub(_textsym_sub, utf) - - utf = re.sub(r'\{\\j\}|\\j\s', 'j', utf) # not in Unicode? - - # reduce {{x}}, {{{x}}}, ... down to {x} - while re.search(r'\{\{([^\}]*)\}\}', utf): - utf = re.sub(r'\{\{([^\}]*)\}\}', r'{\g<1>}', utf) - - # Accents which have a non-letter prefix in TeX, first \'e - utf = re.sub(r'\\([\'`^"~=.][a-zA-Z])', - lambda m: texch2UTF(m.group(1)), utf) - - # then \'{e} form: - utf = re.sub(r'\\([\'`^"~=.])\{([a-zA-Z])\}', - lambda m: texch2UTF(m.group(1) + m.group(2)), utf) - - # Accents which have a letter prefix in TeX - # \u{x} u above (breve), \v{x} v above (caron), \H{x} double accute... 
- utf = re.sub(r'\\([Hckoruv])\{([a-zA-Z])\}', - lambda m: texch2UTF(m.group(1) + m.group(2)), utf) - - # Don't do \t{oo} yet, - utf = re.sub(r'\\t{([^\}])\}', r'\g<1>', utf) - - # bdc34: commented out in original Perl - # $utf =~ s/\{(.)\}/$1/g; # remove { } from around {x} - - return utf diff --git a/browse/templates/400.html b/browse/templates/400.html index a0898da38..d1ddf6b5d 100644 --- a/browse/templates/400.html +++ b/browse/templates/400.html @@ -1,5 +1,6 @@ {% extends "base.html" %} {%- block title -%}{{ error.description }}{%- endblock -%} +{% block login_link %}{% endblock %} {% block head %} {{ super() }} diff --git a/browse/templates/abs/abs.html b/browse/templates/abs/abs.html index d1ef5ba7c..8fa5425cf 100644 --- a/browse/templates/abs/abs.html +++ b/browse/templates/abs/abs.html @@ -1,6 +1,8 @@ {%- extends "base.html" -%} +{% from 'abs/author_links.html' import display_authors_with_links %} +{% import 'base/macros.html' as base_macros %} - {% block title %}[{{ requested_id }}] {{ abs_meta.title }}{% endblock %} + {% block title %}[{{ requested_id }}] {{ abs_meta.title|tex2utf }}{% endblock %} {% block head %} {{ super() -}} @@ -8,6 +10,7 @@ {%- endif %} + {%- include "feedback_collector_js.html" -%} {{- generate_scholar_tags() }} {{- generate_social_media_tags() }} {% endblock head %} @@ -16,9 +19,9 @@ {% block header_class %}{% endblock %} {% block header %} -

arXiv.org > {{ abs_meta.primary_archive.id }} > arXiv:{{ requested_id }}

+

{{ config['BROWSE_SITE_LABEL'] }} > {{ abs_meta.primary_archive.id }} > arXiv:{{ requested_id }}

@@ -57,75 +60,30 @@

arXiv.org >

{% if abs_meta.primary_archive.id != abs_meta.primary_category.id %}{{ abs_meta.primary_archive.name }} > {% endif %}{{ abs_meta.primary_category.name }}

-

Title:{{ abs_meta.title|tex_to_utf|arxiv_id_urls }}

-
Authors: - {%- include "abs/author_links.html" %} -
- - - {# Below must equlvelent to what is done in the submission preview so the abstract appears as the author saw it in the preview. #} -
Abstract:{{ abs_meta.abstract|tex_to_utf_no_symb|arxiv_urlize|line_feed_to_br }}
- -
- - {%- if abs_meta.comments %} - - - - - {% endif -%} - - - - - {%- if abs_meta.msc_class %} - - - - - {% endif -%} - {%- if abs_meta.acm_class %} - - - - - {% endif -%} - {%- if abs_meta.journal_ref %} - - - - - {% endif -%} - {%- if abs_meta.doi %} - - - - - {% endif -%} - {%- if abs_meta.report_num %} - - - - - {% endif -%} - - - - - - - - -
Comments:{{ abs_meta.comments|arxiv_urlize }}
Subjects:{{ category_line() }}
MSC classes:{{ abs_meta.msc_class }}
ACM classes:{{ abs_meta.acm_class }}
Journal reference:{{ abs_meta.journal_ref }}
DOI:{{ abs_meta.doi|single_doi_url }}
Report number:{{ abs_meta.report_num }}
Cite as:{{ version_atag( abs_meta.arxiv_identifier.id, 0) }}
 (or - {{version_atag(abs_meta.arxiv_identifier.id, abs_meta.version)}} for this version) -
-
+ {{ base_macros.abs( + abs_meta.arxiv_identifier.id, + abs_meta.title, + display_authors_with_links(abs_meta, author_links), + abs_meta.abstract, + abs_meta.get_datetime_of_version(abs_meta.version), + abs_meta.primary_category.id, + comments = abs_meta.comments, + msc_class = abs_meta.msc_class, + acm_class = abs_meta.acm_class, + journal_ref = abs_meta.journal_ref, + doi = abs_meta.doi, + report_num = abs_meta.report_num, + version = abs_meta.version, + submission_history = abs_meta.version_history, + secondary_categories = abs_meta.get_secondaries(), + include_stylesheet = 0) }} +
-

Submission history

From: {{ abs_meta.submitter.name|tex_to_utf if abs_meta.submitter.name != None }} [
view email] +

Submission history

From: {{ abs_meta.submitter.name|tex2utf if abs_meta.submitter.name != None }} [view email] {#- Extra message for proxy sites (i.e. Proxy line has username and id) -#} {#- TODO: revisit this. Logic for display follows classic but is strange; sometimes a proxy is just a person's name. -#} - {% if abs_meta.proxy != None and abs_meta.proxy|wordcount > 1 %} [via {{ abs_meta.proxy.split()[0]|tex_to_utf|upper }} proxy]{% endif %} + {% if abs_meta.proxy != None and abs_meta.proxy|wordcount > 1 %} [via {{ abs_meta.proxy.split()[0]|tex2utf|upper }} proxy]{% endif %}
{%- for version_entry in abs_meta.version_history -%} {{ generate_version_entry(version_entry, abs_meta.version) }} @@ -133,14 +91,15 @@

Submission history

From: {{ abs_meta.submitter.name|tex_to_utf if abs_m
- - - {#- The following supports the arXiv Labs Bibliographic Explorer project: https://labs.arxiv.org/projects/bibexplorer -#} - {% if config['LABS_BIBEXPLORER_ENABLED'] %} - - {% endif %} +
Which authors of this paper are endorsers? | Disable MathJax (What is MathJax?) + {% include "feedback_collector.html" %} +
+ + {#- The following supports the arXiv Labs Bibliographic Explorer project: https://labs.arxiv.org/projects/bibexplorer -#} + {% if config['LABS_BIBEXPLORER_ENABLED'] %} + + {% endif %} + {% endblock content %} @@ -177,16 +136,16 @@

Submission history

From: {{ abs_meta.submitter.name|tex_to_utf if abs_m {%- macro generate_social_media_tags() -%} - - + + - + - + {%- endmacro -%} {%- macro category_line() -%} - {{- abs_meta.primary_category.unalias().display_str() -}} + {{- abs_meta.primary_category.unalias().display -}} {%- if abs_meta.secondary_categories -%} {%- for category in abs_meta.display_secondaries() -%} ; {{ category }} diff --git a/browse/templates/abs/author_links.html b/browse/templates/abs/author_links.html index 0647ca70a..a4aaef15c 100644 --- a/browse/templates/abs/author_links.html +++ b/browse/templates/abs/author_links.html @@ -1,18 +1,20 @@ -{%- macro author_list( authors ) %} +{%- macro author_list(abs_meta, authors) %} {%- for part in authors -%} {%- if part is string -%} {{- part|arxiv_urlize -}} {%- else -%} - {{part[0]}} + {{ part[0] }} {%- endif -%} {% endfor %} {%- endmacro -%} -{{- author_list( author_links[0] ) }} +{%- macro display_authors_with_links(abs_meta, author_links) %} + +{{- author_list(abs_meta, author_links[0]) }} {%- if author_links[1] %} {% if author_links[2] == 1 %} - {{author_list(author_links[1])}} + {{ author_list(author_links[1]) }} {% else %} @@ -57,3 +59,5 @@ {%- endif -%} {% endif -%} + +{%- endmacro -%} diff --git a/browse/templates/archive/archive_list_all.html b/browse/templates/archive/archive_list_all.html new file mode 100644 index 000000000..5d04f44ca --- /dev/null +++ b/browse/templates/archive/archive_list_all.html @@ -0,0 +1,40 @@ +{%- extends "base.html" -%} + +{% block title %}Archives{% endblock %} + +{% block header_h1 %} +

{{ config['BROWSE_SITE_LABEL'] }} > archives

+{% endblock %} + + +{%- block content %} +

Archives

+ + {% if bad_archive is not none %} +

No archive '{{bad_archive}}' -- available archives are:

+ {% endif %} + +

Choose one of the following archives.

+ +
    + {% for archive in archives %} + {% set id = archive[0] %} + {% set name = archive[1] %} +
  • {{name}} ({{id}})
  • + {% endfor %} +
+ +

or from one of the following archives which no longer accept submissions + (most have been subsumed into the archives listed above)

+ + +
    + {% for archive in defunct %} + {% set id = archive[0] %} + {% set name = archive[1] %} + {% set sub_to = archive[2] %} +
  • {{name}} ({{id}}, subsumed into {{sub_to}})
  • + {% endfor %} +
+ +{%- endblock %} diff --git a/browse/templates/archive/single_archive.html b/browse/templates/archive/single_archive.html new file mode 100644 index 000000000..9062122a8 --- /dev/null +++ b/browse/templates/archive/single_archive.html @@ -0,0 +1,111 @@ +{%- extends "base.html" -%} + +{% block head %} +{{ super() }} + +{% endblock head %} + +{% block title %}{{archive.name}}{% endblock %} + +{% block header_h1 %} +

+{{ config['BROWSE_SITE_LABEL'] }} > {{archive_id}} +

+{% endblock %} + + +{%- block content %} +

{{archive.name}} (since {{archive.start_date.strftime('%B %Y')}})

+ + {% if subsumed_id %} +

The {{subsumed_id}} archive has been subsumed into + {{subsuming_category['name']}} ({{subsumed_by}}).

+

{{subsuming_category['name']}} ({{subsumed_by}}) is part of the {{archive['name']}} archive.

+ {% endif %} + + +

For a specific paper, enter the identifier into the top right search box.

+ +
    +
  • Browse: +
      +
    • + new + (most recent mailing, with abstracts) +
    • +
    • recent + (last 5 mailings) +
    • +
    • current month's + {{archive_id}} listings
    • +
    • specific year/month: + + {#
      #} + {# hard coded for legacy /list for now #} + + {{ list_form.archive }} + {{ list_form.year}} {{ list_form.month}} + {{list_form.submit}} +
      +
    • +
    +
  • + +
  • Catch-up: + {# hard coded for legacy /catchup for now #} +
    + + Changes since: + + + + , view results + abstracts + +
    +
  • +
  • Search within the {{archive_id}} archive
  • + +
  • Article statistics by year:
    + {% for url, year in stats_by_year %} + {{year}} + {% endfor %} +
  • + +
+ + {% if category_list|length > 1 %} +

Categories within {{archive.name}}

+
    + {% for category in category_list %} +
  • {{category.id}} - {{category.name}} + (new, + recent, + current month) +
    {{category.description}}
    +
  • + {% endfor %} +
+ {% endif %} +{%- endblock %} diff --git a/browse/templates/base.html b/browse/templates/base.html index 0806f8c5d..ba593bb21 100644 --- a/browse/templates/base.html +++ b/browse/templates/base.html @@ -8,35 +8,29 @@ {# disable for now #} - {%- if config['BROWSE_PIWIK_ENABLED'] %} - + {%- if config['BROWSE_ANALYTICS_ENABLED'] %} + - + {% endif -%} - {%- include "feedback_collector_js.html" -%} - {% if config['BROWSE_USER_BANNER_ENABLED'] %} - + {% if config['BROWSE_USER_BANNER_ENABLED'] and (config['BROWSE_USER_BANNER_START_DATE'] < request_datetime) and (config['BROWSE_USER_BANNER_END_DATE'] > request_datetime) %} + - + {% endif -%} {%- endblock head -%} @@ -45,8 +39,8 @@ {% if config['BROWSE_USER_BANNER_ENABLED'] %} {%- include "user_banner.html" -%} {% endif -%} - {%- if config['BROWSE_PIWIK_ENABLED'] -%} - + {%- if config['BROWSE_ANALYTICS_ENABLED'] -%} + {% endif -%}
diff --git a/browse/templates/cookies.html b/browse/templates/cookies.html new file mode 100644 index 000000000..87c2bf2f1 --- /dev/null +++ b/browse/templates/cookies.html @@ -0,0 +1,81 @@ +{%- extends "base.html" -%} + +{% block title %}Your cookie configuration{% endblock %} + +{% block header_h1 %}

{{ config['BROWSE_SITE_LABEL'] }} > cookies

{% endblock header_h1 %} + +{%- block content %} +

Your cookie configuration for arXiv

+ +

You can use this form to set preferences that will apply to all arXiv.org sites for this +and future sessions when accessed from the same machine. Cookies are snippets of information +stored on your machine and returned to our site with each request you make (click +show additional debugging information below to see what is stored). +This facility will not work if you have disabled cookies or on browsers that do not +support cookies.

+ +

For information about the full-text formats available, consult the +viewing help. +If the cookies are set successfully, then abstract and listing +pages will automatically link to the appropriate versions of each +paper. You may have to reload previously-visited abstract/listing +pages to update the format links (Shift+Reload in many browsers), because +the old versions of the pages will be cached by your browser.

+
+ + + {% for cc in cookies_config %} + {% set id = cc['id'] %} + {% set label = cc['label'] %} + {% set name = cc['name'] %} + {% set options = cc['options'] %} + +

{{label}}

+ {% endfor %} + + + +
+ +{% if debug %} +

Debugging information: dump of current cookie data

+ + + {% for cooname in request.cookies.keys() %} + + {% if cooname in controlled_cookies %} + + {% else %} + + {% endif %} + + + {% endfor %} +
NameValue
{{cooname}}{{cooname}}{{ request.cookies[cooname] }}
+ +

Only those cookies shown emphasized in the table above are controlled +by this interface. arXiv uses other cookies such as browser and +tapir_permanent to maintain session and permanent login settings. +You can, if you want, clear all cookies using the controls in your browser.

+ +

(hide debugging information)

+{% else %} +

(show additional debugging information)

+{% endif %} + +

About cookies

+

"HTTP Cookies" allow clients to retain certain state +information to be included in future requests to a particular +server. Cookies are now defined +by RFC 6265. It is +not necessary to use/permit cookies to read articles on arXiv.org.

+{% endblock content %} diff --git a/browse/templates/feedback_collector.html b/browse/templates/feedback_collector.html new file mode 100644 index 000000000..71143e7bd --- /dev/null +++ b/browse/templates/feedback_collector.html @@ -0,0 +1,2 @@ +Browse v0.2.1 released 2019-04-18 +   diff --git a/browse/templates/feedback_collector_js.html b/browse/templates/feedback_collector_js.html index 885aabc79..b0c05510b 100644 --- a/browse/templates/feedback_collector_js.html +++ b/browse/templates/feedback_collector_js.html @@ -8,8 +8,8 @@ }); }, fieldValues: { - "components": ["15700"], // Browse component. - "versions": ["14132"], // Release browse-0.1 + "components": ["15700"], // Jira ID for browse component + "versions": ["14153"], // Jira ID for browse-0.2.1 release "customfield_11401": window.location.href } }; diff --git a/browse/templates/home/home.html b/browse/templates/home/home.html new file mode 100644 index 000000000..66961972d --- /dev/null +++ b/browse/templates/home/home.html @@ -0,0 +1,95 @@ +{%- extends "base.html" -%} +{% block title %}arXiv.org e-Print archive{% endblock %} +{% block head %} + {{ super() -}} +{% endblock head %} + +{% block login_link %}{% include 'login.html' %}{% endblock %} +{% block body_id %}id="front"{% endblock %} + +{%- block content %} +{#- TODO: display order in taxonomy? -#} +

Open access to {% if document_count -%}{{ "{:,}".format(document_count) }}{%- endif %} + e-prints in the fields of physics, mathematics, computer science, quantitative biology, quantitative finance, statistics, electrical engineering and systems science, and economics. Submissions to arXiv should conform to Cornell University academic standards. arXiv is owned and operated by Cornell University, a private not-for-profit educational institution. arXiv is funded by Cornell University, the Simons Foundation and by the member institutions.

+
+{#- /multi sends to either search, catchup or form interface based on which button is hit. -#} +
+ Subject search and browse: + + + + +
+ + +

+{%- include "home/news.html" -%} +See cumulative "What's New" pages. +Read robots beware before attempting any automated download +

+{#- TODO: define display order in taxonomy? -#} +{{- group_section(('grp_physics','grp_math','grp_cs','grp_q-bio','grp_q-fin','grp_stat','grp_eess','grp_econ')) }} + +
+{#- TODO: remove because of new footer? -#} +

About arXiv

+ + +{% endblock content -%} + +{%- macro group_section(group_keys) -%} + {%- for group_key in group_keys -%} +

{{ groups[group_key].name }}

+
    + {% for archive_key, archive_details in archives.items()|sort(attribute='1.name') if archives[archive_key].in_group == group_key %} + {%- set archive_search_url = url_for('search_archive', archive=archive_key) -%} + {%- set archive_url = url_for('browse.archive', archive=archive_key) if archive_key != 'cs' else 'https://arxiv.org/corr' -%} + {%- set archive_name = archives[archive_key].name if archive_key != 'cs' else 'Computing Research Repository' -%} + + {#- TODO: url_for /list, /catchup -#} +
  • + {{ archive_name }} ({{ archive_key if archive_key != 'cs' else 'CoRR' }} new, recent, search{% if 0 %}, last month{% endif %}) + {%- if group_key == 'grp_physics' and archive_key not in categories -%}
    includes: + {%- elif archive_key == 'cs' -%}
    includes (see detailed description): + {%- elif archive_key in ('eess', 'econ') -%}
    includes (see detailed description): + {%- elif group_key != 'grp_physics' -%}
    includes (see detailed description): + {%- endif -%} + {% for category_key, category_details in categories.items()|sort(attribute='1.name') if categories[category_key].in_archive == archive_key %} + {% if not (loop.first and loop.last and group_key == 'grp_physics') %}{{ categories[category_key].name }}{% if not loop.last %}; {% endif %}{% endif %} + {% endfor %} +
  • + {% endfor %} +
+ {% endfor %} +{%- endmacro -%} diff --git a/browse/templates/home/news.html b/browse/templates/home/news.html new file mode 100644 index 000000000..6fd2773fb --- /dev/null +++ b/browse/templates/home/news.html @@ -0,0 +1,3 @@ +{#- News blurbs appear at the top of the home page. Generally there should be no more than four items. -#} +14 Jan 2019: The annual update from the arXiv team is now available
+5 Sept 2018: arXiv looks to the future with move to Cornell CIS
diff --git a/browse/templates/list/base.html b/browse/templates/list/base.html new file mode 100644 index 000000000..6f3673342 --- /dev/null +++ b/browse/templates/list/base.html @@ -0,0 +1,155 @@ +{%- extends "base.html" -%} + +{% block title %}{{list_ctx_name}} {{list_month_name}} {{list_year}}{% endblock %} + +{% block header_h1 %} +

+ {{ config['BROWSE_SITE_LABEL'] }} > + {{list_ctx_id}} +

+{% endblock %} + +{%- block content %} +
+
+

{{list_ctx_name}}

+ + {% block list_index %} + {# list index is empty by default #} + {% endblock %} + + {{ page_divs() }} + + {% block items %} + {{ item_list( listings, '' ) }} + {% endblock %} + + {{ page_divs() }} +
+
+{%- endblock %} + + +{%- macro page_divs() -%} +
[ total of {{count}} entries + {{ pages() }} + ] +
+
[ showing {{shown}} entries per page: + {{ show_link('fewer',mf_fewer) }} | + {{ show_link('more', mf_more) }} | + {{ show_link('all', mf_all) }} ] +
+{%- endmacro -%} + + +{% macro item_list(items, heading) %} +
+ {% if heading %} +

{{heading}}

+ {% endif %} + + {% for item in items %} + {% set article = item['article'] %} + {% set ids = article.arxiv_identifier.ids %} + {% set listing_index = item['list_index'] %} +
+ [{{listing_index}}] + + arXiv:{{ article.arxiv_identifier.ids }} + + {{ type_info( item ) }} + {{ dl_links( article ) }} +
+
+
+
Title: + {{article.title}} +
+
+ {{- do_author_list( article ) -}} +
+ + {% if article.comments %} +
Comments: + {{article.comments|arxiv_urlize}} +
+ {% endif %} + + {% if article.journal_ref %} +
Journal-ref: + {{article.journal_ref}} +
+ {% endif %} + +
Subjects: + {{article.primary_category.display}} +
+ +
+
+ {% endfor %} +
+{% endmacro %} + + +{%- macro do_author_list(article) -%} + {% set au_list_tup = author_links[article.arxiv_id_v] %} + {%- if not au_list_tup or not au_list_tup[0] -%} + (author list is unavailable) + {%- else -%} + {%- for part in au_list_tup[0] -%} + {%- if part is string -%} + {{- part|arxiv_urlize -}} + {%- else -%} + {{part[0]}} + {%- endif -%} + {% endfor %} + {%- endif -%} +{%- endmacro -%} + + +{%- macro pages() -%} +{% if pg|length != 0 or not viewing_all %} +: + {%- for pg in paging -%} + {% if pg.nolink is defined %} + {{pg.nolink}} + {% else %} + {{pg.txt}} + {% endif %} + {% endfor %} +{% endif %} +{%- endmacro -%} + + +{%- macro show_link( txt, to_show ) %} + {% if to_show %} + + {{txt}} + {% else %} + {{txt}} + {% endif %} +{%- endmacro -%} + +{%- macro dl_links( article ) -%} + {% set id=article.arxiv_id %} + {% set downloads = downloads[ article.arxiv_id_v ] %} + [pdf + {%- if 'ps' in downloads -%} + , ps + {%- endif -%} + {%- if 'other' in downloads -%} + , other + {%- endif -%} + ] +{%- endmacro -%} + +{% macro type_info( item ) %} + {% if item['listingType'] == 'cross' %} + (cross-list from {{item['primary']}}) + {% endif %} + {% if item['listingType'] == 'rep' %} + (replaced) + {% endif %} +{% endmacro %} diff --git a/browse/templates/list/month.html b/browse/templates/list/month.html new file mode 100644 index 000000000..fcfda4aad --- /dev/null +++ b/browse/templates/list/month.html @@ -0,0 +1,5 @@ +{%- extends "list/base.html" -%} + +{% block list_index %} +

Authors and titles for {{pubmonth.strftime('%B %Y')}}

+{% endblock %} diff --git a/browse/templates/list/new.html b/browse/templates/list/new.html new file mode 100644 index 000000000..18b133599 --- /dev/null +++ b/browse/templates/list/new.html @@ -0,0 +1,38 @@ +{%- extends "list/base.html" -%} + +{# +New is quite different than the other list pages. +It has an index of types of items. +It needs to display the submitted dates and announced dates. +It needs to do headings for the new/cross/rep sections. +#} + +{% block list_index %} +{{ listdate() }} +{{ type_index() }} +{% endblock %} + +{% block items %} + {% for sec in sub_sections_for_types %} + {{ item_list(sec['items'], sec['heading']) }} + {% endfor %} +{% endblock %} + +{% macro type_index() %} + {% if index_for_types %} +
    + {% for ityp in index_for_types %} +
  • {{ityp[0]}}
  • + {% endfor %} +
+{% endif %} +{% endmacro %} + +{% macro listdate() %} + {% if announced %} + + {% endif %} +{% endmacro %} diff --git a/browse/templates/list/recent.html b/browse/templates/list/recent.html new file mode 100644 index 000000000..7e9c578ea --- /dev/null +++ b/browse/templates/list/recent.html @@ -0,0 +1,24 @@ +{%- extends "list/base.html" -%} + +{% block list_index %} +

Authors and titles for recent submissions

+ + +{% endblock %} + + +{% macro listdate() %} + {% if announced %} + + {% endif %} +{% endmacro %} diff --git a/browse/templates/list/year.html b/browse/templates/list/year.html new file mode 100644 index 000000000..8391a4dbe --- /dev/null +++ b/browse/templates/list/year.html @@ -0,0 +1,5 @@ +{%- extends "list/base.html" -%} + +{% block list_index %} +

Authors and titles for {{pubmonth.strftime('%Y')}}

+{% endblock %} diff --git a/browse/templates/login.html b/browse/templates/login.html new file mode 100644 index 000000000..a3cbf2e76 --- /dev/null +++ b/browse/templates/login.html @@ -0,0 +1,7 @@ + diff --git a/browse/templates/stats/base.html b/browse/templates/stats/base.html new file mode 100644 index 000000000..f36e88a98 --- /dev/null +++ b/browse/templates/stats/base.html @@ -0,0 +1,14 @@ +{%- extends "base.html" -%} +{# Do not show login status on stats pages for now, per classic #} + +{% block head %} + {{ super() -}} + {# TODO: upgrade d3 to v5.x? #} + +{% endblock head %} + +{% block header_h1 %}

{{ config['BROWSE_SITE_LABEL'] }} > stats > server usage

{% endblock %} + +{%- block content %} +

See also other arXiv usage statistics.

+{% endblock content %} diff --git a/browse/templates/stats/monthly_downloads.html b/browse/templates/stats/monthly_downloads.html new file mode 100644 index 000000000..222fd5ec9 --- /dev/null +++ b/browse/templates/stats/monthly_downloads.html @@ -0,0 +1,21 @@ +{%- extends "stats/base.html" -%} + +{% block title %}Monthly Download Rates{% endblock %} +{% block header_h1 %}

{{ config['BROWSE_SITE_LABEL'] }} > stats > monthly downloads

{% endblock %} + +{% block content %} +

arXiv Monthly Download Rates [CSV]

+ +{% include "stats/monthly_downloads_js.html" %} + +

Blue: Number of downloads per month.
Click and drag in the small graph below to pan or zoom.

+

Total number of downloads {% if most_recent_dt %}through {{ most_recent_dt.strftime('%B %Y') }}{% endif %} = {{ "{:,}".format(total_downloads) }}

+

+ Caveats: While we have attempted to extract download data representing unique full-text downloads by real users, there are many factors which affect accuracy. These factors include: 1) the data is from the main arXiv site and the + arXiv mirrors, though some mirror data is incomplete; 2) only web downloads are included (and not FTP or email "downloads" that were formerly supported); 3) we have counted downloads according to the COUNTER algorithm which excludes + rapid repeat downloads; 4) we have attempted to identify and remove robot or automated downloads from the count (false positives lead to undercounting, failing to identify robots leads to overcounting); 5) data prior to 2009 has not been cleaned + with as much care as later data, it shows trends nonetheless. +

+ +{{ super() }} +{% endblock content %} diff --git a/browse/templates/stats/monthly_downloads_js.html b/browse/templates/stats/monthly_downloads_js.html new file mode 100644 index 000000000..84833c98d --- /dev/null +++ b/browse/templates/stats/monthly_downloads_js.html @@ -0,0 +1,160 @@ + + diff --git a/browse/templates/stats/monthly_submissions.html b/browse/templates/stats/monthly_submissions.html new file mode 100644 index 000000000..216f26fb8 --- /dev/null +++ b/browse/templates/stats/monthly_submissions.html @@ -0,0 +1,16 @@ +{%- extends "stats/base.html" -%} + +{% block title %}Monthly Submissions{% endblock %} +{% block header_h1 %}

{{ config['BROWSE_SITE_LABEL'] }} > stats > monthly submission rates

{% endblock %} + +{% block content %} +

arXiv Monthly Submission Rates [CSV]

+{% include "stats/monthly_submissions_js.html" %} +

Blue: Number of new submissions received during each month since {{ arxiv_start_dt.strftime('%B %Y') }}.
+ Hover over the graph to see the exact count for a given month.

+

Total number of submissions shown in graph as of {{ current_dt.strftime('%B %-d, %Y') }} (after {{ "%.1f"|format(arxiv_age_years|float) }} years) = {{ "{:,}".format(num_submissions) }}

+

+ The total number of submissions excludes {{ "{:,}".format(num_migrated) }} articles that were migrated to arXiv rather than being submitted directly, and includes {{ "{:,}".format(num_deleted) }} articles that have been deleted. The total number of articles available is {{ "{:,}".format(num_submissions_adjusted) }}. +

+{{ super() }} +{% endblock content %} diff --git a/browse/templates/stats/monthly_submissions_js.html b/browse/templates/stats/monthly_submissions_js.html new file mode 100644 index 000000000..b8f9bd946 --- /dev/null +++ b/browse/templates/stats/monthly_submissions_js.html @@ -0,0 +1,199 @@ + + + diff --git a/browse/templates/stats/today.html b/browse/templates/stats/today.html new file mode 100644 index 000000000..2309d99a0 --- /dev/null +++ b/browse/templates/stats/today.html @@ -0,0 +1,16 @@ +{%- extends "stats/base.html" -%} + +{% block title %}hourly usage{% endblock %} + +{% block content %} + +

arXiv Web Server Usage from {{ requested_dt.strftime('%A, %B %-d, %Y') }} ({{ config['BROWSE_SITE_LABEL'] }} site only) [CSV]

+{% if normal_count > 0 %} +{% include "stats/today_js.html" %} +

Total number of connections = {{ "{:,}".format(normal_count) }} (+{{ "{:,}".format(admin_count) }} local & administrative connections) +{% else %} +

No data currently available. +{% endif %} +
Current local time is {{ current_dt.strftime('%A, %B %-d, %Y %H:%M:%S') }} US/Eastern

+{{ super() }} +{% endblock content %} diff --git a/browse/templates/stats/today_js.html b/browse/templates/stats/today_js.html new file mode 100644 index 000000000..0cbc2c60f --- /dev/null +++ b/browse/templates/stats/today_js.html @@ -0,0 +1,125 @@ + + diff --git a/browse/templates/tb/404.html b/browse/templates/tb/404.html new file mode 100644 index 000000000..62d990fa8 --- /dev/null +++ b/browse/templates/tb/404.html @@ -0,0 +1,32 @@ +{%- extends "tb/base.html" -%} + +{% block title %} +{% if arxiv_id %} +[{{ arxiv_id }}] Article identifier not {% if not_found %}found{% else %}recognized{% endif %} +{% elif missing_id %} +No article ID specified +{% else %} +Trackback not found +{% endif %} +{% endblock %} + +{%- block content %} +{% if arxiv_id or missing_id %} + {% if arxiv_id and not_found %} +

Article 'arXiv:{{ arxiv_id }}' not found

+

The identifier you have specified (arXiv:{{ arxiv_id }}) may be invalid. Please inform help@arxiv.org if you believe that the identifier should correspond to a valid paper in arXiv.

+ {% elif arxiv_id %} +

Article identifier '{{ arxiv_id }}' not recognized

+ {% else %} +

No article identifier specified

+ {% endif %} +

We were unable to extract an arXiv article ID from the URL specified. You may:

+{% else %} +

Trackback not found

+

Badly formed redirect request. If the link you followed was on arXiv, please report this error to us. If it was elsewhere, we suggest you report it to the maintainer of that site.

+{% endif %} +
    +
  • View recent trackbacks
  • +
  • View the trackbacks for a particular article at https://arxiv.org{{ url_for('.tb', arxiv_id='') }}{arXiv_id}
  • +
+{% endblock content %} diff --git a/browse/templates/tb/base.html b/browse/templates/tb/base.html new file mode 100644 index 000000000..e86a302b5 --- /dev/null +++ b/browse/templates/tb/base.html @@ -0,0 +1,27 @@ +{%- extends "base.html" -%} +{# Do not show login status on these pages for now, per classic #} + +{% block head %} + {{ super() -}} + + +{% endblock head %} + +{%- block content %} +
+

About trackbacks

+

By sending a trackback, you can notify + arXiv.org that you have created a web page that references a paper. Popular + blogging software supports trackback: you can send us a trackback about this + paper by giving your software the following trackback URL:

+
https://arxiv.org/trackback/{arXiv_id}
+

Some blogging software supports trackback autodiscovery -- in this + case, your software will automatically send a trackback as soon as you + create a link to our abstract page. See our + trackback help page for more information.

+
+ +{% endblock content %} diff --git a/browse/templates/tb/macros.html b/browse/templates/tb/macros.html new file mode 100644 index 000000000..4b93e4531 --- /dev/null +++ b/browse/templates/tb/macros.html @@ -0,0 +1,7 @@ +{%- macro generate_trackback_link(trackback, include_posted_date=True) -%} + + {{- trackback.title|entity_to_utf|truncate(150) }} [ + {%- if trackback.blog_name %}{{ trackback.blog_name|trim|entity_to_utf|truncate(50) -}} @ {% endif -%} + {%- if trackback.has_valid_url %}{{ trackback.display_url|truncate(30,False) }}{% else %}INVALID-URL{% endif -%}] + {%- if include_posted_date %} [trackback posted {{ trackback.posted_datetime.strftime('%a, %-d %b %Y %H:%M:%S %Z') }}]{% endif %} +{%- endmacro -%} diff --git a/browse/templates/tb/recent.html b/browse/templates/tb/recent.html new file mode 100644 index 000000000..efffeb707 --- /dev/null +++ b/browse/templates/tb/recent.html @@ -0,0 +1,42 @@ +{%- extends "tb/base.html" -%} +{%- from 'tb/macros.html' import generate_trackback_link -%} +{%- import 'base/macros.html' as base_macros -%} + +{% block title %}Recent Trackbacks{% endblock %} +{% block header_h1 %}

{{ config['BROWSE_SITE_LABEL'] }} > recent trackbacks

{% endblock %} + +{%- block content %} +

Recent Trackbacks

+

Trackbacks indicate external web sites that link to articles in arXiv.org. Trackbacks may not reflect the opinion of arXiv.org or of that article's authors.

+
+ View last + trackbacks + + +
+ {% if recent_trackback_pings -%} + {%- set ns = namespace(cur_tb_ymd='', cur_tb_url='') -%} + {%- for recent_tb in recent_trackback_pings -%} + {%- set this_tb_ymd = recent_tb[0].posted_datetime.strftime('%B %-d, %Y') -%} + {%- set this_tb_url = recent_tb[0].url -%} + {%- if this_tb_ymd != ns.cur_tb_ymd -%} + {%- set ns.cur_tb_ymd = this_tb_ymd -%} +

{{ this_tb_ymd }}

+ {%- endif -%} + {%- if this_tb_url != ns.cur_tb_url -%} + {%- set ns.cur_tb_url = this_tb_url -%} +

{{- generate_trackback_link(recent_tb[0], include_posted_date=False) }} links the following articles:

+
    + {%- for article in article_map[this_tb_url] -%} +
  • {{ article[1]|tex2utf }} [{{ article[0] }}]
  • + {%- endfor -%} +
+ {%- endif -%} + {%- endfor -%} + {%- endif %} + {{ super() }} +{% endblock content %} diff --git a/browse/templates/tb/tb.html b/browse/templates/tb/tb.html new file mode 100644 index 000000000..ab02b8d9f --- /dev/null +++ b/browse/templates/tb/tb.html @@ -0,0 +1,42 @@ +{%- extends "tb/base.html" -%} +{%- from 'abs/author_links.html' import display_authors_with_links -%} +{%- import 'base/macros.html' as base_macros -%} +{%- from 'tb/macros.html' import generate_trackback_link -%} + +{% block title %}Article Trackbacks{% endblock %} +{% block header_h1 %}

{{ config['BROWSE_SITE_LABEL'] }} > article trackbacks

{% endblock %} +{# Disable login on this page for now, per classic #} +{% block login_link %}{% endblock %} + +{%- block content %} + {% if trackback_pings %} +

Trackbacks for {{ arxiv_identifier.id }}

+ {% for tb in trackback_pings %} +

{{- generate_trackback_link(tb) -}}

+ {% endfor %} +
+ {#- abstract field is deliberately suppressed in call to abs macro. -#} + {%- if abs_meta -%} + {{ base_macros.abs( + abs_meta.arxiv_identifier.id, + abs_meta.title, + display_authors_with_links(abs_meta, author_links), + '', + abs_meta.get_datetime_of_version(abs_meta.version), + abs_meta.primary_category.id, + comments = abs_meta.comments, + msc_class = abs_meta.msc_class, + acm_class = abs_meta.acm_class, + journal_ref = abs_meta.journal_ref, + doi = abs_meta.doi, + report_num = abs_meta.report_num, + version = abs_meta.version, + submission_history = abs_meta.version_history, + secondary_categories = abs_meta.get_secondaries()) }} +
+ {%- endif -%} + {% else %} +

There are no trackback pings recorded for {{ arxiv_identifier.id }}.

+ {% endif %} + {{ super() }} +{% endblock content %} diff --git a/browse/templates/user_banner.html b/browse/templates/user_banner.html index 19d33ec10..8ec524811 100644 --- a/browse/templates/user_banner.html +++ b/browse/templates/user_banner.html @@ -1,4 +1,19 @@ -
-

Attention Readers: arXiv will be unavailable due to maintenance ~20 minutes starting 05:00 ET (09:00 UTC) on Thursday, November 1, 2018.

- + diff --git a/browse/templates/year.html b/browse/templates/year.html new file mode 100644 index 000000000..d3c205c10 --- /dev/null +++ b/browse/templates/year.html @@ -0,0 +1,54 @@ +{% extends "base.html" %} + +{% block title %}{{archive.name}}{% endblock %} + + +{% block header_h1 %} +

+{{ config['BROWSE_SITE_LABEL'] }} > {{archive_id}} > {{year}} +

+{% endblock %} + + +{% block content %} +

{{archive.name}}

+

Article statistics for {{year}}

+

Available monthly lists with counts of {{archive_id}} articles + +cross-listings to {{archive_id}} in {{year}} +(each '|' represents 20 articles):

+ +

TODO fake data as of 2019-03-04

+ +
    + {% for month in months %} +
  • + {{month['yymm']}} + + {% for txt, url in month['art'] -%} + {%- if url -%} + {{txt}} + {%- else -%} + {{txt}} + {%- endif -%} + {%- endfor %} + {{month['new']|default('0',true)}} + {{month['cross']|default('0',true)}} +
  • + {% endfor %} +
+ +

{{year}} totals: {{listing['new_count']|default('unknown')}} articles + {{listing['cross_count']|default('unknown')}} cross-lists

+ +

Other years: +

  • Article statistics by year:
    + {% for url, year in stats_by_year %} + {% if url %} + {{year}} + {% else %} + {{year}} + {% endif %} + {% endfor %} +
  • + +

    + +{% endblock %} diff --git a/browse/util/__init__.py b/browse/util/__init__.py deleted file mode 100644 index a7a09d0fa..000000000 --- a/browse/util/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Utility code for browse.""" diff --git a/browse/util/clickthrough.py b/browse/util/clickthrough.py deleted file mode 100644 index 2e8b849cb..000000000 --- a/browse/util/clickthrough.py +++ /dev/null @@ -1,30 +0,0 @@ -""" -Functions to create hashes for clickthrough links. - -These are used to log to the web acccess logs when the DOI and -bookmarking links are clicked. - -The hash is used to prevent malicious use of the click through -controller to create links that look like they are on arXiv but get -redirected someplace undesirable. -""" - -from typing import Callable -import hashlib - - -def create_hash(secret: str, url: str) -> str: - """Create a hash of the secret and url.""" - s = f'{secret}{url}' - return str(hashlib.md5(s.encode()).hexdigest()[0:8]) - - -def is_hash_valid(secret: str, url: str, ct_hash: str) -> bool: - """Check that ct_hash was generated by create_hash for secret and url.""" - return ct_hash == create_hash(secret, url) - - -def create_ct_url(secret: str, url_for: Callable[..., str], url: str) -> str: - """Create a URL to the clickthrough service with a valid hash.""" - url_for('browse.clickthrough', url=url, v=create_hash(secret, url)) - return url_for('browse.clickthrough', url=url, v=create_hash(secret, url)) diff --git a/browse/util/id_patterns.py b/browse/util/id_patterns.py deleted file mode 100644 index 325242fa7..000000000 --- a/browse/util/id_patterns.py +++ /dev/null @@ -1,329 +0,0 @@ -r"""Patterns and functions to detect arXiv ids and Urls in text. - -Functions to detech arXiv ids, URLs and DOI in text. -Functions to transform them to tags. - -These were originally jinja filters but became a little too big -for that so they were split out and made more general so they didn't -rely on the Flask context. 
- -These all use expect input of Markup or non-markup text and return -Markup objects. This is because the that get added need to avoid -double escaping. - -There are several classes of patterns we want to match but there is -some overlap in these patterns. To avoid looking for and parsing HTML in each -jinja filter, detecting these patterns is combined. - -So far we are looking for: -DOIs DOI: 10.1145/0001234.1234567 -arXiv IDS: 1234.12345 1234.12345v1 hep-ph1307.1843 -HTTP URLs: http://something.org/myPaper/1234.12345 -FTP URLs: ftp://example.com/files/1234.12345 - -Just matching for arXiv ids with \d{4}\.\d{4,5} will match several of -these. To deal with this we are priortizing the matches and -interupting once one is found. - -We should probably match DOIs first because they are the source of a -lot of false positives for arxiv matches. -""" -from typing import Optional, List, Pattern, Match, Tuple, Callable -import re -from dataclasses import dataclass - -from urllib.parse import quote -from jinja2 import Markup, escape - -from arxiv import taxonomy - - -@dataclass -class Matchable: - """Class for paterns.""" - - examples: List[str] - pattern: Pattern - - -def _identity(x: str)->str: - """Identity funciton for default in some places.""" - return x - - -doi_patterns = [ - Matchable(['10.1145/0001234.1234567'], - re.compile(r'(?P10.\d{4,9}/[-._;()/:A-Z0-9]+)', re.I)) -] -"""List of Matchable for DOIs in text. - -We should probably match DOIs first because they are the source of a -lot of false positives for arxiv matches. - -Only using the most general express from -https://www.crossref.org/blog/dois-and-matching-regular-expressions/ -""" - -_archive = '|'.join([re.escape(key) for key in taxonomy.definitions.ARCHIVES.keys()]) -"""string for use in Regex for all arXiv archives""" - -_category = '|'.join([re.escape(key) for key in taxonomy.definitions.CATEGORIES.keys()]) - -_arxiv_id_prefix = r'(?Par[xX]iv:)?' 
-"""Attempt to catch the arxiv prefix in front of arxiv ids so it can be -included in the tag anchor. ARXIVNG-1284""" - -basic_arxiv_id_patterns = [ - Matchable(['math/0501233', 'hep-ph/0611734', 'gr-qc/0112123'], - re.compile(_arxiv_id_prefix + r'(?P(%s)\/\d{2}[01]\d{4}(v\d*)?)' - % _archive, re.I)), - Matchable(['1609.05068', '1207.1234v1', '1207.1234', '1807.12345', - '1807.12345v1', '1807.12345v12'], - re.compile(r'(?\d{4}\.\d{4,5}(v\d*)?)', - re.I)), - Matchable(['math.GR/0601136v3', 'math.GR/0601136'], - re.compile(_arxiv_id_prefix + r'(?P(%s)\/\d{2}[01]\d{4}(v\d*)?)' - % _category, re.I)) -] - -OKCHARS = r'([a-z0-9,_.\-+~:]|%[a-f0-9]*)' -"""Chacters that are acceptable during PATH, QUERY and ANCHOR parts""" - -HOST_NAME = r'(?:[a-z0-9][a-z0-9\-.:]+[a-z0-9])' -"""Regex used to match host names in arXiv urlize. - -This is not a perfect regex for a host name, It accepts only a sub-set -of hostnames to meet the needs of arxiv. - -HOST_NAME must end with a simplified character to avoid capturing a -period. -""" - -PATH = rf'(?P(/{OKCHARS}*)+)?' -"""Regex for path part of URLs for use in urlize""" - -QUERY = rf'(?P\?(&?({OKCHARS}*(={OKCHARS}*)?))*)?' -"""Regex for query part of URLs for use in urlize""" - -ANCHOR = rf'(?P#({OKCHARS}|/)*)?' 
-"""Regex for anchor part of URLs for use in urlize""" - -URLINTEXT_PAT = re.compile(r'(?P(?:https?://)' - f'{HOST_NAME}{PATH}{QUERY}{ANCHOR})', - re.I) -"""Regex to match URLs in text.""" - -FTP_PAT = re.compile(rf'(?P(?:ftp://)({OKCHARS}|(@))*{PATH})', re.I) -"""Regex to match FTP URLs in text.""" - -basic_url_patterns = [ - Matchable(['http://something.com/bla'], URLINTEXT_PAT), - Matchable(['ftp://something.com/bla'], FTP_PAT) -] -"""List of Matchable to use when finding URLs in text""" - -bad_arxiv_id_patterns = [ - re.compile('vixra', re.I), # don't need to link to vixra -] -"""List of Regex patterns that will cause matching to be skipped for -the token.""" - -dois_ids_and_urls = basic_url_patterns + doi_patterns + basic_arxiv_id_patterns -"""List of Matchable to use when finding DOIs, arXiv IDs, and URLs. - -URLs are first because some URLs contain DOIs or arXiv IDS. - -DOI are before arXiv ids because many DOIs are falsely matched by the -arxiv_id patterns. -""" - - -_bad_endings = ['.', ',', ':', ';', '&', '(', '[', '{'] -"""These should not appear at the end of URLs because they are likely -part of the surrounding text""" - - -def _find_match(patterns: List[Matchable], token: str) \ - -> Optional[Tuple[Match, Matchable]]: - """Find first in patterns that is found in txt.""" - for chgMtch in patterns: - if chgMtch.pattern.flags: - fnd = re.search(chgMtch.pattern, token) - else: - fnd = re.search(chgMtch.pattern, token, re.I) - if fnd is not None: - return (fnd, chgMtch) - return None - - -def _transform_token(patterns: List[Matchable], - bad_patterns: List[Pattern], - id_to_url: Callable[[str], str], - doi_to_url: Callable[[str], str], - url_to_url: Callable[[str], str], - token: str) -> str: - """ - Transform a token from text to one of the Matchables. - - This only transforms against the first of Matchable matched. - Matching on this token will be skipped if any of the bad_patterns - match the token (that is re.search). 
- """ - id_to_url = id_to_url or (lambda x: x) - doi_to_url = doi_to_url or (lambda x: x) - url_to_url = url_to_url or (lambda x: x) - - for pattern in bad_patterns: - if re.search(pattern, token): - return token - - mtch = _find_match(patterns, token) - if mtch is None: - return token - - (match, _) = mtch - keys = match.groupdict().keys() - if 'arxiv_id' in keys: - (front, back) = _arxiv_id_sub(match, id_to_url) - elif 'doi' in keys: - (front, back) = _doi_sub(match, doi_to_url) - elif 'url' in keys: - (front, back) = _url_sub(match, url_to_url) - else: - # unclear how to substitute in for this match - return token - - if back: - t_back = _transform_token(patterns, bad_patterns, - id_to_url, doi_to_url, url_to_url, back) - return front + Markup(t_back) - else: - return front - - -def _arxiv_id_sub(match: Match, id_to_url: Callable[[str], str]) \ - -> Tuple[Markup, str]: - """Return match.string transformed for a arxiv id match.""" - aid = match.group('arxiv_id') - prefix = 'arXiv:' if match.group('arxiv_prefix') else '' - - if aid[-1] in _bad_endings: - arxiv_url = id_to_url(aid)[:-1] - anchor = aid[:-1] - back = aid[-1] + match.string[match.end():] - else: - arxiv_url = id_to_url(aid) - anchor = prefix + aid - back = match.string[match.end():] - - front = match.string[0:match.start()] - return (Markup(f'{front}{anchor}'), back) - - -def _doi_sub(match: Match, doi_to_url: Callable[[str], str]) \ - ->Tuple[Markup, str]: - """Return match.string transformed for a DOI match.""" - doi = match.group('doi') - if(doi[-1] in _bad_endings): - back = match.string[match.end():] + doi[-1] - doi = doi[:-1] - else: - back = match.string[match.end():] - - quoted_doi = quote(doi, safe='/') - doi_url = f'https://dx.doi.org/{quoted_doi}' - doi_url = doi_to_url(doi_url) - - anchor = escape(doi) - front = match.string[0:match.start()] - return (Markup(f'{front}{anchor}'), back) - - -def _url_sub(match: Match, url_to_url: Callable[[str], str]) \ - ->Tuple[Markup, str]: - """Return 
match.string transformed for a URL match.""" - url = match.group('url') - if url.startswith('https'): - anchor = 'this https URL' - elif url.startswith('http'): - anchor = 'this http URL' - elif url.startswith('ftp'): - anchor = 'this ftp URL' - else: - anchor = 'this URL' - - front = match.string[0:match.start()] - if url[-1] in _bad_endings: - back = url[-1] + match.string[match.end():] - url = url[:-1] - else: - back = match.string[match.end():] - - url = url_to_url(url) - return (Markup(f'{front}{anchor}'), back) - - -_word_split_re = re.compile(r'(\s+)') -"""Regex to split to tokens during _to_tags. - -Capturing group causes the splitting spaces to be included -in the returned list. -""" - - -def _to_tags(patterns: List[Matchable], - bad_patterns: List[Pattern], - id_to_url: Callable[[str], str], - doi_to_url: Callable[[str], str], - url_to_url: Callable[[str], str], - text: str)-> str: - """Split text to tokens, do _transform_token for each, return results.""" - def transform_token(tkn: str)-> str: - return _transform_token(patterns, bad_patterns, - id_to_url, doi_to_url, url_to_url, - tkn) - - if not hasattr(text, '__html__'): - text = Markup(escape(text)) - - words = _word_split_re.split(text) - for i, token in enumerate(words): - token_2 = transform_token(token) - if token_2 != token: - words[i] = token_2 - result = u''.join(words) - return Markup(result) - - -def do_id_to_tags(id_to_url: Callable[[str], str], - text: str)-> str: - """Transform arxiv ids in text to tags.""" - return _to_tags(basic_arxiv_id_patterns, - bad_arxiv_id_patterns, - id_to_url, _identity, _identity, - text) - - -def do_dois_id_urls_to_tags(id_to_url: Callable[[str], str], - doi_to_url: Callable[[str], str], - text: str)-> str: - """Transform DOIs, arxiv ids and URLs in text to tags.""" - return _to_tags(dois_ids_and_urls, - bad_arxiv_id_patterns, - id_to_url, doi_to_url, _identity, - text) - - -def do_dois_to_tags(doi_to_url: Callable[[str], str], text: str)->str: - """Transform 
DOIs in text to tags.""" - return _to_tags(doi_patterns, [], _identity, doi_to_url, _identity, text) - - -def do_dois_arxiv_ids_to_tags(id_to_url: Callable[[str], str], - doi_to_url: Callable[[str], str], - text: str)->str: - """Transform DOIs and arXiv IDs to tags.""" - return _to_tags(doi_patterns + basic_arxiv_id_patterns, - bad_arxiv_id_patterns, - id_to_url, doi_to_url, _identity, - text) diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 000000000..69fe55ecf --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,19 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line. +SPHINXOPTS = +SPHINXBUILD = sphinx-build +SOURCEDIR = source +BUILDDIR = build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) \ No newline at end of file diff --git a/docs/source/api/browse.config.rst b/docs/source/api/browse.config.rst new file mode 100644 index 000000000..db15bb759 --- /dev/null +++ b/docs/source/api/browse.config.rst @@ -0,0 +1,7 @@ +browse.config module +======================================== + +.. automodule:: browse.config + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/browse.controllers.abs_page.rst b/docs/source/api/browse.controllers.abs_page.rst new file mode 100644 index 000000000..060b7327f --- /dev/null +++ b/docs/source/api/browse.controllers.abs_page.rst @@ -0,0 +1,7 @@ +browse.controllers.abs_page module +======================================== + +.. 
automodule:: browse.controllers.abs_page + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/browse.controllers.home_page.rst b/docs/source/api/browse.controllers.home_page.rst new file mode 100644 index 000000000..7d050838e --- /dev/null +++ b/docs/source/api/browse.controllers.home_page.rst @@ -0,0 +1,7 @@ +browse.controllers.home_page module +======================================== + +.. automodule:: browse.controllers.home_page + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/browse.controllers.list_page.rst b/docs/source/api/browse.controllers.list_page.rst new file mode 100644 index 000000000..d337a6bdd --- /dev/null +++ b/docs/source/api/browse.controllers.list_page.rst @@ -0,0 +1,7 @@ +browse.controllers.list_page module +======================================== + +.. automodule:: browse.controllers.list_page + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/browse.controllers.prevnext.rst b/docs/source/api/browse.controllers.prevnext.rst new file mode 100644 index 000000000..3f6cefa63 --- /dev/null +++ b/docs/source/api/browse.controllers.prevnext.rst @@ -0,0 +1,7 @@ +browse.controllers.prevnext module +======================================== + +.. automodule:: browse.controllers.prevnext + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/browse.controllers.rst b/docs/source/api/browse.controllers.rst new file mode 100644 index 000000000..0eff93e9b --- /dev/null +++ b/docs/source/api/browse.controllers.rst @@ -0,0 +1,17 @@ +browse.controllers package +========================== + +.. automodule:: browse.controllers + :members: + :undoc-members: + :show-inheritance: + +Subpackages +----------- + +.. 
toctree:: + + browse.controllers.abs_page + browse.controllers.home_page + browse.controllers.list_page + browse.controllers.prevnext \ No newline at end of file diff --git a/docs/source/api/browse.domain.category.rst b/docs/source/api/browse.domain.category.rst new file mode 100644 index 000000000..58a599871 --- /dev/null +++ b/docs/source/api/browse.domain.category.rst @@ -0,0 +1,7 @@ +browse.domain.category module +============================= + +.. automodule:: browse.domain.category + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/browse.domain.identifier.rst b/docs/source/api/browse.domain.identifier.rst new file mode 100644 index 000000000..644c40ea1 --- /dev/null +++ b/docs/source/api/browse.domain.identifier.rst @@ -0,0 +1,7 @@ +browse.domain.identifier module +=============================== + +.. automodule:: browse.domain.identifier + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/browse.domain.institution.rst b/docs/source/api/browse.domain.institution.rst new file mode 100644 index 000000000..434608d81 --- /dev/null +++ b/docs/source/api/browse.domain.institution.rst @@ -0,0 +1,7 @@ +browse.domain.institution module +================================ + +.. automodule:: browse.domain.institution + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/browse.domain.license.rst b/docs/source/api/browse.domain.license.rst new file mode 100644 index 000000000..d6b7422c8 --- /dev/null +++ b/docs/source/api/browse.domain.license.rst @@ -0,0 +1,7 @@ +browse.domain.license module +============================= + +.. 
automodule:: browse.domain.license + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/browse.domain.listing.rst b/docs/source/api/browse.domain.listing.rst new file mode 100644 index 000000000..37a94f172 --- /dev/null +++ b/docs/source/api/browse.domain.listing.rst @@ -0,0 +1,7 @@ +browse.domain.listing module +============================= + +.. automodule:: browse.domain.listing + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/browse.domain.metadata.rst b/docs/source/api/browse.domain.metadata.rst new file mode 100644 index 000000000..f8cf62a04 --- /dev/null +++ b/docs/source/api/browse.domain.metadata.rst @@ -0,0 +1,7 @@ +browse.domain.metadata module +=============================== + +.. automodule:: browse.domain.metadata + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/browse.domain.rst b/docs/source/api/browse.domain.rst new file mode 100644 index 000000000..1add291d4 --- /dev/null +++ b/docs/source/api/browse.domain.rst @@ -0,0 +1,19 @@ +browse.domain package +===================== + +.. automodule:: browse.domain + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +.. toctree:: + + browse.domain.category + browse.domain.identifier + browse.domain.institution + browse.domain.license + browse.domain.listing + browse.domain.metadata diff --git a/docs/source/api/browse.exceptions.rst b/docs/source/api/browse.exceptions.rst new file mode 100644 index 000000000..b228e3e2d --- /dev/null +++ b/docs/source/api/browse.exceptions.rst @@ -0,0 +1,7 @@ +browse.exceptions module +======================================== + +.. 
automodule:: browse.exceptions + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/browse.factory.rst b/docs/source/api/browse.factory.rst new file mode 100644 index 000000000..3de36913e --- /dev/null +++ b/docs/source/api/browse.factory.rst @@ -0,0 +1,7 @@ +browse.factory module +======================================== + +.. automodule:: browse.factory + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/browse.filters.rst b/docs/source/api/browse.filters.rst new file mode 100644 index 000000000..a45502856 --- /dev/null +++ b/docs/source/api/browse.filters.rst @@ -0,0 +1,7 @@ +browse.filters module +======================================== + +.. automodule:: browse.filters + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/browse.routes.rst b/docs/source/api/browse.routes.rst new file mode 100644 index 000000000..c68fe08c3 --- /dev/null +++ b/docs/source/api/browse.routes.rst @@ -0,0 +1,14 @@ +browse.routes package +===================== + +.. automodule:: browse.routes + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +.. toctree:: + + browse.routes.ui diff --git a/docs/source/api/browse.routes.ui.rst b/docs/source/api/browse.routes.ui.rst new file mode 100644 index 000000000..2f1c7c58f --- /dev/null +++ b/docs/source/api/browse.routes.ui.rst @@ -0,0 +1,7 @@ +browse.routes.ui module +======================= + +.. automodule:: browse.routes.ui + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/browse.rst b/docs/source/api/browse.rst new file mode 100644 index 000000000..076cff262 --- /dev/null +++ b/docs/source/api/browse.rst @@ -0,0 +1,28 @@ +browse package +============== + +.. automodule:: browse + :members: + :undoc-members: + :show-inheritance: + +Subpackages +----------- + +.. toctree:: + + browse.controllers + browse.domain + browse.routes + browse.services + browse.util + +Submodules +---------- + +.. 
toctree:: + + browse.config + browse.exceptions + browse.factory + browse.filters diff --git a/docs/source/api/browse.services.database.models.rst b/docs/source/api/browse.services.database.models.rst new file mode 100644 index 000000000..fc7530bc1 --- /dev/null +++ b/docs/source/api/browse.services.database.models.rst @@ -0,0 +1,7 @@ +browse.services.database.models module +====================================== + +.. automodule:: browse.services.database.models + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/browse.services.database.rst b/docs/source/api/browse.services.database.rst new file mode 100644 index 000000000..d2bef259f --- /dev/null +++ b/docs/source/api/browse.services.database.rst @@ -0,0 +1,14 @@ +browse.services.database package +================================ + +.. automodule:: browse.services.database + :members: + :undoc-members: + :show-inheritance: + +Subpackages +----------- + +.. toctree:: + + browse.services.database.models diff --git a/docs/source/api/browse.services.document.author_affil.rst b/docs/source/api/browse.services.document.author_affil.rst new file mode 100644 index 000000000..1322b6ccc --- /dev/null +++ b/docs/source/api/browse.services.document.author_affil.rst @@ -0,0 +1,7 @@ +browse.services.document.author_affil module +============================================ + +.. automodule:: browse.services.document.author_affil + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/browse.services.document.cache.rst b/docs/source/api/browse.services.document.cache.rst new file mode 100644 index 000000000..684e37e14 --- /dev/null +++ b/docs/source/api/browse.services.document.cache.rst @@ -0,0 +1,7 @@ +browse.services.document.cache module +===================================== + +.. 
automodule:: browse.services.document.cache + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/browse.services.document.metadata.rst b/docs/source/api/browse.services.document.metadata.rst new file mode 100644 index 000000000..40a8aae95 --- /dev/null +++ b/docs/source/api/browse.services.document.metadata.rst @@ -0,0 +1,7 @@ +browse.services.document.metadata module +======================================== + +.. automodule:: browse.services.document.metadata + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/browse.services.document.rst b/docs/source/api/browse.services.document.rst new file mode 100644 index 000000000..1d76663da --- /dev/null +++ b/docs/source/api/browse.services.document.rst @@ -0,0 +1,17 @@ +browse.services.document package +================================ + +.. automodule:: browse.services.document + :members: + :undoc-members: + :show-inheritance: + +Subpackages +----------- + +.. toctree:: + + browse.services.document.author_affil + browse.services.document.cache + browse.services.document.metadata + diff --git a/docs/source/api/browse.services.listing.fake_listings.rst b/docs/source/api/browse.services.listing.fake_listings.rst new file mode 100644 index 000000000..16ddd84e6 --- /dev/null +++ b/docs/source/api/browse.services.listing.fake_listings.rst @@ -0,0 +1,7 @@ +browse.services.listing.fake_listings module +============================================== + +.. automodule:: browse.services.listing.fake_listings + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/browse.services.listing.rst b/docs/source/api/browse.services.listing.rst new file mode 100644 index 000000000..52011e20c --- /dev/null +++ b/docs/source/api/browse.services.listing.rst @@ -0,0 +1,14 @@ +browse.services.listing package +================================ + +.. 
automodule:: browse.services.listing + :members: + :undoc-members: + :show-inheritance: + +Subpackages +----------- + +.. toctree:: + + browse.services.listing.fake_listings diff --git a/docs/source/api/browse.services.rst b/docs/source/api/browse.services.rst new file mode 100644 index 000000000..d7189959b --- /dev/null +++ b/docs/source/api/browse.services.rst @@ -0,0 +1,18 @@ +browse.services package +======================= + +.. automodule:: browse.services + :members: + :undoc-members: + :show-inheritance: + +Subpackages +----------- + +.. toctree:: + + browse.services.database + browse.services.document + browse.services.listing + browse.services.search + browse.services.util diff --git a/docs/source/api/browse.services.search.rst b/docs/source/api/browse.services.search.rst new file mode 100644 index 000000000..a13da02c9 --- /dev/null +++ b/docs/source/api/browse.services.search.rst @@ -0,0 +1,14 @@ +browse.services.search package +================================ + +.. automodule:: browse.services.search + :members: + :undoc-members: + :show-inheritance: + +Subpackages +----------- + +.. toctree:: + + browse.services.search.search_authors diff --git a/docs/source/api/browse.services.search.search_authors.rst b/docs/source/api/browse.services.search.search_authors.rst new file mode 100644 index 000000000..d929b7ec0 --- /dev/null +++ b/docs/source/api/browse.services.search.search_authors.rst @@ -0,0 +1,7 @@ +browse.services.search.search_authors module +============================================ + +.. automodule:: browse.services.search.search_authors + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/browse.services.util.email.rst b/docs/source/api/browse.services.util.email.rst new file mode 100644 index 000000000..657e9b271 --- /dev/null +++ b/docs/source/api/browse.services.util.email.rst @@ -0,0 +1,7 @@ +browse.services.util.email module +============================================ + +.. 
automodule:: browse.services.util.email + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/browse.services.util.external_refs_cits.rst b/docs/source/api/browse.services.util.external_refs_cits.rst new file mode 100644 index 000000000..ade155369 --- /dev/null +++ b/docs/source/api/browse.services.util.external_refs_cits.rst @@ -0,0 +1,7 @@ +browse.services.util.external_refs_cits module +================================================ + +.. automodule:: browse.services.util.external_refs_cits + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/browse.services.util.formats.rst b/docs/source/api/browse.services.util.formats.rst new file mode 100644 index 000000000..fa2293015 --- /dev/null +++ b/docs/source/api/browse.services.util.formats.rst @@ -0,0 +1,7 @@ +browse.services.util.formats module +================================================ + +.. automodule:: browse.services.util.formats + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/browse.services.util.metatags.rst b/docs/source/api/browse.services.util.metatags.rst new file mode 100644 index 000000000..6623c2d9b --- /dev/null +++ b/docs/source/api/browse.services.util.metatags.rst @@ -0,0 +1,7 @@ +browse.services.util.metatags module +================================================ + +.. automodule:: browse.services.util.metatags + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/browse.services.util.response_headers.rst b/docs/source/api/browse.services.util.response_headers.rst new file mode 100644 index 000000000..15daf9438 --- /dev/null +++ b/docs/source/api/browse.services.util.response_headers.rst @@ -0,0 +1,7 @@ +browse.services.util.response_headers module +================================================ + +.. 
automodule:: browse.services.util.response_headers + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/browse.services.util.rst b/docs/source/api/browse.services.util.rst new file mode 100644 index 000000000..a20abf25c --- /dev/null +++ b/docs/source/api/browse.services.util.rst @@ -0,0 +1,19 @@ +browse.services.util package +================================ + +.. automodule:: browse.services.util + :members: + :undoc-members: + :show-inheritance: + +Subpackages +----------- + +.. toctree:: + + browse.services.util.email + browse.services.util.external_refs_cits + browse.services.util.formats + browse.services.util.metatags + browse.services.util.response_headers + browse.services.util.tex2utf diff --git a/docs/source/api/browse.services.util.tex2utf.rst b/docs/source/api/browse.services.util.tex2utf.rst new file mode 100644 index 000000000..070b736dd --- /dev/null +++ b/docs/source/api/browse.services.util.tex2utf.rst @@ -0,0 +1,7 @@ +browse.services.util.tex2utf module +================================================ + +.. automodule:: browse.services.util.tex2utf + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/browse.util.rst b/docs/source/api/browse.util.rst new file mode 100644 index 000000000..2491304cb --- /dev/null +++ b/docs/source/api/browse.util.rst @@ -0,0 +1,7 @@ +browse.util module +======================================== + +.. automodule:: browse.util + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/modules.rst b/docs/source/api/modules.rst new file mode 100644 index 000000000..797a8bd73 --- /dev/null +++ b/docs/source/api/modules.rst @@ -0,0 +1,7 @@ +browse +====== + +.. 
toctree:: + :maxdepth: 4 + + browse \ No newline at end of file diff --git a/docs/architecture.rst b/docs/source/architecture.rst similarity index 99% rename from docs/architecture.rst rename to docs/source/architecture.rst index a361d771d..e1445bd71 100644 --- a/docs/architecture.rst +++ b/docs/source/architecture.rst @@ -1,5 +1,5 @@ -Browse Application -****************** +Browse Application Architecture +******************************* The browse application provides reader-facing views onto arXiv documents and their metadata. "Browse" functionality encompasses the abstract page, content diff --git a/docs/source/conf.py b/docs/source/conf.py new file mode 100644 index 000000000..e941fe2ed --- /dev/null +++ b/docs/source/conf.py @@ -0,0 +1,190 @@ +# -*- coding: utf-8 -*- +# +# Configuration file for the Sphinx documentation builder. +# +# This file does only contain a selection of the most common options. For a +# full list see the documentation: +# http://www.sphinx-doc.org/en/master/config + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. 
+# +# import os +# import sys +# sys.path.insert(0, os.path.abspath('.')) +import os +import sys +sys.path.insert(0, os.path.abspath('.')) +sys.path.append(os.path.abspath('..')) +sys.path.append(os.path.abspath('../..')) + +# -- Project information ----------------------------------------------------- + +project = 'arXiv Browse Service' +copyright = '2019, arXiv-NG Team' +author = 'arXiv-NG Team' + +# The short X.Y version +version = '' +# The full version, including alpha/beta/rc tags +release = '0.2.1' + + +# -- General configuration --------------------------------------------------- + +# If your documentation needs a minimal Sphinx version, state it here. +# +# needs_sphinx = '1.0' + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + 'sphinx.ext.autodoc', + 'sphinx.ext.intersphinx', + 'sphinx.ext.coverage', + 'sphinx.ext.viewcode', + 'sphinx.ext.githubpages', +] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# The suffix(es) of source filenames. +# You can specify multiple suffix as a list of string: +# +# source_suffix = ['.rst', '.md'] +source_suffix = '.rst' + +# The master toctree document. +master_doc = 'index' + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +# +# This is also used if you do content translation via gettext catalogs. +# Usually you set "language" from the command line for these cases. +language = None + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. +exclude_patterns = [] + +# The name of the Pygments (syntax highlighting) style to use. 
+pygments_style = None + + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = 'alabaster' + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +# +# html_theme_options = {} + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] + +# Custom sidebar templates, must be a dictionary that maps document names +# to template names. +# +# The default sidebars (for documents that don't match any pattern) are +# defined by theme itself. Builtin themes are using these templates by +# default: ``['localtoc.html', 'relations.html', 'sourcelink.html', +# 'searchbox.html']``. +# +# html_sidebars = {} + + +# -- Options for HTMLHelp output --------------------------------------------- + +# Output file base name for HTML help builder. +htmlhelp_basename = 'arXivBrowseServicedoc' + + +# -- Options for LaTeX output ------------------------------------------------ + +latex_elements = { + # The paper size ('letterpaper' or 'a4paper'). + # + # 'papersize': 'letterpaper', + + # The font size ('10pt', '11pt' or '12pt'). + # + # 'pointsize': '10pt', + + # Additional stuff for the LaTeX preamble. + # + # 'preamble': '', + + # Latex figure (float) alignment + # + # 'figure_align': 'htbp', +} + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, +# author, documentclass [howto, manual, or own class]). 
+latex_documents = [ + (master_doc, 'arXivBrowseService.tex', 'arXiv Browse Service Documentation', + 'arXiv-NG Team', 'manual'), +] + + +# -- Options for manual page output ------------------------------------------ + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). +man_pages = [ + (master_doc, 'arxivbrowseservice', 'arXiv Browse Service Documentation', + [author], 1) +] + + +# -- Options for Texinfo output ---------------------------------------------- + +# Grouping the document tree into Texinfo files. List of tuples +# (source start file, target name, title, author, +# dir menu entry, description, category) +texinfo_documents = [ + (master_doc, 'arXivBrowseService', 'arXiv Browse Service Documentation', + author, 'arXivBrowseService', 'One line description of project.', + 'Miscellaneous'), +] + + +# -- Options for Epub output ------------------------------------------------- + +# Bibliographic Dublin Core info. +epub_title = project + +# The unique identifier of the text. This can be a ISBN number +# or the project homepage. +# +# epub_identifier = '' + +# A unique identification for the text. +# +# epub_uid = '' + +# A list of files that should not be packed into the epub file. +epub_exclude_files = ['search.html'] + + +# -- Extension configuration ------------------------------------------------- + +# -- Options for intersphinx extension --------------------------------------- + +# Example configuration for intersphinx: refer to the Python standard library. +intersphinx_mapping = {'https://docs.python.org/': None} diff --git a/docs/source/index.rst b/docs/source/index.rst new file mode 100644 index 000000000..9f01e357a --- /dev/null +++ b/docs/source/index.rst @@ -0,0 +1,23 @@ +.. arXiv Browse Service documentation master file, created by + sphinx-quickstart on Wed Feb 13 16:27:27 2019. 
+ You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +Welcome to arXiv Browse Service's documentation! +================================================ + +.. toctree:: + :maxdepth: 3 + :caption: Contents: + + architecture + api/modules + + + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` diff --git a/tests/data/abs_files/ftp/arxiv/papers/1902/1902.11195.abs b/tests/data/abs_files/ftp/arxiv/papers/1902/1902.11195.abs new file mode 100644 index 000000000..8e2c26b7f --- /dev/null +++ b/tests/data/abs_files/ftp/arxiv/papers/1902/1902.11195.abs @@ -0,0 +1,46 @@ +------------------------------------------------------------------------------ +\\ +arXiv:1902.11195 +From: Israel Mardor +Date: Thu, 28 Feb 2019 16:26:35 GMT (2136kb,D) + +Title: A Novel Method for the Measurement of Half-Lives and Decay Branching + Ratios of Exotic Nuclei +Authors: Ivan Miskun (1), Timo Dickel (1 and 2), Israel Mardor (3 and 4), + Christine Hornung (1), Daler Amanbayev (1), Samuel Ayet San Andr\'es (1 and + 2), Julian Bergmann (1), Jens Ebert (1), Hans Geissel (1 and 2), Magdalena + G\'orska (2), Florian Greiner (1), Emma Haettner (2), Wolfgang R. Pla{\ss} (1 + and 2), Sivaji Purushothaman (2), Christoph Scheidenberger (1 and 2), + Ann-Kathrin Rink (1), Helmut Weick (2), Soumya Bagchi (1 and 2 and 6), Paul + Constantin (5), Satbir Kaur (6), Wayne Lippert (1), Bo Mei (5), Iain Moore + (7), Jan-Hendrick Otto (1), Stephane Pietri (2), Ilkka Pohjalainen (7), + Andrej Prochazka (2), Christoph Rappold (1 and 2), Moritz P. Reiter (1 and + 8), Yoshiki K. Tanaka (2), John S. 
Winfield (2), for the Super-FRS Experiment + Collaboration ((1) Justus-Liebig-Universit\"at, Gie{\ss}en, Germany (2) GSI + Helmholtzzentrum f\"ur Schwerionenforschung GmbH, Darmstadt, Germany (3) Tel + Aviv University, Tel Aviv, Israel (4) Soreq Nuclear Research Center, Yavne, + Israel (5) IFIN-HH/ELI-NP, M\u{a}gurele - Bucharest, Romania (6) Saint Mary's + University, Halifax, Canada (7) University of Jyv\"askyl\"a, Jyv\"askyl\"a, + Finland (8) TRIUMF, Vancouver, Canada) +Categories: nucl-ex +Comments: 11 pages, 9 figures, 3 tables. Submitted to European Physics Journal + A +License: http://arxiv.org/licenses/nonexclusive-distrib/1.0/ +\\ + A novel method for simultaneous measurement of masses, Q-values, isomer +excitation energies, half-lives and decay branching ratios of exotic nuclei has +been demonstrated. The method includes first use of a stopping cell as an ion +trap, combining containment of precursors and decay-recoils for variable +durations in a cryogenic stopping cell (CSC), and afterwards the identification +and counting of them by a multiple-reflection time-of-flight mass spectrometer +(MR-TOF-MS). Feasibility has been established by recording the decay and growth +of $^{216}$Po and $^{212}$Pb (alpha decay) and of $^{119m2}$Sb (t$_{1/2}$ = +850$\pm$90 ms) and $^{119g}$Sb (isomer transition), obtaining half-lives and +branching ratios consistent with literature values. Hardly any +non-nuclear-decay losses have been observed in the CSC for up to $\sim$10 +seconds, which exhibits its extraordinary cleanliness. For $^{119}$Sb, this is +the first direct measurement of the ground and second isomeric state masses, +resolving the discrepancies in previous excitation energy data. These results +pave the way for the measurement of branching ratios of exotic nuclei with +multiple decay channels. 
+\\ diff --git a/tests/data/abs_files/orig/math-ph/papers/0702031v1.abs b/tests/data/abs_files/orig/math-ph/papers/0702031v1.abs deleted file mode 100644 index 5809d6dec..000000000 --- a/tests/data/abs_files/orig/math-ph/papers/0702031v1.abs +++ /dev/null @@ -1,17 +0,0 @@ ------------------------------------------------------------------------------- -\\ -arXiv:math-ph/0702031 -From: Paulo Mendon\c{c}a -Date: Fri, 9 Feb 2007 20:39:05 GMT (8kb) - -Title: Probability Distribution of Curvatures of Isosurfaces in Gaussian Random - Fields -Authors: Paulo R. S. Mendonca, Rahul Bhotika and James V. Miller -Categories: math-ph.MP -Comments: Seven pages, six references -MSC-class: 60D05 -\\ - An expression for the joint probability distribution of the principal -curvatures at an arbitrary point in the ensemble of isosurfaces defined on -isotropic Gaussian random fields on Rn is derived. -\\ diff --git a/tests/data/browse.db b/tests/data/browse.db index 3ca1a9d37..10847cc3e 100644 Binary files a/tests/data/browse.db and b/tests/data/browse.db differ diff --git a/tests/data/daily_stats b/tests/data/daily_stats new file mode 100644 index 000000000..31ea233b8 --- /dev/null +++ b/tests/data/daily_stats @@ -0,0 +1 @@ +total_papers 1234567 diff --git a/tests/data/db/sql/arXiv_in_category.sql b/tests/data/db/sql/arXiv_in_category.sql new file mode 100644 index 000000000..8e691811a --- /dev/null +++ b/tests/data/db/sql/arXiv_in_category.sql @@ -0,0 +1,9 @@ +INSERT INTO `arXiv_in_category` VALUES (1,'cond-mat','stat-mech',0),(1,'math','CO',1),(1,'math','MP',0),(1,'math-ph','',0); +INSERT INTO `arXiv_in_category` VALUES (2,'math','AG',1),(2,'math','NT',0); +INSERT INTO `arXiv_in_category` VALUES (3,'math','DG',1); +INSERT INTO `arXiv_in_category` VALUES (4,'math','CT',0),(4,'math','KT',1); +INSERT INTO `arXiv_in_category` VALUES (5,'gr-qc','',1); +INSERT INTO `arXiv_in_category` VALUES (6,'gr-qc','',1); +INSERT INTO `arXiv_in_category` VALUES 
(7,'cond-mat','mtrl-sci',0),(7,'cond-mat','supr-con',1); +INSERT INTO `arXiv_in_category` VALUES (422353,'cs','IT',1),(422353,'math','IT',0); +INSERT INTO `arXiv_in_category` VALUES (504934,'hep-ex','',1); diff --git a/tests/data/db/sql/arXiv_stats_hourly.sql b/tests/data/db/sql/arXiv_stats_hourly.sql new file mode 100644 index 000000000..9f75799cf --- /dev/null +++ b/tests/data/db/sql/arXiv_stats_hourly.sql @@ -0,0 +1,5 @@ +-- full day's data for 2019-01-012 +INSERT INTO `arXiv_stats_hourly` VALUES ('2019-01-02',0,3,'N',53845),('2019-01-02',0,2,'N',48406),('2019-01-02',0,4,'N',48828),('2019-01-02',0,1,'N',51429),('2019-01-02',0,1,'A',3),('2019-01-02',1,3,'N',56725),('2019-01-02',1,2,'N',53353),('2019-01-02',1,2,'A',3),('2019-01-02',1,4,'N',48812),('2019-01-02',1,1,'N',51958),('2019-01-02',1,1,'A',5),('2019-01-02',2,3,'N',59992),('2019-01-02',2,2,'N',61270),('2019-01-02',2,1,'N',54667),('2019-01-02',2,1,'A',2),('2019-01-02',2,4,'N',50583),('2019-01-02',3,3,'N',66846),('2019-01-02',3,2,'N',63584),('2019-01-02',3,4,'N',62662),('2019-01-02',3,4,'A',2),('2019-01-02',3,1,'N',61056),('2019-01-02',3,1,'A',4),('2019-01-02',4,3,'N',59514),('2019-01-02',4,1,'N',53321),('2019-01-02',4,1,'A',2),('2019-01-02',4,2,'N',63700),('2019-01-02',4,4,'N',55601),('2019-01-02',5,3,'N',62566),('2019-01-02',5,3,'A',2),('2019-01-02',5,1,'N',57549),('2019-01-02',5,1,'A',12),('2019-01-02',5,4,'N',56604),('2019-01-02',5,2,'N',61001),('2019-01-02',5,2,'A',7),('2019-01-02',6,1,'N',56351),('2019-01-02',6,1,'A',5),('2019-01-02',6,3,'N',63059),('2019-01-02',6,4,'N',54324),('2019-01-02',6,2,'N',59236),('2019-01-02',6,2,'A',3),('2019-01-02',7,3,'N',59573),('2019-01-02',7,2,'N',61090),('2019-01-02',7,4,'N',54211),('2019-01-02',7,1,'N',57702),('2019-01-02',7,1,'A',2),('2019-01-02',8,3,'N',70691),('2019-01-02',8,2,'N',72144),('2019-01-02',8,4,'N',73142),('2019-01-02',8,1,'N',32346),('2019-01-02',8,1,'A',2),('2019-01-02',9,3,'N',65412),('2019-01-02',9,4,'N',60758),('2019-01-02',9,1,'N',56908),('2019-01-02'
,9,1,'A',4),('2019-01-02',9,2,'N',65283),('2019-01-02',9,2,'A',2),('2019-01-02',10,3,'N',68411),('2019-01-02',10,4,'N',60375),('2019-01-02',10,1,'N',59069),('2019-01-02',10,1,'A',2),('2019-01-02',10,2,'N',63476),('2019-01-02',11,3,'N',63408),('2019-01-02',11,3,'A',2),('2019-01-02',11,1,'N',54618),('2019-01-02',11,1,'A',19),('2019-01-02',11,4,'N',58673),('2019-01-02',11,2,'N',58308),('2019-01-02',12,3,'N',58050),('2019-01-02',12,1,'N',51337),('2019-01-02',12,1,'A',8),('2019-01-02',12,2,'N',57397),('2019-01-02',12,4,'N',54196),('2019-01-02',13,2,'N',51752),('2019-01-02',13,1,'N',45888),('2019-01-02',13,1,'A',2),('2019-01-02',13,3,'N',52890),('2019-01-02',13,4,'N',52570),('2019-01-02',14,4,'N',47099),('2019-01-02',14,1,'N',42504),('2019-01-02',14,1,'A',2),('2019-01-02',14,3,'N',49769),('2019-01-02',14,2,'N',47754),('2019-01-02',15,3,'N',51613),('2019-01-02',15,1,'N',42363),('2019-01-02',15,1,'A',6),('2019-01-02',15,2,'N',46362),('2019-01-02',15,4,'N',44812),('2019-01-02',16,3,'N',44893),('2019-01-02',16,4,'N',42667),('2019-01-02',16,2,'N',46088),('2019-01-02',16,1,'N',39196),('2019-01-02',16,1,'A',2),('2019-01-02',17,3,'N',42756),('2019-01-02',17,2,'N',44652),('2019-01-02',17,2,'A',4),('2019-01-02',17,1,'N',37109),('2019-01-02',17,1,'A',12),('2019-01-02',17,4,'N',40573),('2019-01-02',17,4,'A',5),('2019-01-02',18,3,'N',39766),('2019-01-02',18,4,'N',35739),('2019-01-02',18,4,'A',3),('2019-01-02',18,1,'N',34420),('2019-01-02',18,1,'A',5),('2019-01-02',18,2,'N',40016),('2019-01-02',19,4,'N',38157),('2019-01-02',19,3,'N',39220),('2019-01-02',19,2,'N',42100),('2019-01-02',19,1,'N',34453),('2019-01-02',19,1,'A',2),('2019-01-02',20,4,'N',53578),('2019-01-02',20,3,'N',58880),('2019-01-02',20,2,'N',61952),('2019-01-02',20,1,'N',56445),('2019-01-02',20,1,'A',2),('2019-01-02',21,4,'N',59692),('2019-01-02',21,2,'N',62295),('2019-01-02',21,2,'A',2843),('2019-01-02',21,3,'N',69497),('2019-01-02',21,1,'N',63502),('2019-01-02',21,1,'A',3116),('2019-01-02',22,4,'N',48579),('2019-01-02',
22,2,'N',53989),('2019-01-02',22,3,'N',55692),('2019-01-02',22,1,'N',54425),('2019-01-02',22,1,'A',2570),('2019-01-02',23,4,'N',46401),('2019-01-02',23,4,'A',327),('2019-01-02',23,2,'N',48631),('2019-01-02',23,2,'A',1521),('2019-01-02',23,3,'N',50670),('2019-01-02',23,1,'N',51057),('2019-01-02',23,1,'A',1852); + +-- partial day's data for 2019-03-19 +INSERT INTO `arXiv_stats_hourly` VALUES ('2019-03-19',0,4,'N',52840),('2019-03-19',0,4,'A',3),('2019-03-19',0,2,'N',53704),('2019-03-19',0,3,'N',55367),('2019-03-19',0,3,'A',276),('2019-03-19',0,1,'N',63019),('2019-03-19',0,1,'A',6035),('2019-03-19',1,3,'N',57338),('2019-03-19',1,3,'A',667),('2019-03-19',1,2,'N',56893),('2019-03-19',1,4,'N',58308),('2019-03-19',1,4,'A',209),('2019-03-19',1,1,'N',69393),('2019-03-19',1,1,'A',7040),('2019-03-19',2,3,'N',65608),('2019-03-19',2,3,'A',282),('2019-03-19',2,2,'N',64133),('2019-03-19',2,4,'N',65662),('2019-03-19',2,4,'A',617),('2019-03-19',2,1,'N',78694),('2019-03-19',2,1,'A',3468),('2019-03-19',3,4,'N',41070),('2019-03-19',3,4,'A',163),('2019-03-19',3,2,'N',75674),('2019-03-19',3,2,'A',68),('2019-03-19',3,3,'N',74601),('2019-03-19',3,3,'A',540),('2019-03-19',3,1,'N',82732),('2019-03-19',3,1,'A',1012),('2019-03-19',4,4,'N',82631),('2019-03-19',4,4,'A',641),('2019-03-19',4,3,'N',82636),('2019-03-19',4,3,'A',709),('2019-03-19',4,2,'N',74236),('2019-03-19',4,1,'N',81817),('2019-03-19',4,1,'A',1339),('2019-03-19',5,1,'N',83531),('2019-03-19',5,1,'A',502),('2019-03-19',5,4,'N',80857),('2019-03-19',5,4,'A',303),('2019-03-19',5,3,'N',80824),('2019-03-19',5,3,'A',198),('2019-03-19',5,2,'N',78596),('2019-03-19',6,1,'N',75874),('2019-03-19',6,1,'A',977),('2019-03-19',6,3,'N',76941),('2019-03-19',6,2,'N',74382),('2019-03-19',6,4,'N',75905),('2019-03-19',7,2,'N',75759),('2019-03-19',7,1,'N',82056),('2019-03-19',7,1,'A',1908),('2019-03-19',7,3,'N',77887),('2019-03-19',7,4,'N',79405),('2019-03-19',8,1,'N',89503),('2019-03-19',8,1,'A',1975),('2019-03-19',8,2,'N',79577),('2019-03-19',8,4,'N',8
8666),('2019-03-19',8,3,'N',84634); diff --git a/tests/data/db/sql/arXiv_stats_monthly_downloads.sql b/tests/data/db/sql/arXiv_stats_monthly_downloads.sql new file mode 100644 index 000000000..eb9d7f18f --- /dev/null +++ b/tests/data/db/sql/arXiv_stats_monthly_downloads.sql @@ -0,0 +1 @@ +INSERT INTO `arXiv_stats_monthly_downloads` VALUES ('1994-01-01',0),('1994-02-01',1863),('1994-03-01',7048),('1994-04-01',6814),('1994-05-01',9601),('1994-06-01',11007),('1994-07-01',9488),('1994-08-01',10789),('1994-09-01',12872),('1994-10-01',16260),('1994-11-01',22392),('1994-12-01',20070),('1995-01-01',24801),('1995-02-01',28607),('1995-03-01',35418),('1995-04-01',34109),('1995-05-01',35232),('1995-06-01',52415),('1995-07-01',68341),('1995-08-01',95848),('1995-09-01',114550),('1995-10-01',146524),('1995-11-01',151624),('1995-12-01',132908),('1996-01-01',160742),('1996-02-01',186528),('1996-03-01',193169),('1996-04-01',198175),('1996-05-01',221716),('1996-06-01',213072),('1996-07-01',230395),('1996-08-01',222063),('1996-09-01',251249),('1996-10-01',287532),('1996-11-01',263905),('1996-12-01',225928),('1997-01-01',250816),('1997-02-01',230357),('1997-03-01',239669),('1997-04-01',258302),('1997-05-01',267683),('1997-06-01',261706),('1997-07-01',270550),('1997-08-01',239698),('1997-09-01',287158),('1997-10-01',326150),('1997-11-01',296375),('1997-12-01',288487),('1998-01-01',321044),('1998-02-01',337514),('1998-03-01',407658),('1998-04-01',384583),('1998-05-01',409663),('1998-06-01',404232),('1998-07-01',412267),('1998-08-01',382223),('1998-09-01',444628),('1998-10-01',510790),('1998-11-01',537888),('1998-12-01',474617),('1999-01-01',508224),('1999-02-01',464452),('1999-03-01',511665),('1999-04-01',510014),('1999-05-01',536821),('1999-06-01',547141),('1999-07-01',544712),('1999-08-01',491688),('1999-09-01',568449),('1999-10-01',582934),('1999-11-01',665411),('1999-12-01',627284),('2000-01-01',640945),('2000-02-01',725616),('2000-03-01',751154),('2000-04-01',697884),('2000-05-01',78
5650),('2000-06-01',745325),('2000-07-01',728494),('2000-08-01',721467),('2000-09-01',724899),('2000-10-01',828870),('2000-11-01',925260),('2000-12-01',844998),('2001-01-01',935314),('2001-02-01',972075),('2001-03-01',1055737),('2001-04-01',1013386),('2001-05-01',1059624),('2001-06-01',902582),('2001-07-01',846039),('2001-08-01',865395),('2001-09-01',834963),('2001-10-01',969584),('2001-11-01',966553),('2001-12-01',1145430),('2002-01-01',1034782),('2002-02-01',982129),('2002-03-01',1175829),('2002-04-01',1321376),('2002-05-01',1423935),('2002-06-01',1236836),('2002-07-01',1348311),('2002-08-01',1270201),('2002-09-01',1516847),('2002-10-01',1675586),('2002-11-01',1515041),('2002-12-01',1343960),('2003-01-01',1588531),('2003-02-01',1572793),('2003-03-01',1711279),('2003-04-01',1620843),('2003-05-01',1703389),('2003-06-01',1601569),('2003-07-01',1586144),('2003-08-01',1504793),('2003-09-01',1820701),('2003-10-01',2117036),('2003-11-01',2033657),('2003-12-01',2119846),('2004-01-01',2109079),('2004-02-01',2293511),('2004-03-01',2221885),('2004-04-01',2100595),('2004-05-01',2301904),('2004-06-01',2036838),('2004-07-01',1986919),('2004-08-01',1997330),('2004-09-01',2191867),('2004-10-01',1927789),('2004-11-01',1978766),('2004-12-01',1767095),('2005-01-01',1804435),('2005-02-01',1803019),('2005-03-01',2068684),('2005-04-01',2251800),('2005-05-01',2136947),('2005-06-01',1951444),('2005-07-01',1852432),('2005-08-01',1852131),('2005-09-01',2120662),('2005-10-01',2346256),('2005-11-01',2406316),('2005-12-01',2360494),('2006-01-01',1701784),('2006-02-01',1737794),('2006-03-01',2298683),('2006-04-01',1997694),('2006-05-01',2214161),('2006-06-01',1801721),('2006-07-01',1401479),('2006-08-01',1629104),('2006-09-01',1768658),('2006-10-01',1982461),('2006-11-01',1930368),('2006-12-01',1663348),('2007-01-01',2029217),('2007-02-01',2066818),('2007-03-01',2134854),('2007-04-01',1979950),('2007-05-01',2224762),('2007-06-01',2224194),('2007-07-01',2051276),('2007-08-01',1822372),('2007-09
-01',1998723),('2007-10-01',2446094),('2007-11-01',2531762),('2007-12-01',2159429),('2008-01-01',2665839),('2008-02-01',2557077),('2008-03-01',2848757),('2008-04-01',2810180),('2008-05-01',2958761),('2008-06-01',2706793),('2008-07-01',2730226),('2008-08-01',2450019),('2008-09-01',3278451),('2008-10-01',2937969),('2008-11-01',3066092),('2008-12-01',2638793),('2009-01-01',2598159),('2009-02-01',2644797),('2009-03-01',3104692),('2009-04-01',2837273),('2009-05-01',2877553),('2009-06-01',2792334),('2009-07-01',2667955),('2009-08-01',2538405),('2009-09-01',2907441),('2009-10-01',3167801),('2009-11-01',2912668),('2009-12-01',2723394),('2010-01-01',3116867),('2010-02-01',2992581),('2010-03-01',3459004),('2010-04-01',3292459),('2010-05-01',3293995),('2010-06-01',3335937),('2010-07-01',3098569),('2010-08-01',2879133),('2010-09-01',3144001),('2010-10-01',3684560),('2010-11-01',3829250),('2010-12-01',3775736),('2011-01-01',3439915),('2011-02-01',3339054),('2011-03-01',4117851),('2011-04-01',3659749),('2011-05-01',3997359),('2011-06-01',3707592),('2011-07-01',3771042),('2011-08-01',3774913),('2011-09-01',4973988),('2011-10-01',5563412),('2011-11-01',4962036),('2011-12-01',4433059),('2012-01-01',4796243),('2012-02-01',5078432),('2012-03-01',5516536),('2012-04-01',5423005),('2012-05-01',5067807),('2012-06-01',5073183),('2012-07-01',5624113),('2012-08-01',6040066),('2012-09-01',5372689),('2012-10-01',5904272),('2012-11-01',5373061),('2012-12-01',4549990),('2013-01-01',5665096),('2013-02-01',5136321),('2013-03-01',6121433),('2013-04-01',5552584),('2013-05-01',6184081),('2013-06-01',5033496),('2013-07-01',5251361),('2013-08-01',4985314),('2013-09-01',5319428),('2013-10-01',6146267),('2013-11-01',5953050),('2013-12-01',5464673),('2014-01-01',6701768),('2014-02-01',6175371),('2014-03-01',7371741),('2014-04-01',7207160),('2014-05-01',6814576),('2014-06-01',6650843),('2014-07-01',6934640),('2014-08-01',6368537),('2014-09-01',7129287),('2014-10-01',9150599),('2014-11-01',10771479),('2014-
12-01',9877539),('2015-01-01',10211128),('2015-02-01',10375127),('2015-03-01',12826855),('2015-04-01',11289672),('2015-05-01',11182540),('2015-06-01',12568030),('2015-07-01',9585300),('2015-08-01',9180739),('2015-09-01',10741431),('2015-10-01',13882451),('2015-11-01',14200441),('2015-12-01',10286698),('2016-01-01',16227193),('2016-02-01',14547141),('2016-03-01',16144490),('2016-04-01',14330220),('2016-05-01',14042457),('2016-06-01',14344946),('2016-07-01',12885660),('2016-08-01',11795801),('2016-09-01',13958435),('2016-10-01',11202303),('2016-11-01',11492492),('2016-12-01',12136801),('2017-01-01',12150875),('2017-02-01',12870204),('2017-03-01',14319435),('2017-04-01',15628477),('2017-05-01',14894937),('2017-06-01',14372378),('2017-07-01',14808475),('2017-08-01',15915733),('2017-09-01',16999890),('2017-10-01',19433648),('2017-11-01',18885644),('2017-12-01',17361253),('2018-01-01',18445580),('2018-02-01',17704444),('2018-03-01',20149872),('2018-04-01',19709932),('2018-05-01',19710718),('2018-06-01',18088369),('2018-07-01',18132320),('2018-08-01',17906302),('2018-09-01',17418053),('2018-10-01',19443085),('2018-11-01',23040835),('2018-12-01',18694165),('2019-01-01',19281479); diff --git a/tests/data/db/sql/arXiv_stats_monthly_submissions.sql b/tests/data/db/sql/arXiv_stats_monthly_submissions.sql new file mode 100644 index 000000000..23932c0d6 --- /dev/null +++ b/tests/data/db/sql/arXiv_stats_monthly_submissions.sql @@ -0,0 +1 @@ +INSERT INTO `arXiv_stats_monthly_submissions` VALUES 
('1991-07-01',2,-2),('1991-08-01',28,-1),('1991-09-01',58,0),('1991-10-01',76,0),('1991-11-01',64,0),('1991-12-01',78,0),('1992-01-01',193,-105),('1992-02-01',134,-10),('1992-03-01',120,-3),('1992-04-01',225,-41),('1992-05-01',237,-11),('1992-06-01',237,-5),('1992-07-01',296,-22),('1992-08-01',227,-4),('1992-09-01',336,-21),('1992-10-01',401,-29),('1992-11-01',453,-16),('1992-12-01',404,-10),('1993-01-01',400,-36),('1993-02-01',428,-16),('1993-03-01',504,-9),('1993-04-01',505,-17),('1993-05-01',538,-7),('1993-06-01',536,-16),('1993-07-01',643,-35),('1993-08-01',541,-25),('1993-09-01',518,-16),('1993-10-01',678,-36),('1993-11-01',714,-16),('1993-12-01',738,-15),('1994-01-01',610,-28),('1994-02-01',653,-19),('1994-03-01',753,-24),('1994-04-01',738,-39),('1994-05-01',841,-18),('1994-06-01',888,-32),('1994-07-01',863,-23),('1994-08-01',766,-20),('1994-09-01',884,-33),('1994-10-01',932,-23),('1994-11-01',1114,-39),('1994-12-01',1055,-33),('1995-01-01',933,-36),('1995-02-01',977,-36),('1995-03-01',1168,-37),('1995-04-01',913,-31),('1995-05-01',1110,-22),('1995-06-01',1195,-27),('1995-07-01',1017,-25),('1995-08-01',1030,-22),('1995-09-01',1133,-29),('1995-10-01',1229,-23),('1995-11-01',1162,-28),('1995-12-01',1147,-37),('1996-01-01',1051,-22),('1996-02-01',1076,-23),('1996-03-01',1169,-23),('1996-04-01',1220,-49),('1996-05-01',1358,-42),('1996-06-01',1311,-34),('1996-07-01',1424,-32),('1996-08-01',1394,-19),('1996-09-01',1474,-21),('1996-10-01',1520,-30),('1996-11-01',1402,-27),('1996-12-01',1467,-33),('1997-01-01',1308,-31),('1997-02-01',1362,-34),('1997-03-01',1411,-24),('1997-04-01',1468,-34),('1997-05-01',1581,-32),('1997-06-01',1709,-31),('1997-07-01',1804,-41),('1997-08-01',1426,-15),('1997-09-01',1882,-37),('1997-10-01',2007,-26),('1997-11-01',1717,-32),('1997-12-01',1949,-31),('1998-01-01',1723,-11),('1998-02-01',1668,0),('1998-03-01',1911,-3),('1998-04-01',1747,-12),('1998-05-01',1918,-2),('1998-06-01',2089,-6),('1998-07-01',2101,-12),('1998-08-01',1826,-6),('1998
-09-01',2414,-9),('1998-10-01',2330,-33),('1998-11-01',2242,-14),('1998-12-01',2203,-7),('1999-01-01',1850,-23),('1999-02-01',1919,-3),('1999-03-01',2394,-15),('1999-04-01',2156,-9),('1999-05-01',2221,-17),('1999-06-01',2427,-5),('1999-07-01',2414,-23),('1999-08-01',2139,-1),('1999-09-01',2502,-12),('1999-10-01',2571,-6),('1999-11-01',2518,-29),('1999-12-01',2593,-6),('2000-01-01',2365,-6),('2000-02-01',2366,0),('2000-03-01',2600,-9),('2000-04-01',2077,-9),('2000-05-01',2727,-9),('2000-06-01',2439,-3),('2000-07-01',2461,-16),('2000-08-01',2613,-1),('2000-09-01',2522,-7),('2000-10-01',2913,-3),('2000-11-01',2864,-11),('2000-12-01',2654,-1),('2001-01-01',2591,-14),('2001-02-01',2429,-3),('2001-03-01',2713,-8),('2001-04-01',2606,-6),('2001-05-01',2912,-4),('2001-06-01',2871,-5),('2001-07-01',2760,-3),('2001-08-01',2422,0),('2001-09-01',2548,-3),('2001-10-01',3404,-4),('2001-11-01',3240,-1),('2001-12-01',2718,-19),('2002-01-01',2716,-4),('2002-02-01',2567,-2),('2002-03-01',2666,-3),('2002-04-01',2853,-4),('2002-05-01',3089,-2),('2002-06-01',2695,0),('2002-07-01',3269,-6),('2002-08-01',2725,0),('2002-09-01',3314,-2),('2002-10-01',3531,0),('2002-11-01',3440,-3),('2002-12-01',3256,-35),('2003-01-01',2926,0),('2003-02-01',2881,0),('2003-03-01',3018,0),('2003-04-01',3102,0),('2003-05-01',3267,0),('2003-06-01',3463,0),('2003-07-01',3418,0),('2003-08-01',2741,0),('2003-09-01',3747,-12),('2003-10-01',3841,0),('2003-11-01',3349,0),('2003-12-01',3661,0),('2004-01-01',3071,0),('2004-02-01',3277,-1),('2004-03-01',3603,0),('2004-04-01',3369,-1),('2004-05-01',3553,0),('2004-06-01',3710,0),('2004-07-01',3679,0),('2004-08-01',3294,0),('2004-09-01',3939,0),('2004-10-01',4086,0),('2004-11-01',4152,0),('2004-12-01',3994,0),('2005-01-01',3493,0),('2005-02-01',3250,0),('2005-03-01',3881,0),('2005-04-01',3693,0),('2005-05-01',3779,0),('2005-06-01',3984,0),('2005-07-01',3839,0),('2005-08-01',3775,0),('2005-09-01',4345,-16),('2005-10-01',4439,0),('2005-11-01',4294,0),('2005-12-01',4083,0),('20
06-01-01',3858,0),('2006-02-01',3520,0),('2006-03-01',4213,0),('2006-04-01',3474,0),('2006-05-01',4204,0),('2006-06-01',4139,0),('2006-07-01',4197,0),('2006-08-01',4064,0),('2006-09-01',4275,0),('2006-10-01',5133,0),('2006-11-01',4854,0),('2006-12-01',4296,0),('2007-01-01',4653,0),('2007-02-01',4164,0),('2007-03-01',4493,0),('2007-04-01',4003,0),('2007-05-01',4684,0),('2007-06-01',4484,0),('2007-07-01',4681,0),('2007-08-01',4414,0),('2007-09-01',4682,0),('2007-10-01',5945,0),('2007-11-01',5029,0),('2007-12-01',4406,0),('2008-01-01',4970,0),('2008-02-01',4463,0),('2008-03-01',4519,0),('2008-04-01',4898,0),('2008-05-01',4836,0),('2008-06-01',4980,0),('2008-07-01',5139,0),('2008-08-01',4160,0),('2008-09-01',5287,0),('2008-10-01',5773,0),('2008-11-01',4774,0),('2008-12-01',5116,0),('2009-01-01',4975,0),('2009-02-01',4903,0),('2009-03-01',5547,0),('2009-04-01',4930,0),('2009-05-01',4955,0),('2009-06-01',5614,0),('2009-07-01',5606,0),('2009-08-01',4597,0),('2009-09-01',5696,0),('2009-10-01',5957,0),('2009-11-01',5730,0),('2009-12-01',5537,0),('2010-01-01',5471,0),('2010-02-01',5048,0),('2010-03-01',6133,0),('2010-04-01',5602,0),('2010-05-01',5737,0),('2010-06-01',5963,0),('2010-07-01',5521,0),('2010-08-01',5399,0),('2010-09-01',6232,0),('2010-10-01',6304,0),('2010-11-01',6676,0),('2010-12-01',6045,0),('2011-01-01',6081,0),('2011-02-01',5774,0),('2011-03-01',6286,0),('2011-04-01',5711,0),('2011-05-01',6374,0),('2011-06-01',6357,0),('2011-07-01',6046,0),('2011-08-01',6331,0),('2011-09-01',6937,0),('2011-10-01',6930,0),('2011-11-01',7319,0),('2011-12-01',6432,0),('2012-01-01',6687,0),('2012-02-01',6685,0),('2012-03-01',6903,0),('2012-04-01',6739,0),('2012-05-01',7092,0),('2012-06-01',7121,0),('2012-07-01',7358,0),('2012-08-01',6592,0),('2012-09-01',6630,0),('2012-10-01',8452,0),('2012-11-01',7370,0),('2012-12-01',6974,0),('2013-01-01',7750,0),('2013-02-01',7317,0),('2013-03-01',7476,0),('2013-04-01',8135,0),('2013-05-01',7516,0),('2013-06-01',6944,0),('2013-07-01',8447,0),('
2013-08-01',6833,0),('2013-09-01',7995,0),('2013-10-01',8658,0),('2013-11-01',7692,0),('2013-12-01',7878,0),('2014-01-01',8294,0),('2014-02-01',7374,0),('2014-03-01',8154,0),('2014-04-01',7855,0),('2014-05-01',7975,0),('2014-06-01',7874,0),('2014-07-01',8549,0),('2014-08-01',7119,0),('2014-09-01',8676,0),('2014-10-01',8871,0),('2014-11-01',8006,0),('2014-12-01',8770,0),('2015-01-01',7912,0),('2015-02-01',8054,0),('2015-03-01',9191,0),('2015-04-01',8367,0),('2015-05-01',8172,0),('2015-06-01',9217,0),('2015-07-01',8995,0),('2015-08-01',7983,0),('2015-09-01',9318,0),('2015-10-01',9223,0),('2015-11-01',9472,0),('2015-12-01',9376,0),('2016-01-01',8251,0),('2016-02-01',9142,0),('2016-03-01',9746,0),('2016-04-01',8948,0),('2016-05-01',9792,0),('2016-06-01',9644,0),('2016-07-01',8911,0),('2016-08-01',9016,0),('2016-09-01',9869,0),('2016-10-01',10100,0),('2016-11-01',10362,0),('2016-12-01',9599,0),('2017-01-01',9186,0),('2017-02-01',8910,0),('2017-03-01',11008,0),('2017-04-01',9029,0),('2017-05-01',11194,0),('2017-06-01',10297,0),('2017-07-01',9980,0),('2017-08-01',9854,0),('2017-09-01',10517,0),('2017-10-01',11627,0),('2017-11-01',11589,0),('2017-12-01',10332,0),('2018-01-01',10609,0),('2018-02-01',10593,0),('2018-03-01',11560,0),('2018-04-01',11352,0),('2018-05-01',12595,0),('2018-06-01',11568,0),('2018-07-01',11938,0),('2018-08-01',10870,0),('2018-09-01',11171,0),('2018-10-01',13446,0),('2018-11-01',12941,0),('2018-12-01',11973,0),('2019-01-01',11537,0); diff --git a/tests/data/db/sql/arXiv_trackback_pings.sql b/tests/data/db/sql/arXiv_trackback_pings.sql index ba0b01357..8fd0575f8 100644 --- a/tests/data/db/sql/arXiv_trackback_pings.sql +++ b/tests/data/db/sql/arXiv_trackback_pings.sql @@ -1,3 +1,4 @@ +-- 0808.4142 INSERT INTO `arXiv_trackback_pings` VALUES (695736,504934,'Omega b: the new baryon nailed by D0 « A Quantum Diaries Survivor',' pulling this new result off! You can read the details of the analysis in this paper. 
','http://dorigo.wordpress.com/2008/09/19/omega-b-the-new-baryon-nailed-by-d0/','','arxiv1.library.cornell.edu','128.84.158.114',1221882921,0,0,1221883202,'accepted',61); INSERT INTO `arXiv_trackback_pings` VALUES (751383,504934,'Nit-Picking On The Omega_b Discovery','quark and two strange quarks- has been recently published by the DZERO collaboration. Their paper claims to observe the so-far-unseen','http://www.scientificblogging.com/quantum_diaries_survivor/nitpicking_omega_b_discovery','A Quantum Diaries Survivor','arxiv1.library.cornell.edu','128.84.158.114',1243482527,0,0,1243484559,'accepted',NULL); INSERT INTO `arXiv_trackback_pings` VALUES (751384,504934,'Nit-Picking On The Omega_b Baryon: Part II','the pains to check the calculation hinted at on the scientific paper where the discovery is claimed, which was sent to the ','http://www.scientificblogging.com/quantum_diaries_survivor/nitpicking_omega_b_baryon_part_ii','A Quantum Diaries Survivor','arxiv1.library.cornell.edu','128.84.158.114',1243482588,0,0,1243484559,'accepted',NULL); @@ -12,5 +13,4 @@ INSERT INTO `arXiv_trackback_pings` VALUES (1,504934,'More On The Omega B Baryon -- This next one is an intentional duplicate, but with different url, and not accepted INSERT INTO `arXiv_trackback_pings` VALUES (2,504934,'Foo baz','\"Observation of the','http://en.wikipedia.org/wiki/Zevatron','Wikipedia','arxiv1.library.cornell.edu','128.84.158.114',1228752458,0,0,0,'rejected',NULL); -INSERT INTO `arXiv_trackback_pings` VALUES (99999999,99999998,'Nothing here','Really nothing here ','http://www.example.org','On Nothing','arxiv1.library.cornell.edu','128.84.158.114',1258488297,0,0,1248558336,'accepted',NULL); - +INSERT INTO `arXiv_trackback_pings` VALUES (99999999,504934,'Nothing here','Really nothing here ','http://www.example.org','On Nothing','arxiv1.library.cornell.edu','128.84.158.114',1258488294,0,0,1248558336,'accepted',NULL); diff --git a/tests/legacy_comparison/archive_config.py 
b/tests/legacy_comparison/archive_config.py new file mode 100644 index 000000000..964ccf083 --- /dev/null +++ b/tests/legacy_comparison/archive_config.py @@ -0,0 +1,12 @@ +from response_comparisons import compare_status + +res_comparisons = [ compare_status ] +text_comparisons = [] +html_comparisons = [] + +def ng_id_to_url_fn(id:str)->str: + return 'http://localhost:5000/archive/' + id + +def legacy_id_to_url_fn(id:str)->str: + return 'https://arxiv.org/archive/' + id + diff --git a/tests/legacy_comparison/archives.txt b/tests/legacy_comparison/archives.txt new file mode 100644 index 000000000..df3f31942 --- /dev/null +++ b/tests/legacy_comparison/archives.txt @@ -0,0 +1,20 @@ +astro-ph +cond-mat +gr-qc +hep-ex +hep-lat +hep-ph +hep-th +math-ph +nlin +nucl-ex +nucl-th +physics +quant-ph +math +cs +q-bio +q-fin +stat +eess +econ diff --git a/tests/legacy_comparison/html_comparisons.py b/tests/legacy_comparison/html_comparisons.py index 9d7a5eda7..f130db375 100644 --- a/tests/legacy_comparison/html_comparisons.py +++ b/tests/legacy_comparison/html_comparisons.py @@ -1,10 +1,10 @@ -from functools import partial +from functools import partial, update_wrapper from typing import Callable, List import re -from tests.legacy_comparison.abstract_comparisons import lev_similarity -from tests.legacy_comparison.comparison_types import html_arg_dict, BadResult +from abstract_comparisons import lev_similarity +from comparison_types import html_arg_dict, BadResult from bs4 import BeautifulSoup, element @@ -15,7 +15,9 @@ def html_similarity(html_arg: html_arg_dict) -> BadResult: html_arg['legacy_html'].prettify() ) if sim < 0.69: - return BadResult(html_arg['paper_id'], f"html_pretty_sim for {html_arg['paper_id']} = {sim}") + return BadResult(html_arg['id'], + "html_similarity", + f"html_pretty_sim for {html_arg['id']} = {sim}") return None @@ -34,7 +36,7 @@ def to_label(tr): return tr.find('td', 'label').contents if ng_labels == legacy_labels: return None else: - return 
BadResult(html_arg['paper_id'], + return BadResult(html_arg['id'], "Metadata field included on NG do not match those from legacy" + f"NG: {ng_labels} Legacy: {legacy_labels}") @@ -69,14 +71,14 @@ def _element_similarity(name: str, if required: if len(ng) == 0 and len(legacy) == 0: - return BadResult(html_arg['paper_id'], name, - f"Missing field {name} for {html_arg['paper_id']} from NG and Legacy") + return BadResult(html_arg['id'], name, + f"Missing field {name} for {html_arg['id']} from NG and Legacy") if len(ng) == 0: - return BadResult(html_arg['paper_id'], name, - f"Missing field {name} for {html_arg['paper_id']} from NG") + return BadResult(html_arg['id'], name, + f"Missing field {name} for {html_arg['id']} from NG") if len(legacy) == 0: - return BadResult(html_arg['paper_id'], name, - f"Missing field {name} for {html_arg['paper_id']} from legacy") + return BadResult(html_arg['id'], name, + f"Missing field {name} for {html_arg['id']} from legacy") if check_counts and (len(legacy) != len(ng)): if ng: @@ -88,8 +90,8 @@ def _element_similarity(name: str, else: legacy_ele_txt = 'MISSING' - return BadResult(html_arg['paper_id'], name, - f"bad counts for {name} for {html_arg['paper_id']} ng: {len(ng)} legacy: {len(legacy)}", + return BadResult(html_arg['id'], name, + f"bad counts for {name} for {html_arg['id']} ng: {len(ng)} legacy: {len(legacy)}", legacy_ele_txt, ng_ele_txt) ng_ele_txt = '' @@ -102,9 +104,11 @@ def _element_similarity(name: str, if sim < min_sim: msg = f"Elements did not meet min similarity of {min_sim}" - return BadResult(html_arg['paper_id'], name, msg, legacy_ele_txt, + return BadResult(html_arg['id'], name, msg, legacy_ele_txt, ng_ele_txt, sim) - return None + + msg = f"GOOD: Elements did meet min similarity of {min_sim}" + return BadResult(html_arg['id'], name, msg, '','', sim) else: if not required: return None @@ -116,7 +120,7 @@ def _element_similarity(name: str, msg = 'zero elements detected: ' \ + f'legacy length was {len(legacy)}; ng 
length was {len(ng)} ' - return BadResult(html_arg['paper_id'], name, msg, legacy_ele_txt, + return BadResult(html_arg['id'], name, msg, legacy_ele_txt, ng_ele_txt, 0.0) @@ -197,3 +201,32 @@ def ex_strip(eles: List[BeautifulSoup]): bookmarks_similarity = partial(_element_similarity, 'extra bookmarks div', lambda bs: ex_strip(bs.select('.bookmarks')), 0.9, False, True) + +################# /archive checks ################################ + +archive_h1_similarity = partial(_element_similarity, 'top heading', + lambda bs: ex_strip(bs.select('#content > h1')), + 0.99, True, True) + +archive_browse = partial(_element_similarity, 'browse', + lambda bs: ex_strip(bs.select('#content > ul > li:nth-child(1)')), + 0.99, True, True) + +archive_catchup = partial(_element_similarity, 'archive catchup', + lambda bs: ex_strip(bs.select('#content > ul > li:nth-child(2)')), + 0.99, True, True) + +archive_search= partial(_element_similarity, 'archive_search', + lambda bs: ex_strip(bs.select('#content > ul > li:nth-child(3)')), + 0.99, True, True) + +archive_by_year= partial(_element_similarity, 'archive_by_year', + lambda bs: ex_strip(bs.select('#content > ul > li:nth-child(4)')), + 0.99, True, True) + + +archive_bogus= partial(_element_similarity, 'bogus_should_fail', + lambda bs: ex_strip(bs.select('.bogusClass')), + 0.99, True, True) + + diff --git a/tests/legacy_comparison/page_comparison.py b/tests/legacy_comparison/page_comparison.py new file mode 100644 index 000000000..de577dad1 --- /dev/null +++ b/tests/legacy_comparison/page_comparison.py @@ -0,0 +1,295 @@ +import argparse +import itertools +import sys +import traceback +import os +import re +from functools import partial +import multiprocessing_on_dill as mp +from typing import Callable, Iterator, List, Set, Tuple, Dict, Any +import gzip +import logging +import json + +import requests +from bs4 import BeautifulSoup + +# BDC34: some how I need this under pipenv to get to browse, not sure why +sys.path.append('') 
+sys.setrecursionlimit(10000) + +from comparison_types import res_comparison_fn, \ + text_comparison_fn, html_comparison_fn, res_arg_dict, text_arg_dict, \ + html_arg_dict, BadResult +from html_comparisons import html_similarity, metadata_fields_similarity, \ + archive_h1_similarity,archive_catchup,archive_search,archive_by_year,archive_browse,\ + archive_bogus + +from response_comparisons import compare_status +from text_comparisons import text_similarity + +import archive_config + + +""" Script to compare pages from NG and beta.arxiv.org + +To run this I do: +Open terminal: +cd arxiv-browse +pipenv sync +FLASK_APP=app.py FLASK_DEBUG=1 pipenv run flask run + +In another terminal: +cd arxiv-browse +pipenv sync +pipenv shell +python tests/legacy_comparison/page_comparison.py + +To reset the analysis to start over, add the `--reset` arg. +To run a short test add '--short' arg. + +To skip ancillary file comparisons: '--skip-ancillary' + +Improvements: + Better reporting format, right now the comparisons produce just strings. +""" + +logging.basicConfig(filename="page_comparison.log", level=logging.DEBUG) + +#This just renames None nicely, they are tests that passed +SUCCESS = None + +LOG_FILE_NAME = 'legacy_comparison.org' + + +# TODO abstract this or move this out +VISITED_FILE_NAME = 'visited.log' + +def ident(x): + return x + +configs = { + 'archive': { # I'd like to do something with modules but it doesn't pickle. 
+ 'comparisons': [compare_status, + html_similarity, + archive_h1_similarity, + archive_browse, + archive_catchup, + archive_search, + archive_by_year, + ], + 'ng_id_to_url_fn': archive_config.ng_id_to_url_fn, + 'legacy_id_to_url_fn': archive_config.legacy_id_to_url_fn, + 'ng_txt_trans_fn': ident, + 'legacy_txt_trans_fn': ident, + } +} + +# id file is one id per line +def _id_generator_from_file(path: str, excluded: List[str])->Iterator[str]: + if 'gzip' in path or 'gz' in path: + with gzip.open(path, 'rt') as f: + for line in f: + aid = line.strip() + if aid not in excluded: + logging.debug(f'yielding id {aid}') + yield aid + else: + with open(path, 'rt') as f: + for line in f: + aid = line.strip() + if aid not in excluded: + logging.debug(f'yielding id {aid}') + yield aid + +#TODO generalize +# Should end with / +#ng_abs_base_url = 'http://localhost:5000/abs/' +#ng_abs_base_url = 'https://beta.arxiv.org/abs/' + +#TODO generalize +# Should end with / +#legacy_abs_base_url = 'https://beta.arxiv.org/abs_classic/' + +def fetch_pages(config: Dict, id: str) ->Dict: + """Fetch NG and Legacy.""" + ng_url = config['ng_id_to_url_fn'](id) + legacy_url = config['legacy_id_to_url_fn'](id) + + headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'} + ng_res = requests.get(ng_url, headers=headers) + legacy_res = requests.get(legacy_url, headers=headers) + + ng_text = config['ng_txt_trans_fn'](ng_res.text) + legacy_text = config['ng_txt_trans_fn'](legacy_res.text) + + res_dict: res_arg_dict = { + 'id': id, + 'ng_url': ng_url, + 'legacy_url': legacy_url, + } + + # to avoid json problems + with_html = { + **res_dict, + 'ng_res': ng_res, + 'legacy_res': legacy_res, + 'ng_text': ng_text, + 'legacy_text': legacy_text, + 'ng_html': BeautifulSoup(ng_text, 'html.parser'), + 'legacy_html': BeautifulSoup(legacy_text, 'html.parser'), + } + return (res_dict, compare_response(config, + with_html)) + 
+ +def compare_response(config: Dict, + res_args: res_arg_dict) -> Iterator[BadResult]: + """Do the response comparisions, the kick off the text comparisions.""" +# protected_comps = [protect(fn) for fn in config['comparisons']] + protected_comps = config['comparisons'] + logging.debug(f"about to do compares for {res_args['id']}") + return filter(SUCCESS, [fn(res_args) for fn in protected_comps]) + + +def multi_ws_to_single_ws(txt:str)->str: + return ' '.join(txt.split()) #white space to single spaces + + +def main() -> None: + parser = argparse.ArgumentParser(description='Compare ng pages to legacy pages') + parser.add_argument('--idfile', default=False, ) + parser.add_argument('--reset', default=False, const=True, + action='store_const', dest='reset') + parser.add_argument('--short', default=False, const=True, + action='store_const', dest='short') + parser.add_argument('--config', default=False) + args = parser.parse_args() + + print('Starting config') + if args.config not in configs.keys(): + raise ValueError(f"No config named '{args.config}' choose one of [{' '.join(configs.keys())}]") + else: + print(f'Using config {args.config}') + active_config = configs[args.config] + print('done with config') + + visited: Set[str] = [] + if args.reset: + print('Restarting analysis and deleting logs!') + if os.path.exists(LOG_FILE_NAME): + os.remove(LOG_FILE_NAME) + if os.path.exists(VISITED_FILE_NAME): + os.remove(VISITED_FILE_NAME) + else: + if os.path.exists(VISITED_FILE_NAME): + print('Continuing analysis') + with open(VISITED_FILE_NAME, 'r') as visited_fh: + visited = {line.rstrip() for line in visited_fh.readlines()} + + ids = _id_generator_from_file(args.idfile, excluded=visited) + + if args.short: + n=0 + total = 10 + logging.info(f'Doing short list of {n}') + def done()->bool: + nonlocal n + if n >= total: + return True + n = n + 1 + return False + else: + def done()->bool: + return False + + f_then_c = partial( fetch_pages, active_config) + + with 
open(VISITED_FILE_NAME, 'a', buffering=1) as visited_fh: + logging.debug(f'Opened {VISITED_FILE_NAME} to find already visited ids') + with open(LOG_FILE_NAME, 'w', buffering=1)as report_fh: + logging.debug(f'Opened {LOG_FILE_NAME} to write report to') + with mp.Pool(4) as pool: + completed_jobs \ + = pool.imap_unordered(f_then_c, ids) + + def done_job(job): + (res_dict, bad_results) = job + logging.debug(f"completed {res_dict['id']}") + visited_fh.write(f"{res_dict['id']}\n") + write_comparison_org(report_fh, (res_dict, list(bad_results))) + if done(): + logging.info("done and existing") + exit(0) + + [done_job(job) for job in completed_jobs] + + + +def protect(fn: Callable[[Any],BadResult] )-> Callable[[Any],BadResult]: + """Return function that will not throw""" + def protected(res_args:Dict) -> BadResult: + # noinspection PyBroadException + try: + return fn(res_args) + except Exception as ex: + return BadResult(res_args['id'], "name unknown", traceback.format_exc()) + return protected + + +def _serialize_obj(obj): + """JSON serializer for objects not serializable by default json code""" + return obj.__dict__ + +def write_comparison(report_fh, result: Tuple[Dict, List[BadResult]]) -> None: + (config, bad_results) = result + logging.debug("writing report for %s", config['id']) + report_fh.write( json.dumps(config, sort_keys=True) + "\n") + if bad_results: + report_fh.write( json.dumps(bad_results, sort_keys=True, default=_serialize_obj) + "\n") + else: + report_fh.write("no bad results\n") + +def write_comparison_org(report_fh, result: Tuple[Dict, List[BadResult]]) -> None: + (config, bad_results) = result + logging.debug("writing report for %s", config['id']) + report_fh.write(f"* {config['id']} \n") + + report_fh.write(f"** config for {config['id']}\n") + report_fh.write(json.dumps(config, sort_keys=True) + "\n") + + report_fh.write("** Results\n") + if bad_results: + for result in bad_results: + report_fh.write(f"*** {result.comparison} \n") + #report_fh.write( 
json.dumps(result, sort_keys=True, default=_serialize_obj) + "\n") + report_fh.write(result.message + "\n") + report_fh.write(f"**** NG value: \n{result.ng}\n") + report_fh.write(f"**** Legacy value: \n{result.legacy}\n") + else: + report_fh.write("No bad results\n") + + +def format_bad_result(bad: BadResult)->str: + rpt = f"** {bad.comparison}\n" \ + f"{bad.message} " + if bad.similarity: + rpt = rpt + f"sim: {bad.similarity}\n" + else: + rpt = rpt + "\n" + + if bad.legacy or bad.ng: + rpt = rpt + f"Legacy: '{bad.legacy}'\nNG: '{bad.ng}'\n" + + return rpt + + +#this is just to get around passing the config to the pool +def dict_from_module(module): + context = {} + for setting in required_keys: + context[setting] = getattr(module, setting) + return context + +if __name__ == '__main__': + main() diff --git a/tests/legacy_comparison/response_comparisons.py b/tests/legacy_comparison/response_comparisons.py index 83143e2a8..e6d48e552 100644 --- a/tests/legacy_comparison/response_comparisons.py +++ b/tests/legacy_comparison/response_comparisons.py @@ -1,4 +1,4 @@ -from tests.legacy_comparison.comparison_types import res_arg_dict, BadResult +from comparison_types import res_arg_dict, BadResult def compare_status(res_arg: res_arg_dict) -> BadResult: diff --git a/tests/legacy_comparison/text_comparisons.py b/tests/legacy_comparison/text_comparisons.py index 50740cf2f..4dd019e18 100644 --- a/tests/legacy_comparison/text_comparisons.py +++ b/tests/legacy_comparison/text_comparisons.py @@ -1,5 +1,5 @@ -from tests.legacy_comparison.abstract_comparisons import lev_similarity -from tests.legacy_comparison.comparison_types import text_arg_dict, BadResult +from abstract_comparisons import lev_similarity +from comparison_types import text_arg_dict, BadResult def text_similarity(text_arg: text_arg_dict) -> BadResult: diff --git a/tests/test_404.py b/tests/test_404.py index 1cb9cb4a8..5babaeae5 100644 --- a/tests/test_404.py +++ b/tests/test_404.py @@ -1,10 +1,11 @@ import unittest 
-import logging +import logging from app import app - -class It_Should_be_404_Test(unittest.TestCase): - + + +class Test_404(unittest.TestCase): + def setUp(self): """Disable logging to avoid messy output during testing""" import logging @@ -15,11 +16,10 @@ def setUp(self): app.config['APPLICATION_ROOT'] = '' self.app = app.test_client() - def its_should_be_404_test(self): + def test_it_should_be_404(self): rv = self.app.get('/abs?archive=foo&papernum=1234567') self.assertEqual(rv.status_code, 404) rv = self.app.get('/abs?0704.0600') self.assertEqual(rv.status_code, 404, 'singleton case for new IDs not supported') - diff --git a/tests/test_archive.py b/tests/test_archive.py new file mode 100644 index 000000000..b4b80b7e8 --- /dev/null +++ b/tests/test_archive.py @@ -0,0 +1,71 @@ +import unittest + +from app import app + + +class BrowseTest(unittest.TestCase): + def setUp(self): + app.testing = True + app.config["APPLICATION_ROOT"] = "" + self.app = app.test_client() + + def test_astroph_archive(self): + rv = self.app.get("/archive/astro-ph") + self.assertEqual(rv.status_code, 200) + self.assertIn('Expires', rv.headers, 'Should have expires header') + + src = rv.data.decode("utf-8") + self.assertIn("Astrophysics", src) + self.assertIn("/year/astro-ph/92", src) + self.assertIn("/year/astro-ph/19", src) + + self.assertIn( + "Astrophysics of Galaxies", + src, + "Subcategories of astro-ph should be on archive page", + ) + self.assertIn( + "Earth and Planetary Astrophysics", + src, + "Subcategories of astro-ph should be on archive page", + ) + + def test_list(self): + rv = self.app.get("/archive/list") + self.assertEqual(rv.status_code, 200) + src = rv.data.decode("utf-8") + + self.assertIn("Astrophysics", src) + self.assertIn("astro-ph", src) + + self.assertIn("Materials Theory", src) + self.assertIn("mtrl-th", src) + + rv = self.app.get("/archive/bogus-archive") + self.assertEqual(rv.status_code, 404) + + def test_subsumed_archive(self): + rv = 
self.app.get("/archive/comp-lg") + self.assertEqual(rv.status_code, 404) + src = rv.data.decode("utf-8") + + self.assertIn("Computer Science", src) + self.assertIn("cs.CL", src) + + rv = self.app.get("/archive/acc-phys") + self.assertEqual(rv.status_code, 200) + src = rv.data.decode("utf-8") + + self.assertIn("Accelerator Physics", src) + self.assertIn("physics.acc-ph", src) + + def test_single_archive(self): + rv = self.app.get("/archive/hep-ph") + self.assertEqual(rv.status_code, 200) + src = rv.data.decode("utf-8") + + self.assertIn("High Energy Physics", src) + self.assertNotIn("Categories within", src) + + + diff --git a/tests/test_author_affil.py b/tests/test_author_affil.py deleted file mode 100644 index e240ad166..000000000 --- a/tests/test_author_affil.py +++ /dev/null @@ -1,291 +0,0 @@ -"""Tests for author and affiliation parsing.""" -from unittest import TestCase - -from browse.services.document.author_affil import parse_author_affil, split_authors -from browse.services.document.metadata import AbsMetaSession -from tests import path_of_for_test - - -class TestAuthorAffiliationParsing(TestCase): - - def test_split_authors(self): - self.assertListEqual(split_authors('Simeon Warner'), - ['Simeon Warner']) - - self.assertListEqual(split_authors('The DELPHI Collaboration, J. Abdallah, et al'), - ['The DELPHI Collaboration', ',', 'J. 
Abdallah', ',', 'et al']) - - self.assertListEqual(split_authors('BELLE Collaboration: A Person, Nother Person'), - ['BELLE Collaboration', ':', 'A Person', ',', 'Nother Person']) - - self.assertListEqual(split_authors('Simeon Warner, Herbert Van de Sompel'), - ['Simeon Warner', ',', 'Herbert Van de Sompel']) - - self.assertListEqual( - split_authors('An Author'), - ['An Author'], - 'single author' - ) - - self.assertListEqual( - split_authors(''), - [], - 'empty author' - ) - - self.assertListEqual( - split_authors('An Author (affil)'), - ['An Author', '(affil)'], - 'single author with affil' - ) - self.assertListEqual( - split_authors('An Author (affil)'), - ['An Author', '(affil)'], - 'single author with affil' - ) - self.assertListEqual( - split_authors('An Author and Another P. H. J. Author (affil)'), - ['An Author', ',', 'Another P. H. J. Author', '(affil)'], - 'double author with affil' - ) - self.assertListEqual( - split_authors( - 'John Von Neumann, Herbert Van de Sompel, Fred Bloggs, Jr, et al'), - ['John Von Neumann', ',', 'Herbert Van de Sompel', - ',', 'Fred Bloggs, Jr', ',', 'et al'], - 'multiple with prefixes and suffixes' - ) - self.assertListEqual( - split_authors('sum won ( whatever affil data unmunged )'), - ['sum won', '( whatever affil data unmunged )'], - 'one author, two labs' - ) - self.assertListEqual( - split_authors('sum won(1,2)((1)lab a,(2)lab b)'), - ['sum won', '(1,2)', '((1)lab a,(2)lab b)'], - 'one author, two labs' - ) - - def test_parse_author_affil_basic(self): - self.assertListEqual(parse_author_affil( - 'Simeon Warner'), [['Warner', 'Simeon', '']]) - - self.assertListEqual( - parse_author_affil('Simeon Warner Jr'), - [['Warner', 'Simeon', 'Jr']]) - - self.assertListEqual( - parse_author_affil('Simeon Warner Jr.'), - [['Warner', 'Simeon', 'Jr.']]) - - self.assertListEqual( - parse_author_affil('Simeon Warner Sr'), - [['Warner', 'Simeon', 'Sr']]) - - self.assertListEqual( - parse_author_affil('Simeon Warner Sr.'), - [['Warner', 
'Simeon', 'Sr.']]) - - self.assertListEqual( - parse_author_affil('SM Warner'), - [['Warner', 'SM', '']]) - - self.assertListEqual( - parse_author_affil('SM. Warner'), - [['Warner', 'SM.', '']]) - - def test_parse_author_affil_basic2(self): - self.assertListEqual( - parse_author_affil('S.M. Warner'), - [['Warner', 'S. M.', '']]) - - self.assertListEqual( - parse_author_affil('John Von Neumann'), - [['Von Neumann', 'John', '']]) - - self.assertListEqual( - parse_author_affil('Herbert Van de Sompel'), - [['Van de Sompel', 'Herbert', '']]) - - self.assertListEqual( - parse_author_affil('del Norte'), - [['Norte', 'del', '']]) - - self.assertListEqual( - parse_author_affil('Fred del Norte'), - [['del Norte', 'Fred', '']]) - - self.assertListEqual( - parse_author_affil('BELLE'), - [['BELLE', '', '']]) - - self.assertListEqual( - parse_author_affil('BELLE Collaboration: A Person, Nother Person'), - [ - ['BELLE Collaboration', '', ''], - ['Person', 'A', ''], - ['Person', 'Nother', ''] - ]) - - self.assertListEqual(parse_author_affil('The DELPHI Collaboration, J. 
Abdallah, et al'), - [['The DELPHI Collaboration', '', ''], ['Abdallah', 'J.', '']]) - - self.assertListEqual(parse_author_affil('Ali Vaziri Astaneh, Federico Fuentes'), - [['Vaziri Astaneh', 'Ali', ''],['Fuentes', 'Federico', '']]) - - def test_parse_author_affil_with_affiliations(self): - self.assertListEqual( - parse_author_affil('sum won (lab a)'), - [['won', 'sum', '', 'lab a']]) - - self.assertListEqual( - parse_author_affil('sum won (lab a; lab b)'), - [['won', 'sum', '', 'lab a; lab b']]) - - self.assertListEqual( - parse_author_affil('sum won (lab a, lab b)'), - [['won', 'sum', '', 'lab a, lab b']]) - - self.assertListEqual( - parse_author_affil('sum won (1,2) ( (1) lab a, (2) lab b)'), - [['won', 'sum', '', 'lab a', 'lab b']]) - - self.assertListEqual( - parse_author_affil('sum won(1,2)((1)lab a,(2)lab b)'), - [['won', 'sum', '', 'lab a', 'lab b']]) - - self.assertListEqual( - parse_author_affil('a.b.first, c.d.second (affil)'), - [['first', 'a. b.', '', 'affil'], ['second', 'c. d.', '', 'affil']]) - - self.assertListEqual( - parse_author_affil('a.b.first (affil), c.d.second (affil)'), - [['first', 'a. b.', '', 'affil'], ['second', 'c. d.', '', 'affil']]) - - self.assertListEqual( - parse_author_affil( - 'a.b.first, c.d.second (1), e.f.third, g.h.forth (2,3) ((1) affil1, (2) affil2, (3) affil3)' - ), - [ - ['first', 'a. b.', '', 'affil1'], - ['second', 'c. d.', '', 'affil1'], - ['third', 'e. f.', '', 'affil2', 'affil3'], - ['forth', 'g. h.', '', 'affil2', 'affil3'] - ]) - - self.assertListEqual( - parse_author_affil(( - "QUaD collaboration: S. Gupta (1), P. Ade (1), J. Bock (2,3), M. Bowden " - "(1,4), M. L. Brown (5), G. Cahill (6), P. G. Castro (7,8), S. Church (4), T. " - "Culverhouse (9), R. B. Friedman (9), K. Ganga (10), W. K. Gear (1), J. " - "Hinderks (5,11), J. Kovac (3), A. E. Lange (4), E. Leitch (2,3), S. J. " - "Melhuish (12), Y. Memari (7), J. A. Murphy (6), A. Orlando (1,3), C. " - "O'Sullivan (6), L. Piccirillo (12), C. Pryke (9), N. 
Rajguru (1,13), B. " - "Rusholme (4,14), R. Schwarz (9), A. N. Taylor (7), K. L. Thompson (4), A. H. " - "Turner (1), E. Y. S. Wu (4), M. Zemcov (1,2,3) ((1) Cardiff University, (2) " - "JPL, (3) Caltech, (4) Stanford University, (5) University of Cambridge, (6) " - "National University of Ireland Maynooth, (7) University of Edinburgh, (8) " - "Universidade Tecnica de Lisboa, (9) University of Chicago, (10) Laboratoire " - "APC/CNRS, (11) NASA Goddard, (12) University of Manchester, (13) UCL, (14) " - "IPAC) " - )), - [ - ["QUaD collaboration", "", ""], - ["Gupta", "S.", "", "Cardiff University"], - ["Ade", "P.", "", "Cardiff University"], - ["Bock", "J.", "", "JPL", "Caltech"], - ["Bowden", "M.", "", "Cardiff University", "Stanford University"], - ["Brown", "M. L.", "", "University of Cambridge"], - ["Cahill", "G.", "", "National University of Ireland Maynooth"], - [ - "Castro", "P. G.", "", - "University of Edinburgh", - "Universidade Tecnica de Lisboa" - ], - ["Church", "S.", "", "Stanford University"], - ["Culverhouse", "T.", "", "University of Chicago"], - ["Friedman", "R. B.", "", "University of Chicago"], - ["Ganga", "K.", "", "Laboratoire APC/CNRS"], - ["Gear", "W. K.", "", "Cardiff University"], - ["Hinderks", "J.", "", "University of Cambridge", "NASA Goddard"], - ["Kovac", "J.", "", "Caltech"], - ["Lange", "A. E.", "", "Stanford University"], - ["Leitch", "E.", "", "JPL", "Caltech"], - ["Melhuish", "S. J.", "", "University of Manchester"], - ["Memari", "Y.", "", "University of Edinburgh"], - ["Murphy", "J. 
A.", "", "National University of Ireland Maynooth"], - ["Orlando", "A.", "", "Cardiff University", "Caltech"], - ["O'Sullivan", "C.", "", "National University of Ireland Maynooth"], - ["Piccirillo", "L.", "", "University of Manchester"], - ["Pryke", "C.", "", "University of Chicago"], - ["Rajguru", "N.", "", "Cardiff University", "UCL"], - ["Rusholme", "B.", "", "Stanford University", "IPAC"], - ["Schwarz", "R.", "", "University of Chicago"], - ["Taylor", "A. N.", "", "University of Edinburgh"], - ["Thompson", "K. L.", "", "Stanford University"], - ["Turner", "A. H.", "", "Cardiff University"], - ["Wu", "E. Y. S.", "", "Stanford University"], - ["Zemcov", "M.", "", "Cardiff University", "JPL", "Caltech"] - ], - 'parse_author_affil (mind-blowing) 1/1' - ) - - # Problem case with 1110.4366 - self.assertListEqual( - parse_author_affil( - 'Matthew Everitt, Robert M. Heath and Viv Kendon'), - [['Everitt', 'Matthew', ''], - ['Heath', 'Robert M.', ''], - ['Kendon', 'Viv', '']], - 'parse_author_affil for 1110.4366' - ) - - # look like bugs, but aren't - self.assertListEqual( - parse_author_affil('sum won ((lab a), (lab b))'), - [['won', 'sum', '']], - 'parse_author_affil (bug imposter) 1/2' - ) - self.assertListEqual( - parse_author_affil('sum won ((lab a) (lab b))'), - [['won', 'sum', '']], - 'parse_author_affil (bug imposter) 2/2' - ) - - self.assertListEqual( - parse_author_affil('Anatoly Zlotnik and Jr-Shin Li'), - [['Zlotnik', 'Anatoly', ''], - ['Li', 'Jr-Shin', '']], - 'jr issue (Anatoly Zlotnik and Jr-Shin Li)' - ) - - # ====== Extra tests for arXiv::AuthorAffil ARXIVDEV-728 ====== - - # [parse_author_affil] - self.assertListEqual( - parse_author_affil(''), - [], - 'parse_author_affil (empty)' - ) - self.assertListEqual( - parse_author_affil('Simeon Warner Jr'), - [['Warner', 'Simeon', 'Jr']], - 'parse_author_affil (basic) 2/12' - ) - self.assertListEqual( - parse_author_affil('BELLE Collaboration'), - [['BELLE Collaboration', '', '']], - 'parse_author_affil (lone 
"BELLE Collaboration") 2/3' - ) - - self.assertListEqual( - parse_author_affil('BELLE Collaboration'), - [['BELLE Collaboration', '', '']], - 'parse_author_affil (lone "BELLE Collaboration") 2/3' - ) - - def test_collaboration_at_front(self): - f1 = path_of_for_test('data/abs_files/ftp/arxiv/papers/0808/0808.4142.abs') - meta = AbsMetaSession.parse_abs_file(filename=f1) - paflst = parse_author_affil(meta.authors.raw) - self.assertListEqual(paflst, [['D0 Collaboration', '', ''], ['Abazov', 'V.', '']]) diff --git a/tests/test_browse.py b/tests/test_browse.py index 2c6cc8098..d14bc12cb 100644 --- a/tests/test_browse.py +++ b/tests/test_browse.py @@ -1,8 +1,9 @@ import unittest from bs4 import BeautifulSoup - from tests.test_abs_parser import ABS_FILES + +from arxiv import taxonomy from browse.services.document.metadata import AbsMetaSession from browse.domain.license import ASSUMED_LICENSE_URI @@ -18,6 +19,104 @@ def setUp(self): app.config['APPLICATION_ROOT'] = '' self.app = app.test_client() + def test_home(self): + """Test the home page.""" + rv = self.app.get('/') + self.assertEqual(rv.status_code, 200) + html = BeautifulSoup(rv.data.decode('utf-8'), 'html.parser') + + for group_key, group_value in taxonomy.definitions.GROUPS.items(): + if group_key == 'grp_test': + continue + auths_elmt = html.find('h2', text=group_value['name']) + self.assertTrue(auths_elmt, f"{group_value['name']} in h2 element") + self.assertFalse(html.find('h2', text='Test'), + "'Test' group should not be shown on homepage") + + def test_tb(self): + """Test the /tb/ page.""" + rv = self.app.get('/tb/1901.99999') + self.assertEqual(rv.status_code, 404) + + rv = self.app.get('/tb/') + self.assertEqual(rv.status_code, 404) + + rv = self.app.get('/tb/foo') + self.assertEqual(rv.status_code, 404) + + rv = self.app.get('/tb/0808.4142') + self.assertEqual(rv.status_code, 200) + + html = BeautifulSoup(rv.data.decode('utf-8'), 'html.parser') + h2_elmt = html.find('h2') + h2_txt = h2_elmt.get_text() + 
self.assertTrue(h2_elmt, 'Should have

    element') + self.assertEquals(h2_txt, 'Trackbacks for 0808.4142') + tb_a_tags = html.find_all('a', 'mathjax', rel='external nofollow') + self.assertGreater(len(tb_a_tags), 1, + 'There should be more than one tag for trackbacks') + h1_elmt = html.find('div', id='abs') + h1_txt = h1_elmt.get_text() + self.assertTrue(h1_elmt, 'Should have

    element') + self.assertRegex( + h1_txt, + r'Observation of the doubly strange b baryon Omega_b-', + '

    element contains title of article') + + def test_tb_recent(self): + """Test the /tb/recent page.""" + rv = self.app.get('/tb/recent') + self.assertEqual(rv.status_code, 200) + + rv = self.app.post('/tb/recent', data=dict(views='50')) + self.assertEqual(rv.status_code, 200, 'POST with integer OK') + + rv = self.app.post('/tb/recent', data=dict(views='bar')) + self.assertEqual(rv.status_code, 400, 'POST with non-integer not OK') + + rv = self.app.get('/tb/recent/foo') + self.assertEqual(rv.status_code, 404) + + rv = self.app.post('/tb/recent', data=dict(views='1')) + self.assertEqual(rv.status_code, 200, 'POST with views==1 OK') + html = BeautifulSoup(rv.data.decode('utf-8'), 'html.parser') + tb_a_tags = html.find_all('a', 'mathjax', rel='external nofollow') + self.assertEquals(len(tb_a_tags), 1, + 'There should be exactly one trackback link') + + def test_stats_today(self): + """Test the /stats/today page.""" + rv = self.app.get('/stats/today') + self.assertEqual(rv.status_code, 200) + rv = self.app.get('/stats/today?date=20190102') + self.assertEqual(rv.status_code, 200) + html = BeautifulSoup(rv.data.decode('utf-8'), 'html.parser') + + csv_dl_elmt = html.find('a', {'href': '/stats/get_hourly?date=20190102'}) + self.assertIsNotNone(csv_dl_elmt, + 'csv download link exists') + + def test_stats_monthly_downloads(self): + """Test the /stats/monthly_downloads page.""" + rv = self.app.get('/stats/monthly_downloads') + self.assertEqual(rv.status_code, 200) + html = BeautifulSoup(rv.data.decode('utf-8'), 'html.parser') + + csv_dl_elmt = html.find('a', {'href': '/stats/get_monthly_downloads'}) + self.assertIsNotNone(csv_dl_elmt, + 'csv download link exists') + + def test_stats_monthly_submissions(self): + """Test the /stats/monthly_submissions page.""" + rv = self.app.get('/stats/monthly_submissions') + self.assertEqual(rv.status_code, 200) + html = BeautifulSoup(rv.data.decode('utf-8'), 'html.parser') + + csv_dl_elmt = html.find('a', {'href': 
'/stats/get_monthly_submissions'}) + self.assertIsNotNone(csv_dl_elmt, + 'csv download link exists') + + def test_abs_without_license_field(self): f1 = ABS_FILES + '/ftp/arxiv/papers/0704/0704.0001.abs' m = AbsMetaSession.parse_abs_file(filename=f1) @@ -148,7 +247,8 @@ def test_requested_version(self): self.assertIsNotNone(pdf_dl_elmt, 'pdf download link without version affix exists') pdf_dl_elmt = html.find('a', {'href': '/pdf/physics/9707012v'}) - self.assertIsNone(pdf_dl_elmt, 'pdf download link with version affix does not exist') + self.assertIsNone( + pdf_dl_elmt, 'pdf download link with version affix does not exist') rv = self.app.get('/abs/physics/9707012v4') self.assertEqual(rv.status_code, 200) @@ -200,7 +300,6 @@ def test_1501_9999(self): 'href="ftp://ftp.arxiv.org/cheese.txt"' in rv.data.decode('utf-8'), "FTP URLs should be turned into links ARXIVNG-1242") - def test_160408245(self): """Test linking in 1604.08245.""" id = '1604.08245' @@ -234,62 +333,65 @@ def test_arxivng_1246(self): def test_authors_and_arxivId_in_title(self): id = '1501.99999' - rv = self.app.get('/abs/'+id) + rv = self.app.get('/abs/' + id) self.assertEqual(rv.status_code, 200) html = BeautifulSoup(rv.data.decode('utf-8'), 'html.parser') - title_elmt = html.find('h1','title') - self.assertTrue(title_elmt,'Should title element') + title_elmt = html.find('h1', 'title') + self.assertTrue(title_elmt, 'Should title element') ida = title_elmt.find('a') self.assertTrue(ida, 'Should be tag in title') - self.assertIsNotNone(ida['href'],' tag in title should have href') - self.assertEqual(ida['href'], '/abs/1501.99998') + self.assertIsNotNone(ida['href'], ' tag in title should have href') + self.assertEqual(ida['href'], 'https://arxiv.org/abs/1501.99998') + self.assertEqual(ida.text, '1501.99998') - au_a_tags = html.find('div','authors').find_all('a') - self.assertGreater(len(au_a_tags), 1, 'Should be some a tags for authors') + au_a_tags = html.find('div', 'authors').find_all('a') + 
self.assertGreater(len(au_a_tags), 1, + 'Should be some a tags for authors') self.assertNotIn('query=The', au_a_tags[0]['href'], 'Collaboration author query should not have "The"') self.assertEqual(au_a_tags[0].text, 'SuperSuper Collaboration') - def test_long_author_colab(self): id = '1501.05201' - rv = self.app.get('/abs/'+id) + rv = self.app.get('/abs/' + id) self.assertEqual(rv.status_code, 200) html = BeautifulSoup(rv.data.decode('utf-8'), 'html.parser') - auths_elmt = html.find('div','authors') - self.assertTrue(auths_elmt,'Should authors div element') + auths_elmt = html.find('div', 'authors') + self.assertTrue(auths_elmt, 'Should authors div element') a_tags = auths_elmt.find_all('a') - self.assertEqual(len(a_tags), 2, 'Should be two tags in authors div') + self.assertEqual( + len(a_tags), 2, 'Should be two tags in authors div') - colab=a_tags[1] + colab = a_tags[1] - self.assertIsNotNone(colab['href'],' tag in title should have href') - self.assertEqual(colab['href'], 'https://arxiv.org/search/physics?searchtype=author&query=ILL%2FESS%2FLiU+collaboration') - self.assertEqual(colab.text, 'ILL/ESS/LiU collaboration for the development of the B10 detector technology in the framework of the CRISP project') + self.assertIsNotNone( + colab['href'], ' tag in title should have href') + self.assertEqual( + colab['href'], 'https://arxiv.org/search/physics?searchtype=author&query=ILL%2FESS%2FLiU+collaboration') + self.assertEqual( + colab.text, 'ILL/ESS/LiU collaboration for the development of the B10 detector technology in the framework of the CRISP project') - - @unittest.skip("In current implementation, conflicts with comma test below.") + @unittest.skip("In current implementation, conflicts with comma test below.") def test_space_in_author_list(self): id = '1210.8438' - rv = self.app.get('/abs/'+id) + rv = self.app.get('/abs/' + id) self.assertEqual(rv.status_code, 200) html = BeautifulSoup(rv.data.decode('utf-8'), 'html.parser') - auths_elmt = 
html.find('div','authors') - self.assertTrue(auths_elmt,'Should authors div element') + auths_elmt = html.find('div', 'authors') + self.assertTrue(auths_elmt, 'Should authors div element') self.assertIn('Zhe (Rita) Liang,', auths_elmt.text, 'Should be a space after (Rita)') - def test_comma_in_author_list(self): id = '0704.0155' - rv = self.app.get('/abs/'+id) + rv = self.app.get('/abs/' + id) self.assertEqual(rv.status_code, 200) html = BeautifulSoup(rv.data.decode('utf-8'), 'html.parser') auths_elmt = html.find('div', 'authors') @@ -313,5 +415,34 @@ def test_psi_in_abs(self): self.assertIn('The phase difference $\phi$, between the superconducting', abs_elmt.text, "Expecting uncoverted $\phi$ in html abstract.") - + def test_year(self): + rv = self.app.get('/year/astro-ph/09') + self.assertEqual(rv.status_code, 200) + + rv = self.app.get('/year/astro-ph/') + self.assertEqual( rv.status_code, 200) + + rv = self.app.get('/year/astro-ph') + self.assertEqual( rv.status_code, 200) + + rv = self.app.get('/year/astro-ph/09/') + self.assertEqual(rv.status_code, 200) + + rv = self.app.get('/year') + self.assertEqual( rv.status_code, 404) + + rv = self.app.get('/year/astro-ph/9999') + self.assertEqual(rv.status_code, 307, 'Future year should cause temporary redirect') + + rv = self.app.get('/year/fakearchive/01') + self.assertNotEqual(rv.status_code, 200) + self.assertLess( rv.status_code, 500, 'should not cause a 5XX') + + rv = self.app.get('/year/002/0000') + self.assertLess( rv.status_code, 500, 'should not cause a 5XX') + + rv = self.app.get('/year/astro-py/9223372036854775808') + self.assertLess( rv.status_code, 500, 'should not cause a 5XX') + + diff --git a/tests/test_categories.py b/tests/test_categories.py index 00a588420..a05f0b99f 100644 --- a/tests/test_categories.py +++ b/tests/test_categories.py @@ -3,7 +3,7 @@ from tests.test_abs_parser import ABS_FILES from browse.services.document.metadata import AbsMetaSession - +from browse.domain.category import Category 
class CategoriesTest(unittest.TestCase): @@ -15,7 +15,7 @@ def test_categories_for_0906_3421v1_cats(self): ['Statistical Mechanics (cond-mat.stat-mech)', 'Mathematical Physics (math-ph)']) doc = self.absService.get_abs(id) self.assertIsNotNone(doc) - assert_that(doc.primary_category.display_str(), equal_to(primary)) + assert_that(doc.primary_category.display, equal_to(primary)) assert_that(doc.display_secondaries(), equal_to(secondaries)) def test_categories_for_0906_3421_cats(self): @@ -23,7 +23,7 @@ def test_categories_for_0906_3421_cats(self): ['Statistical Mechanics (cond-mat.stat-mech)', 'Mathematical Physics (math-ph)']) doc = self.absService.get_abs(id) self.assertIsNotNone(doc) - assert_that(doc.primary_category.display_str(), equal_to(primary)) + assert_that(doc.primary_category.display, equal_to(primary)) assert_that(doc.display_secondaries(), equal_to(secondaries)) def test_categories_for_0704_0129_cats(self): @@ -31,7 +31,7 @@ def test_categories_for_0704_0129_cats(self): ['Mathematical Physics (math-ph)', 'Analysis of PDEs (math.AP)']) doc = self.absService.get_abs(id) self.assertIsNotNone(doc) - assert_that(doc.primary_category.display_str(), equal_to(primary)) + assert_that(doc.primary_category.display, equal_to(primary)) assert_that(doc.display_secondaries(), equal_to(secondaries)) def test_categories_for_0704_0914_cats(self): @@ -39,7 +39,7 @@ def test_categories_for_0704_0914_cats(self): ['Mathematical Physics (math-ph)', 'Optics (physics.optics)']) doc = self.absService.get_abs(id) self.assertIsNotNone(doc) - assert_that(doc.primary_category.display_str(), equal_to(primary)) + assert_that(doc.primary_category.display, equal_to(primary)) assert_that(doc.display_secondaries(), equal_to(secondaries)) def test_categories_for_0704_0582_cats(self): @@ -47,7 +47,7 @@ def test_categories_for_0704_0582_cats(self): 'Probability (math.PR)', ['Mathematical Physics (math-ph)']) doc = self.absService.get_abs(id) self.assertIsNotNone(doc) - 
assert_that(doc.primary_category.display_str(), equal_to(primary)) + assert_that(doc.primary_category.display, equal_to(primary)) assert_that(doc.display_secondaries(), equal_to(secondaries)) def test_categories_for_0704_0495_cats(self): @@ -55,7 +55,7 @@ def test_categories_for_0704_0495_cats(self): ['Mathematical Physics (math-ph)']) doc = self.absService.get_abs(id) self.assertIsNotNone(doc) - assert_that(doc.primary_category.display_str(), equal_to(primary)) + assert_that(doc.primary_category.display, equal_to(primary)) assert_that(doc.display_secondaries(), equal_to(secondaries)) def test_categories_for_0704_0681_cats(self): @@ -63,7 +63,7 @@ def test_categories_for_0704_0681_cats(self): ['Statistical Mechanics (cond-mat.stat-mech)', 'Instrumentation and Detectors (physics.ins-det)', 'Optics (physics.optics)']) doc = self.absService.get_abs(id) self.assertIsNotNone(doc) - assert_that(doc.primary_category.display_str(), equal_to(primary)) + assert_that(doc.primary_category.display, equal_to(primary)) assert_that(doc.display_secondaries(), equal_to(secondaries)) def test_categories_for_0704_0761_cats(self): @@ -71,7 +71,7 @@ def test_categories_for_0704_0761_cats(self): ['Soft Condensed Matter (cond-mat.soft)', 'Chemical Physics (physics.chem-ph)']) doc = self.absService.get_abs(id) self.assertIsNotNone(doc) - assert_that(doc.primary_category.display_str(), equal_to(primary)) + assert_that(doc.primary_category.display, equal_to(primary)) assert_that(doc.display_secondaries(), equal_to(secondaries)) def test_categories_for_0704_0528_cats(self): @@ -79,7 +79,7 @@ def test_categories_for_0704_0528_cats(self): ['Information Theory (cs.IT)']) doc = self.absService.get_abs(id) self.assertIsNotNone(doc) - assert_that(doc.primary_category.display_str(), equal_to(primary)) + assert_that(doc.primary_category.display, equal_to(primary)) assert_that(doc.display_secondaries(), equal_to(secondaries)) def test_categories_for_0704_0869_cats(self): @@ -87,7 +87,7 @@ def 
test_categories_for_0704_0869_cats(self): ['Mathematical Physics (math-ph)']) doc = self.absService.get_abs(id) self.assertIsNotNone(doc) - assert_that(doc.primary_category.display_str(), equal_to(primary)) + assert_that(doc.primary_category.display, equal_to(primary)) assert_that(doc.display_secondaries(), equal_to(secondaries)) def test_categories_for_0704_0796_cats(self): @@ -95,7 +95,7 @@ def test_categories_for_0704_0796_cats(self): ['Astrophysics (astro-ph)', 'Statistical Mechanics (cond-mat.stat-mech)', 'High Energy Physics - Phenomenology (hep-ph)', 'Mathematical Physics (math-ph)']) doc = self.absService.get_abs(id) self.assertIsNotNone(doc) - assert_that(doc.primary_category.display_str(), equal_to(primary)) + assert_that(doc.primary_category.display, equal_to(primary)) assert_that(doc.display_secondaries(), equal_to(secondaries)) def test_categories_for_0704_0046_cats(self): @@ -103,7 +103,7 @@ def test_categories_for_0704_0046_cats(self): 'Quantum Physics (quant-ph)', ['Information Theory (cs.IT)']) doc = self.absService.get_abs(id) self.assertIsNotNone(doc) - assert_that(doc.primary_category.display_str(), equal_to(primary)) + assert_that(doc.primary_category.display, equal_to(primary)) assert_that(doc.display_secondaries(), equal_to(secondaries)) def test_categories_for_0704_0976_cats(self): @@ -112,7 +112,7 @@ def test_categories_for_0704_0976_cats(self): 'Chaotic Dynamics (nlin.CD)']) doc = self.absService.get_abs(id) self.assertIsNotNone(doc) - assert_that(doc.primary_category.display_str(), equal_to(primary)) + assert_that(doc.primary_category.display, equal_to(primary)) assert_that(doc.display_secondaries(), equal_to(secondaries)) def test_categories_for_0704_0041_cats(self): @@ -120,7 +120,7 @@ def test_categories_for_0704_0041_cats(self): ['Mathematical Physics (math-ph)']) doc = self.absService.get_abs(id) self.assertIsNotNone(doc) - assert_that(doc.primary_category.display_str(), equal_to(primary)) + assert_that(doc.primary_category.display, 
equal_to(primary)) assert_that(doc.display_secondaries(), equal_to(secondaries)) def test_categories_for_0704_0918_cats(self): @@ -128,7 +128,7 @@ def test_categories_for_0704_0918_cats(self): 'Algebraic Geometry (math.AG)', ['Statistics Theory (math.ST)']) doc = self.absService.get_abs(id) self.assertIsNotNone(doc) - assert_that(doc.primary_category.display_str(), equal_to(primary)) + assert_that(doc.primary_category.display, equal_to(primary)) assert_that(doc.display_secondaries(), equal_to(secondaries)) def test_categories_for_0704_0123_cats(self): @@ -136,7 +136,7 @@ def test_categories_for_0704_0123_cats(self): ['Other Condensed Matter (cond-mat.other)', 'Optics (physics.optics)']) doc = self.absService.get_abs(id) self.assertIsNotNone(doc) - assert_that(doc.primary_category.display_str(), equal_to(primary)) + assert_that(doc.primary_category.display, equal_to(primary)) assert_that(doc.display_secondaries(), equal_to(secondaries)) def test_categories_for_0704_0520_cats(self): @@ -145,7 +145,7 @@ def test_categories_for_0704_0520_cats(self): 'Atomic Physics (physics.atom-ph)']) doc = self.absService.get_abs(id) self.assertIsNotNone(doc) - assert_that(doc.primary_category.display_str(), equal_to(primary)) + assert_that(doc.primary_category.display, equal_to(primary)) assert_that(doc.display_secondaries(), equal_to(secondaries)) def test_categories_for_0704_0084_cats(self): @@ -154,7 +154,7 @@ def test_categories_for_0704_0084_cats(self): 'Fluid Dynamics (physics.flu-dyn)']) doc = self.absService.get_abs(id) self.assertIsNotNone(doc) - assert_that(doc.primary_category.display_str(), equal_to(primary)) + assert_that(doc.primary_category.display, equal_to(primary)) assert_that(doc.display_secondaries(), equal_to(secondaries)) def test_categories_for_0704_0588_cats(self): @@ -162,7 +162,7 @@ def test_categories_for_0704_0588_cats(self): 'Probability (math.PR)', ['Statistics Theory (math.ST)']) doc = self.absService.get_abs(id) self.assertIsNotNone(doc) - 
assert_that(doc.primary_category.display_str(), equal_to(primary)) + assert_that(doc.primary_category.display, equal_to(primary)) assert_that(doc.display_secondaries(), equal_to(secondaries)) def test_categories_for_0704_0687_cats(self): @@ -170,5 +170,17 @@ def test_categories_for_0704_0687_cats(self): ['Mathematical Physics (math-ph)']) doc = self.absService.get_abs(id) self.assertIsNotNone(doc) - assert_that(doc.primary_category.display_str(), equal_to(primary)) + assert_that(doc.primary_category.display, equal_to(primary)) assert_that(doc.display_secondaries(), equal_to(secondaries)) + + + def test_non_tax_cats(self): + cat = Category('nontx.AL') + assert_that(cat.display , equal_to('nontx.AL')) + + # TODO: confirm actual desired behavior + # cat = Category('math.XX') + # assert_that(cat.display , equal_to('Mathematics (math.XX)')) + + cat = Category('notaxArc') + assert_that(cat.display , equal_to('notaxArc')) diff --git a/tests/test_click_through.py b/tests/test_click_through.py deleted file mode 100644 index a6a6fd3f5..000000000 --- a/tests/test_click_through.py +++ /dev/null @@ -1,15 +0,0 @@ -# from unittest import TestCase -import unittest - -from hypothesis import given -from hypothesis.strategies import text - -from browse.util.clickthrough import create_hash, is_hash_valid - - -class TestClickthrough(unittest.TestCase): - - @given(text(), text()) - def test_clickthrough(self, s, v): - h = create_hash(s, v) - self.assertTrue(is_hash_valid(s, v, h)) diff --git a/tests/test_cookies.py b/tests/test_cookies.py new file mode 100644 index 000000000..db1b8b22b --- /dev/null +++ b/tests/test_cookies.py @@ -0,0 +1,37 @@ +"""test cookies""" +import unittest + +#from bs4 import BeautifulSoup + +from app import app + + +class CookiesPageTest(unittest.TestCase): + + def setUp(self): + app.testing = True + app.config['APPLICATION_ROOT'] = '' + self.app = app.test_client() + + + def test_cookies_with_no_params(self): + """Test the cookies page.""" + rv = 
self.app.get('/cookies') + self.assertEqual(rv.status_code, 200) + html = rv.data.decode('utf-8') + self.assertIn('Select preferred download format', html) + self.assertIn('show additional debugging information', html, 'should have SHOW debugging link') + + def test_cookies_with_debug(self): + """Test the cookies page.""" + rv = self.app.get('/cookies?debug=1') + self.assertEqual(rv.status_code, 200) + html = rv.data.decode('utf-8') + self.assertIn('Select preferred download format', html) + self.assertIn('hide debugging information', html, 'should have HIDE debugging link') + + def test_post_to_cookies(self): + rv = self.app.post('/cookies/set?debug=1', data={'ps':'pdf'}) + self.assertEqual(rv.status_code, 302) + cookies = map(lambda kv: kv[1], filter(lambda kv : kv[0]=='Set-Cookie', rv.headers.items())) + self.assertIn('xxx-ps-defaults=pdf; Path=/', cookies) diff --git a/tests/test_database_service.py b/tests/test_database_service.py index 025608bda..f6d72e62f 100644 --- a/tests/test_database_service.py +++ b/tests/test_database_service.py @@ -4,9 +4,11 @@ from unittest import mock, TestCase from unittest.mock import Mock, patch from sqlalchemy.exc import SQLAlchemyError - from sqlalchemy.orm.exc import NoResultFound +from browse.services.database.models import TrackbackPing +from browse.domain.identifier import Identifier + from tests import grep_f_count, execute_sql_files, path_of_for_test @@ -82,6 +84,10 @@ def setUpClass(cls) -> None: execute_sql_files(sql_files, database.db.engine) database.db.session.commit() + """Disable logging to avoid messy output during testing""" + import logging + logging.disable(logging.WARNING) + def test_get_institution_returns_a_label(self) -> None: """If IP address matches an institution, a label is returned.""" label = TestBrowseDatabaseService.database_service.get_institution( @@ -123,7 +129,8 @@ def test_get_institution_returns_a_label(self) -> None: def test_all_trackback_pings(self) -> None: """Test if all trackback pings 
are counted.""" - doc_sql_file = path_of_for_test('data/db/sql/arXiv_trackback_pings.sql') + doc_sql_file = path_of_for_test( + 'data/db/sql/arXiv_trackback_pings.sql') count_from_file = grep_f_count( doc_sql_file, @@ -151,14 +158,14 @@ def test_all_trackback_pings(self) -> None: ) def test_trackback_pings(self) -> None: - """Test if trackback pings for specific paper are counted.""" + """Test if trackback pings for a specific paper are counted.""" test_paper_id = '0808.4142' count_from_db: int = TestBrowseDatabaseService.database_service\ .count_trackback_pings(test_paper_id) count_from_db_list: int = TestBrowseDatabaseService.database_service\ - .get_trackback_pings(test_paper_id).__len__() + .get_paper_trackback_pings(test_paper_id).__len__() self.assertEqual( - count_from_db, 8, + count_from_db, 9, f'Correct count of pings returned for paper {test_paper_id}' ) self.assertEqual( @@ -166,6 +173,21 @@ def test_trackback_pings(self) -> None: f'Correct count of pings returned for paper {test_paper_id}' ) + def test_recent_trackback_pings(self) -> None: + """Test if recent trackbacks can be retrieved.""" + tbs: List = TestBrowseDatabaseService.database_service.\ + get_recent_trackback_pings(max_trackbacks=-1) + self.assertEqual(len(tbs), 0, 'List should be empty') + tbs: List = TestBrowseDatabaseService.database_service.\ + get_recent_trackback_pings(max_trackbacks=25) + self.assertGreater(len(tbs), 0, 'List should be nonempty') + for tb in tbs: + self.assertIsInstance(tb[0], TrackbackPing) + self.assertIsInstance(tb[1], str) + self.assertIsInstance(Identifier( + arxiv_id=tb[1]), Identifier, 'Value looks like an Identifier') + self.assertIsInstance(tb[2], str) + def test_sciencewise_ping(self) -> None: """Test whether paper with version suffix has a ScienceWISE ping.""" test_paper_id_v = '1605.09669v2' @@ -209,15 +231,23 @@ def test_get_dblp_authors(self) -> None: TestBrowseDatabaseService.database_service.get_dblp_authors( test_paper_id), []) + def 
test_get_document_count(self) -> None: + """Test document count function.""" + self.assertGreater( + TestBrowseDatabaseService.database_service.get_document_count(), + 0, + 'There is at least one document in the DB.' + ) + @mock.patch('browse.services.database.models.db.session.query') - def test_error_conditions(self, mock_query)->None: + def test_error_conditions(self, mock_query) -> None: mock_query.side_effect = NoResultFound self.assertEqual( TestBrowseDatabaseService.database_service.get_institution('10.0.0.1'), None) self.assertEqual([], - TestBrowseDatabaseService.database_service.get_all_trackback_pings()) + TestBrowseDatabaseService.database_service.get_all_trackback_pings()) self.assertListEqual( - TestBrowseDatabaseService.database_service.get_trackback_pings('0704.0361'), []) + TestBrowseDatabaseService.database_service.get_paper_trackback_pings('0704.0361'), []) self.assertEqual( TestBrowseDatabaseService.database_service.count_trackback_pings('0704.0361'), 0) self.assertEqual( @@ -231,12 +261,18 @@ def test_error_conditions(self, mock_query)->None: mock_query.side_effect = SQLAlchemyError self.assertRaises(SQLAlchemyError, TestBrowseDatabaseService.database_service.get_institution, '10.0.0.1') - self.assertRaises(SQLAlchemyError, TestBrowseDatabaseService.database_service.get_all_trackback_pings) - self.assertRaises(SQLAlchemyError, TestBrowseDatabaseService.database_service.get_trackback_pings, 'paperx') - self.assertRaises(SQLAlchemyError, TestBrowseDatabaseService.database_service.count_all_trackback_pings) - self.assertRaises(SQLAlchemyError, TestBrowseDatabaseService.database_service.has_sciencewise_ping, 'px') - self.assertRaises(SQLAlchemyError, TestBrowseDatabaseService.database_service.get_dblp_listing_path, 'px') - self.assertRaises(SQLAlchemyError, TestBrowseDatabaseService.database_service.get_dblp_authors,'authx') + self.assertRaises( + SQLAlchemyError, TestBrowseDatabaseService.database_service.get_all_trackback_pings) + 
self.assertRaises( + SQLAlchemyError, TestBrowseDatabaseService.database_service.get_paper_trackback_pings, 'paperx') + self.assertRaises( + SQLAlchemyError, TestBrowseDatabaseService.database_service.count_all_trackback_pings) + self.assertRaises( + SQLAlchemyError, TestBrowseDatabaseService.database_service.has_sciencewise_ping, 'px') + self.assertRaises( + SQLAlchemyError, TestBrowseDatabaseService.database_service.get_dblp_listing_path, 'px') + self.assertRaises( + SQLAlchemyError, TestBrowseDatabaseService.database_service.get_dblp_authors, 'authx') @classmethod def tearDownClass(cls) -> None: diff --git a/tests/test_doi.py b/tests/test_doi.py index 124106d76..18772845b 100644 --- a/tests/test_doi.py +++ b/tests/test_doi.py @@ -13,6 +13,7 @@ from app import app +@unittest.skip('We will move this test and any required test data to arxiv-base in the near future') class DoiTest(unittest.TestCase): def setUp(self): diff --git a/tests/test_exceptions.py b/tests/test_exceptions.py index 54186696b..2ddc2e6d0 100644 --- a/tests/test_exceptions.py +++ b/tests/test_exceptions.py @@ -19,6 +19,8 @@ def setUp(self): wlog.disabled = True self.app = create_web_app() + self.app.testing = True + self.app.config['APPLICATION_ROOT'] = '' self.client = self.app.test_client() def test_404(self): @@ -54,7 +56,8 @@ def test_500(self, mock_abs): """Disable logging to avoid messy output during testing""" self.app.logger.disabled = True - response = self.client.get('/abs/1234.5678') - self.assertEqual(response.status_code, - status.HTTP_500_INTERNAL_SERVER_ERROR) - self.assertIn('text/html', response.content_type) + with self.assertRaises(AbsException): + response = self.client.get('/abs/1234.5678') + self.assertEqual(response.status_code, + status.HTTP_500_INTERNAL_SERVER_ERROR) + self.assertIn('text/html', response.content_type) diff --git a/tests/test_filters.py b/tests/test_filters.py index ee38e22e1..31c7d8e00 100644 --- a/tests/test_filters.py +++ b/tests/test_filters.py @@ -1,5 
+1,4 @@ import unittest -from hamcrest import * from functools import partial from jinja2 import escape, Markup, Environment @@ -7,229 +6,217 @@ from flask import appcontext_pushed, url_for from app import app -from browse.filters import line_feed_to_br, tex_to_utf, entity_to_utf -from browse.util.id_patterns import do_dois_to_tags, do_dois_id_urls_to_tags,\ - do_id_to_tags +from arxiv.base.urls import links, urlizer, urlize +from arxiv.base.filters import abstract_lf_to_br, f_tex2utf +from browse.filters import entity_to_utf +@unittest.skip("These test features implemented in arxiv-base so move these tests to arxiv-base") +class Jinja_Custom_Filters_Test(unittest.TestCase): + """Browse jinja filter tests.""" -def _id_to_url(id: str): - return url_for('browse.abstract', arxiv_id=id) - -def arxiv_urlize(txt): - return do_dois_id_urls_to_tags(_id_to_url,None,txt) - -def doi_urls(fn, txt): - return do_dois_to_tags(fn, txt) - -def arxiv_id_urls(txt): - return do_id_to_tags(_id_to_url, txt) - -class Jinja_Custom_Fitlers_Test(unittest.TestCase): def test_with_jinja(self): - jenv = Environment(autoescape=True) - jenv.filters['doi_urls'] = partial(doi_urls, lambda x: x) - assert_that( - jenv.from_string( - '{{"something 10.1103/PhysRevD.76.013009 or other"|doi_urls}}' - ).render(), - equal_to( - 'something 10.1103/PhysRevD.76.013009 or other' - )) + """Basic urlize DOI filter test.""" + with app.app_context(): + jenv = Environment(autoescape=True) + jenv.filters['urlize'] = urlizer( + ['doi'] + ) + self.assertEqual( + jenv.from_string( + '{{"something 10.1103/PhysRevD.76.013009 or other"|urlize}}' + ).render(), + 'something <a class="link-https link-external" data-doi="10.1103/PhysRevD.76.013009" href="https://arxiv.org/ct?url=https%3A%2F%2Fdx.doi.org%2F10.1103%2FPhysRevD.76.013009&amp;v=d0670bbf" rel="external noopener nofollow">10.1103/PhysRevD.76.013009</a> or other' + ) def test_with_jinja_escapes(self): - jenv = Environment(autoescape=True) - jenv.filters['doi_urls'] 
= partial(doi_urls, lambda x: x) - jenv.filters['arxiv_urlize'] = arxiv_urlize - assert_that( - jenv.from_string( - '{{"something 10.1103/PhysRevD.76.013009 or other"|doi_urls}}' - ).render(), - equal_to( - 'something 10.1103/PhysRevD.76.013009 or other' - )) - - assert_that( - jenv.from_string( - '{{" something 10.1103/PhysRevD.76.013009"|arxiv_urlize}}' - ).render(), - equal_to( - '<script>bad junk</script> something 10.1103/PhysRevD.76.013009' - )) + """Test the tex2utf filter with jinja escapes.""" + with app.app_context(): + jenv = Environment(autoescape=True) + jenv.filters['urlize'] = urlizer( + ['arxiv_id', 'doi'] + ) + + # TODO: urlize doesn't seem to return a Markup object? + self.assertEqual( + jenv.from_string( + '{{"something 10.1103/PhysRevD.76.013009 or other"|urlize}}' + ).render(), + 'something <a class="link-https link-external" data-doi="10.1103/PhysRevD.76.013009" href="https://arxiv.org/ct?url=https%3A%2F%2Fdx.doi.org%2F10.1103%2FPhysRevD.76.013009&amp;v=d0670bbf" rel="external noopener nofollow">10.1103/PhysRevD.76.013009</a> or other' + ) + + self.assertEqual( + jenv.from_string( + '{{" something 10.1103/PhysRevD.76.013009"|urlize}}' + ).render(), + '<script>bad junk</script> something <a class="link-https link-external" data-doi="10.1103/PhysRevD.76.013009" href="https://arxiv.org/ct?url=https%3A%2F%2Fdx.doi.org%2F10.1103%2FPhysRevD.76.013009&amp;v=d0670bbf" rel="external noopener nofollow">10.1103/PhysRevD.76.013009</a>' + ) def test_doi_filter(self): - doi_fn = partial(doi_urls, lambda x: x) - - s = '' - self.assertEqual(doi_fn(s), s) - - s = 'some test string 23$6#$5<>&456 http://google.com/notadoi' - assert_that(doi_fn(s), equal_to(escape(s))) - - doi = '10.1103/PhysRevD.76.013009' - doiurl = doi_fn(doi) - self.assertRegex(doiurl, r'^10.1103/PhysRevD.76.013009'))) - - s = f'something something {doi} endthing' - doiurl = doi_fn(s) - self.assertRegex(doiurl, r'10.1103/PhysRevA.99.013009' - ' 10.1103/PhysRevZ.44.023009' - ' 
10.1103/PhysRevX.90.012309' - ' 10.1103/BioRevX.44.123456' - ))) - - txt = '' - assert_that( - doi_fn(f'{doi} {txt}'), - equal_to( - Markup(f'10.1103/PhysRevD.76.013009 {escape(txt)}') - )) + """Test the urlizer DOI filter.""" + with app.app_context(): + s = 'some test string 23$6#$5<>&456 http://google.com/notadoi' + urlize_dois = urlizer( + ['doi'] + ) + self.assertEqual(urlize_dois(s), str(escape(s))) + + doi = '10.1103/PhysRevD.76.013009' + doiurl = urlize_dois(doi) + self.assertRegex(doiurl, r'^10.1103/PhysRevD.76.013009')) + ) + + s = f'something something {doi} endthing' + doiurl = urlize_dois(s) + self.assertRegex( + doiurl, r'10.1103/PhysRevA.99.013009 ' + '10.1103/PhysRevZ.44.023009 ' + '10.1103/PhysRevX.90.012309 ' + '10.1103/BioRevX.44.123456' + ) + ) + ) + + txt = '' + self.assertEqual( + urlize_dois(f'{doi} {txt}'), + str(Markup(f'10.1103/PhysRevD.76.013009 ')) + ) def test_arxiv_id_urls_basic(self): - h = 'sosmooth.org' # Totally bogus setup for testing, at least url_for returns something + """Test basic urlize for arXiv identifiers.""" + # a server name is needed for url_for to return something + h = 'arxiv.org' app.config['SERVER_NAME'] = h with app.app_context(): - assert_that(arxiv_id_urls(''), equal_to('')) + self.assertEqual(urlize('', ['arxiv_id']), '') s = 'some text 134#%$$%&^^%*^&(()*_)_<>?:;[}}' - assert_that( - arxiv_id_urls(s), equal_to(escape(s)), - 'filers should return marked up text, which means its escaped') - assert_that( - arxiv_id_urls('hep-th/9901001'), - equal_to( - f'hep-th/9901001', - )) - assert_that( - arxiv_id_urls('hep-th/9901001 hep-th/9901002'), - equal_to( - f'hep-th/9901001 hep-th/9901002' - )) + self.assertEqual(urlize(s), str(escape(s)), + 'filters should return escaped text') + self.assertEqual( + urlize('hep-th/9901001'), + f'hep-th/9901001', + ) + self.assertEqual( + urlize('hep-th/9901001 hep-th/9901002'), + f'hep-th/9901001 hep-th/9901002' + ) def test_arxiv_id_urls_3(self): + """Test more complex cases of 
urlize for arXiv identifiers.""" h = 'sosmooth.org' app.config['SERVER_NAME'] = h with app.app_context(): - assert_that( - arxiv_id_urls('hep-th/9901002'), - equal_to( - f'hep-th/9901002', - )) - assert_that( - arxiv_id_urls('hep-th/9901002\n'), - equal_to( - f'hep-th/9901002\n', - ), + self.assertEqual( + urlize('hep-th/9901002'), + f'hep-th/9901002', ) - assert_that( - arxiv_id_urls( - 'arXiv:dg-ga/9401001 hep-th/9901001 hep-th/9901002'), - equal_to( - f'arXiv:dg-ga/9401001 hep-th/9901001 hep-th/9901002', - ), + self.assertEqual( + urlize('hep-th/9901002\n'), + f'hep-th/9901002\n' + ) + self.assertEqual( + urlize('arXiv:dg-ga/9401001 hep-th/9901001 hep-th/9901002'), + f'arXiv:dg-ga/9401001 hep-th/9901001 hep-th/9901002' ) def test_arxiv_id_urls_punct(self): + """Test cases of of urlize for arXiv identifiers with punctuation.""" h = 'sosmooth.org' app.config['SERVER_NAME'] = h with app.app_context(): - assert_that( - arxiv_id_urls('hep-th/9901002.'), - equal_to( - f'hep-th/9901002.', - ), 'followed by period') - assert_that( - arxiv_id_urls('0702.0003.'), - equal_to( - f'0702.0003.', ), + self.assertEqual( + urlize('hep-th/9901002.'), + f'hep-th/9901002.', + 'followed by period') + self.assertEqual( + urlize('0702.0003.'), + f'0702.0003.', 'followed by period') - assert_that( - arxiv_id_urls('hep-th/9901001,hep-th/9901002'), - equal_to( - f'hep-th/9901001,hep-th/9901002' - ), 'filter_urls_ids_escape (ID linking) 3/7') - assert_that( - arxiv_id_urls('0702.0003, something'), - equal_to( - f'0702.0003, something', - ), 'followed by comma') - assert_that( - arxiv_id_urls('(0702.0003) something'), - equal_to( - f'(0702.0003) something', - ), 'in parens') + self.assertEqual( + urlize('hep-th/9901001,hep-th/9901002'), + f'hep-th/9901001,hep-th/9901002', + 'filter_urls_ids_escape (ID linking) 3/7') + self.assertEqual( + urlize('0702.0003, something'), + f'0702.0003, something', + 'followed by comma') + self.assertEqual( + urlize('(0702.0003) something'), + f'(0702.0003) 
something', + 'in parens') def test_arxiv_id_urls_more(self): + """Test urlize for arXiv identifiers that have mixed formatting.""" h = 'sosmooth.org' app.config['SERVER_NAME'] = h with app.app_context(): self.assertEqual( - arxiv_id_urls('arXiv:dg-ga/9401001 hep-th/9901001 0704.0001'), - f'arXiv:dg-ga/9401001 hep-th/9901001 0704.0001', - 'filter_urls_ids_escape (ID linking) 5/7') + urlize('arXiv:dg-ga/9401001 hep-th/9901001 0704.0001'), + f'arXiv:dg-ga/9401001 hep-th/9901001 0704.0001', + 'urlize (ID linking) 5/7') def test_arxiv_id_v(self): + """Test urlize for arXiv identifers with version affix.""" h = 'sosmooth.org' app.config['SERVER_NAME'] = h with app.app_context(): - assert_that( - arxiv_id_urls( - 'arXiv:dg-ga/9401001v12 hep-th/9901001v2 0704.0001v1'), - equal_to( - f'arXiv:dg-ga/9401001v12 hep-th/9901001v2 0704.0001v1' - ), 'arxiv ids with version numbers') + self.assertEqual( + urlize('arXiv:dg-ga/9401001v12 hep-th/9901001v2 0704.0001v1'), + f'arXiv:dg-ga/9401001v12 hep-th/9901001v2 0704.0001v1', + 'arxiv ids with version numbers') + @unittest.skip("TODO: confirm actual desired behavior") def test_vixra(self): + """Test urlize for identifiers prefixed by viXra.""" h = 'sosmooth.org' app.config['SERVER_NAME'] = h with app.app_context(): - assert_that( - arxiv_id_urls('viXra:0704.0001 viXra:1003.0123'), - equal_to('viXra:0704.0001 viXra:1003.0123')) + self.assertEqual(urlize('viXra:0704.0001 viXra:1003.0123'), + 'viXra:0704.0001 viXra:1003.0123') # this is what was expected in legacy, but it doesn't seem right: - # assert_that( - # arxiv_id_urls('vixra:0704.0001'), - # equal_to(f'vixra:0704.0001')) + # self.assertEqual( + # urlize('vixra:0704.0001'), + # f'vixra:0704.0001') def test_arxiv_id_urls_escaping(self): + """Test proper escaping when urlize applied.""" h = 'sosmooth.org' app.config['SERVER_NAME'] = h with app.app_context(): ax_id = 'hep-th/9901002' user_entered_txt = '
    div should be escaped
    ' - ex_txt = escape(user_entered_txt).__html__() - assert_that( - arxiv_id_urls(ax_id + user_entered_txt), - equal_to( - f'hep-th/9901002{ex_txt}' - ), + ex_txt = Markup(user_entered_txt) + self.assertEqual( + urlize(ax_id + user_entered_txt), + f'hep-th/9901002{ex_txt}', 'Dealing with user entered text with html that should be escaped for safety' ) jinja_escaped_txt = Markup( '
    div should already be escaped by jinja2
    ') - assert_that( - arxiv_id_urls(ax_id + jinja_escaped_txt), - equal_to( - f'hep-th/9901002{jinja_escaped_txt}' - ), 'Dealing with text that has been escaped by Jinja2 already') + self.assertEqual( + urlize(ax_id + jinja_escaped_txt), + f'hep-th/9901002{jinja_escaped_txt}', + 'Dealing with text that has been escaped by Jinja2 already') def test_arxiv_id_jinja_escapes(self): h = 'sosmooth.org' @@ -237,224 +224,209 @@ def test_arxiv_id_jinja_escapes(self): with app.app_context(): jenv = Environment(autoescape=True) - jenv.filters['arxiv_id_urls'] = arxiv_id_urls - jenv.filters['arxiv_urlize'] = arxiv_urlize + jenv.filters['urlize'] = jenv.filters['urlize'] = urlizer( + ['arxiv_id', 'doi', 'url'] + ) - assert_that( + self.assertEqual( jenv.from_string( - '{{"something hep-th/9901002 or other"|arxiv_id_urls}}'). + '{{"something hep-th/9901002 or other"|urlize|safe}}'). render(), - equal_to( - f'something hep-th/9901002 or other' - )) + f'something hep-th/9901002 or other' + ) - assert_that( + self.assertEqual( jenv.from_string( - '{{" something 10.1103/PhysRevD.76.013009"|arxiv_urlize}}' + '{{" something 10.1103/PhysRevD.76.013009"|urlize|safe}}' ).render(), - equal_to( - '<script>bad junk</script> something 10.1103/PhysRevD.76.013009' - )) + ' something 10.1103/PhysRevD.76.013009' + ) - assert_that( + self.assertEqual( jenv.from_string( '{{" http://google.com bla bla ' - 'hep-th/9901002 bla"|arxiv_urlize}}'). + 'hep-th/9901002 bla"|urlize|safe}}'). render(), - equal_to( - '<script>bad junk</script> ' - 'this http URL bla bla ' - f'hep-th/9901002 bla' - ), 'should not double escape') + ' ' + 'this http URL bla bla ' + f'hep-th/9901002 bla', + 'should not double escape') def test_line_break(self): - assert_that( - line_feed_to_br('blal\n bla'), equal_to('blal\n
    bla')) + """Test the abstract lf to br tag filter.""" + self.assertEqual(abstract_lf_to_br('blal\n bla'), 'blal\n
    bla') - assert_that( - line_feed_to_br('\nblal\n bla'), equal_to('\nblal\n
    bla')) + self.assertEqual(abstract_lf_to_br( + '\nblal\n bla'), '\nblal\n
    bla') - assert_that( - line_feed_to_br('\n blal\n bla'), equal_to('\n blal\n
    bla'), - 'need to not do
    on first line') - assert_that( - line_feed_to_br('blal\n\nbla'), equal_to('blal\nbla'), - 'skip blank lines') + self.assertEqual(abstract_lf_to_br('\n blal\n bla'), + '\n blal\n
    bla', + 'need to not do
    on first line') + self.assertEqual(abstract_lf_to_br('blal\n\nbla'), 'blal\nbla', + 'skip blank lines') def test_line_break_jinja(self): + """Test the abstract lf to br tag filter with urlize on arXiv IDs.""" h = 'sosmooth.org' app.config['SERVER_NAME'] = h with app.app_context(): jenv = Environment(autoescape=True) - jenv.filters['arxiv_urlize'] = arxiv_urlize - jenv.filters['line_break'] = line_feed_to_br + jenv.filters['urlize'] = urlize + jenv.filters['line_break'] = abstract_lf_to_br - assert_that( + self.assertEqual( jenv.from_string( '{{" http://google.com something or \n' '\n' 'no double \\n' ' should have br\n' - 'hep-th/9901002 other"|line_break|arxiv_urlize}}' + 'hep-th/9901002 other"|line_break|urlize|safe}}' ).render(), - equal_to( - '<script>bad junk</script> ' - 'this http URL' - ' something or \n' - 'no double \n' - '
    should have br\n' - 'hep-th/9901002 other' - ), + '<script>bad junk</script> ' + 'this http URL' + ' something or \n' + 'no double \n' + '
    should have br\n' + 'hep-th/9901002 other', 'line_break and arxiv_id_urls should work together' ) - def test_tex_to_utf(self): + def test_tex2utf(self): + """Test the tex2utf filter.""" h = 'sosmooth.org' app.config['SERVER_NAME'] = h with app.app_context(): jenv = Environment(autoescape=True) - jenv.filters['arxiv_id_urls'] = arxiv_id_urls - jenv.filters['line_break'] = line_feed_to_br - jenv.filters['doi_urls'] = partial(doi_urls, lambda x: x) - jenv.filters['arxiv_urlize'] = arxiv_urlize - jenv.filters['tex_to_utf'] = tex_to_utf + jenv.filters['urlize'] = urlize + jenv.filters['line_break'] = abstract_lf_to_br + jenv.filters['tex2utf'] = f_tex2utf - assert_that( - jenv.from_string('{{""|tex_to_utf|arxiv_id_urls}}').render(), - equal_to('')) + self.assertEqual( + jenv.from_string('{{""|tex2utf|urlize|safe}}').render(), + '' + ) title = jenv.from_string( - '{{"Finite-Size and Finite-Temperature Effects in the Conformally Invariant O(N) Vector Model for 2 ARXIVNG-1227' + 'Finite-Size and Finite-Temperature Effects in the Conformally Invariant O(N) Vector Model for 2<d<4', + 'tex2utf and arxiv_id_urls should handle < and > ARXIVNG-1227' ) - assert_that(tex_to_utf('Lu\\\'i'), equal_to('Luí')) - assert_that(tex_to_utf(Markup('Lu\\\'i')), equal_to('Luí')) - assert_that(tex_to_utf(Markup(escape('Lu\\\'i'))), equal_to('Luí')) + self.assertEqual(f_tex2utf('Lu\\\'i'), 'Luí') + self.assertEqual(f_tex2utf(Markup('Lu\\\'i')), 'Luí') + self.assertEqual(f_tex2utf(Markup(escape('Lu\\\'i'))), 'Luí') def test_entity_to_utf(self): + """Test the entity to utf filter.""" h = 'sosmooth.org' app.config['SERVER_NAME'] = h with app.app_context(): jenv = Environment(autoescape=True) - jenv.filters['arxiv_id_urls'] = arxiv_id_urls - jenv.filters['line_break'] = line_feed_to_br - jenv.filters['doi_urls'] = partial(doi_urls, lambda x: x) - jenv.filters['arxiv_urlize'] = arxiv_urlize - jenv.filters['tex_to_utf'] = tex_to_utf jenv.filters['entity_to_utf'] = entity_to_utf - assert_that( + 
self.assertEqual( jenv.from_string('{{ "Martín"|entity_to_utf }}').render(), - equal_to('Martín'), 'entity_to_utf should work') - assert_that( + 'Martín', 'entity_to_utf should work') + self.assertEqual( jenv.from_string( '{{ ""|entity_to_utf }}').render(), - equal_to('<Martín>'), + '<Martín>', 'entity_to_utf should work even with < or >') def test_arxiv_urlize_no_email_links(self): + """Test to ensure email addresses are not turned into links.""" h = 'sosmooth.org' app.config['SERVER_NAME'] = h with app.app_context(): jenv = Environment(autoescape=True) - jenv.filters['arxiv_urlize'] = arxiv_urlize + jenv.filters['urlize'] = urlize - assert_that( + self.assertEqual( jenv.from_string( - '{{ "bob@example.com"|arxiv_urlize }}').render(), - equal_to('bob@example.com'), + '{{ "bob@example.com"|urlize|safe }}').render(), + 'bob@example.com', 'arxiv_urlize should not turn emails into links') - assert_that( + self.assertEqual( jenv.from_string( - '{{ ""|arxiv_urlize }}').render(), - equal_to('<bob@example.com>'), + '{{ ""|urlize|safe }}').render(), + '<bob@example.com>', 'arxiv_urlize should work even with < or >') def test_arxiv_urlize(self): - - - + """Multiple basic urlize tests.""" h = 'sosmooth.org' app.config['SERVER_NAME'] = h with app.app_context(): - - def do_arxiv_urlize(txt): - return arxiv_urlize(txt) - self.assertEqual( - do_arxiv_urlize('http://example.com/'), - 'this http URL', - 'do_arxiv_urlize (URL linking) 1/6') + urlize('http://example.com/'), + 'this http URL', + 'urlize (URL linking) 1/6') self.assertEqual( - do_arxiv_urlize('https://example.com/'), - 'this https URL', - 'do_arxiv_urlize (URL linking) 2/6') + urlize('https://example.com/'), + 'this https URL', + 'urlize (URL linking) 2/6') self.assertEqual( - do_arxiv_urlize('ftp://example.com/'), - 'this ftp URL', - 'do_arxiv_urlize (URL linking) 3/6') + urlize('ftp://example.com/'), + 'this ftp URL', + 'urlize (URL linking) 3/6') self.assertEqual( - 
do_arxiv_urlize('http://example.com/.hep-th/9901001'), - 'this http URL', - 'do_arxiv_urlize (URL linking) 4/6') + urlize('http://example.com/.hep-th/9901001'), + 'this http URL', + 'urlize (URL linking) 4/6') self.assertEqual( - do_arxiv_urlize( + urlize( 'http://projecteuclid.org/euclid.bj/1151525136' ), - 'this http URL', - 'do_arxiv_urlize (URL linking) 6/6') - assert_that( - do_arxiv_urlize(' Correction to Bernoulli (2006), 12, 551--570 http://projecteuclid.org/euclid.bj/1151525136'), - equal_to(Markup(' Correction to Bernoulli (2006), 12, 551--570 this http URL')), - 'do_arxiv_urlize (URL linking) 6/6') + 'this http URL', + 'urlize (URL linking) 6/6') + self.assertEqual( + urlize( + ' Correction to Bernoulli (2006), 12, 551--570 http://projecteuclid.org/euclid.bj/1151525136'), + Markup(' Correction to Bernoulli (2006), 12, 551--570 this http URL'), + 'urlize (URL linking) 6/6') # shouldn't match self.assertEqual( - do_arxiv_urlize('2448446.4710(5)'), '2448446.4710(5)', - 'do_arxiv_urlize (should not match) 1/9') + urlize('2448446.4710(5)'), '2448446.4710(5)', + 'urlize (should not match) 1/9') self.assertEqual( - do_arxiv_urlize('HJD=2450274.4156+/-0.0009'), + urlize('HJD=2450274.4156+/-0.0009'), 'HJD=2450274.4156+/-0.0009', - 'do_arxiv_urlize (should not match) 2/9') + 'urlize (should not match) 2/9') self.assertEqual( - do_arxiv_urlize('T_min[HJD]=49238.83662(14)+0.146352739(11)E.'), + urlize('T_min[HJD]=49238.83662(14)+0.146352739(11)E.'), 'T_min[HJD]=49238.83662(14)+0.146352739(11)E.', - 'do_arxiv_urlize (should not match) 3/9') + 'urlize (should not match) 3/9') self.assertEqual( - do_arxiv_urlize('Pspin=1008.3408s'), 'Pspin=1008.3408s', - 'do_arxiv_urlize (should not match) 4/9') + urlize('Pspin=1008.3408s'), 'Pspin=1008.3408s', + 'urlize (should not match) 4/9') self.assertEqual( - do_arxiv_urlize('2453527.87455^{+0.00085}_{-0.00091}'), + urlize('2453527.87455^{+0.00085}_{-0.00091}'), '2453527.87455^{+0.00085}_{-0.00091}', - 'do_arxiv_urlize (should 
not match) 5/9') - self.assertEqual( - do_arxiv_urlize('2451435.4353'), '2451435.4353', - 'do_arxiv_urlize (should not match) 6/9') - assert_that( - do_arxiv_urlize('cond-mat/97063007'), - equal_to('cond-mat/97063007'), - 'do_arxiv_urlize (should match) 7/9') - - assert_that( - do_arxiv_urlize('[http://onion.com/something-funny-about-arxiv-1234]'), - equal_to('[this http URL]')) - - assert_that( - do_arxiv_urlize('[http://onion.com/?q=something-funny-about-arxiv.1234]'), - equal_to('[this http URL]')) - - assert_that( - do_arxiv_urlize('http://onion.com/?q=something funny'), - equal_to('this http URL funny'), + 'urlize (should not match) 5/9') + self.assertEqual( + urlize('2451435.4353'), '2451435.4353', + 'urlize (should not match) 6/9') + self.assertEqual( + urlize('cond-mat/97063007'), + 'cond-mat/97063007', + 'urlize (should match) 7/9') + + self.assertEqual( + urlize('[http://onion.com/something-funny-about-arxiv-1234]'), + '[this http URL]') + + self.assertEqual( + urlize('[http://onion.com/?q=something-funny-about-arxiv.1234]'), + '[this http URL]') + + self.assertEqual( + urlize('http://onion.com/?q=something funny'), + 'this http URL funny', 'Spaces CANNOT be expected to be part of URLs') - - assert_that( - do_arxiv_urlize('"http://onion.com/something-funny-about-arxiv-1234"'), - equal_to(Markup('"this http URL"')), - 'Should handle URL surrounded by double quotes') + self.assertEqual( + urlize('"http://onion.com/something-funny-about-arxiv-1234"'), + '"this http URL"', + 'Should handle URL surrounded by double quotes') diff --git a/tests/test_id_patterns.py b/tests/test_id_patterns.py deleted file mode 100644 index b566c95d5..000000000 --- a/tests/test_id_patterns.py +++ /dev/null @@ -1,253 +0,0 @@ -import unittest -from hamcrest import * -import re - -from jinja2 import Markup, escape -from browse.util.id_patterns import _find_match, Matchable, dois_ids_and_urls, \ - _transform_token, do_dois_id_urls_to_tags - - -class 
Id_Patterns_Test(unittest.TestCase): - - def test_basic(self): - _find_match([], 'test') - - m = _find_match( [Matchable([], re.compile(r'test'))], 'test') - assert_that(m, is_not(None)) - - m0 = Matchable([], re.compile(r'test')) - m1 = Matchable([], re.compile(r'tests')) - - m = _find_match( [m0, m1], 'test') - assert_that(m, is_not(None)) - assert_that(m[1], equal_to(m0)) - - m = _find_match([m0, m1], 'tests') - assert_that(m, is_not(None)) - assert_that(m[1], equal_to(m0)) - - m = _find_match([m1, m0], 'tests') - assert_that(m, is_not(None)) - assert_that(m[1], equal_to(m1)) - - def test_arxiv_ids(self): - def find_match(txt): - return _find_match(dois_ids_and_urls, txt) - - assert_that(find_match('math/9901123'), is_not(None)) - assert_that(find_match('hep-ex/9901123'), is_not(None)) - assert_that(find_match('gr-qc/9901123'), is_not(None)) - - assert_that(find_match('1202.1234'), is_not(None)) - assert_that(find_match('1202.1234v1'), is_not(None)) - assert_that(find_match('1203.12345'), is_not(None)) - assert_that(find_match('1203.12345v1'), is_not(None)) - assert_that(find_match('1203.12345v12'), is_not(None)) - - # slightly odd but seen in comments - assert_that(find_match('hep-ph/1203.12345v12'), is_not(None)) - - def test_find_match(self): - def find_match(txt): - return _find_match(dois_ids_and_urls, txt) - - assert_that(find_match('junk'), equal_to(None)) - assert_that(find_match(''), equal_to(None)) - assert_that(find_match(' '), equal_to(None)) - - assert_that(find_match('doi:10.1002/0470841559.ch1'), is_not(None)) - assert_that(find_match('doi:10.1038/nphys1170'), is_not(None)) - - assert_that(find_match('http://arxiv.org'), is_not(None)) - assert_that(find_match('http://arxiv.org?something=1'), is_not(None)) - assert_that(find_match( - 'http://arxiv.org?something=1&anohter=2'), is_not(None)) - assert_that(find_match('"http://arxiv.org"'), is_not(None)) - - def test_transform_token(self): - # def doi_id_url_transform_token(tkn,fn): - # return 
doi_id_url_transform_token(fn, tkn) - - assert_that( - do_dois_id_urls_to_tags( None, None, ''), - equal_to('')) - - assert_that( - do_dois_id_urls_to_tags( None, None, - 'it is fine, chapter 234 see'), - equal_to(Markup(escape('it is fine, chapter 234 see')))) - - assert_that( - do_dois_id_urls_to_tags( None, None, 'http://arxiv.org'), - equal_to('this http URL')) - - assert_that( - do_dois_id_urls_to_tags( None, None, - 'Stuff in the front http://arxiv.org other stuff'), - equal_to('Stuff in the front this http URL other stuff')) - - assert_that( - do_dois_id_urls_to_tags( None, None, '.http://arxiv.org.'), - equal_to('.this http URL.')) - - assert_that( - do_dois_id_urls_to_tags( None, None, '"http://arxiv.org"'), - equal_to(Markup('"this http URL"'))) - - def test_urlize(self): - def do_arxiv_urlize(txt): - return do_dois_id_urls_to_tags(None,None, txt) - - assert_that( - do_arxiv_urlize('http://example.com/'), - equal_to('this http URL'), - 'do_arxiv_urlize (URL linking) 1/6') - assert_that( - do_arxiv_urlize('https://example.com/'), - equal_to('this https URL'), - 'do_arxiv_urlize (URL linking) 2/6') - assert_that( - do_arxiv_urlize('ftp://example.com/'), - equal_to('this ftp URL'), - 'do_arxiv_urlize (URL linking) 3/6') - - - assert_that( - do_arxiv_urlize( - 'http://projecteuclid.org/euclid.bj/1151525136' - ), - equal_to('this http URL'), - 'do_arxiv_urlize (URL linking) 6/6') - # assert_that( - # do_arxiv_urlize( - # ' Correction to Bernoulli (2006), 12, 551--570 http://projecteuclid.org/euclid.bj/1151525136'), - # equal_to( - # ' Correction to Bernoulli (2006), 12, 551--570 this http URL'), - # 'do_arxiv_urlize (URL linking) 6/6') - # shouldn't match - assert_that( - do_arxiv_urlize('2448446.4710(5)'), '2448446.4710(5)', - equal_to('do_arxiv_urlize (should not match) 1/9')) - self.assertEqual( - do_arxiv_urlize('HJD=2450274.4156+/-0.0009'), - 'HJD=2450274.4156+/-0.0009', - 'do_arxiv_urlize (should not match) 2/9') - assert_that( - do_arxiv_urlize( - 
'T_min[HJD]=49238.83662(14)+0.146352739(11)E.'), - equal_to('T_min[HJD]=49238.83662(14)+0.146352739(11)E.'), - 'do_arxiv_urlize (should not match) 3/9') - assert_that( - do_arxiv_urlize('Pspin=1008.3408s'), 'Pspin=1008.3408s', - equal_to('do_arxiv_urlize (should not match) 4/9')) - assert_that( - do_arxiv_urlize('2453527.87455^{+0.00085}_{-0.00091}'), - equal_to('2453527.87455^{+0.00085}_{-0.00091}'), - 'do_arxiv_urlize (should not match) 5/9') - assert_that( - do_arxiv_urlize('2451435.4353'), equal_to('2451435.4353'), - 'do_arxiv_urlize (should not match) 6/9') - - - assert_that( - do_arxiv_urlize('cond-mat/97063007'), - equal_to( - 'cond-mat/97063007'), - 'do_arxiv_urlize (should match) 7/9') - - assert_that( - do_arxiv_urlize( - '[http://onion.com/something-funny-about-arxiv-1234]'), - equal_to('[this http URL]')) - - assert_that( - do_arxiv_urlize( - '[http://onion.com/?q=something-funny-about-arxiv.1234]'), - equal_to('[this http URL]')) - - assert_that( - do_arxiv_urlize('http://onion.com/?q=something funny'), - equal_to( - 'this http URL funny'), - 'Spaces CANNOT be expected to be part of URLs') - - assert_that( - do_arxiv_urlize( - '"http://onion.com/something-funny-about-arxiv-1234"'), - equal_to( - Markup('"this http URL"')), - 'Should handle URL surrounded by double quotes') - - assert_that( - do_arxiv_urlize('< http://example.com/1<2 ><'), - equal_to('< this http URL<2 ><'), - 'do_arxiv_urlize (URL linking) 5/6') - - assert_that( - do_arxiv_urlize('Accepted for publication in A&A. The data will be available via CDS, and can be found "http://atlasgal.mpifr-bonn.mpg.de/cgi-bin/ATLASGAL_FILAMENTS.cgi"'), - equal_to('Accepted for publication in A&A. 
The data will be available via CDS, and can be found "this http URL"') - ) - - assert_that( - do_arxiv_urlize('see http://www.tandfonline.com/doi/abs/doi:10.1080/15980316.2013.860928?journalCode=tjid20'), - equal_to('see this http URL') - ) - - assert_that( - do_arxiv_urlize('http://authors.elsevier.com/a/1TcSd,Ig45ZtO'), - equal_to('this http URL')) - - def category_id_test(self): - def do_arxiv_urlize(txt): - return do_dois_id_urls_to_tags(None,None, txt) - - assert_that( - do_arxiv_urlize('version of arXiv.math.GR/0512484 (2011).'), - equal_to('version of arXiv.math.GR/0512484 (2011).')) - - def hosts_tests(self): - def do_arxiv_urlize(txt): - return do_dois_id_urls_to_tags(None,None, txt) - - assert_that(do_arxiv_urlize('can be downloaded from http://rwcc.bao.ac.cn:8001/swap/NLFFF_DBIE_code/HeHan_NLFFF_JGR.pdf'), - equal_to("can be downloaded from this http URL"), - "Should deal with ports correctly") - - - assert_that(do_arxiv_urlize("images is at http://85.20.11.14/hosting/punsly/APJLetter4.2.07/"), - equal_to('images is at this http URL'), - "should deal with numeric IP correctly") - - def urls_with_plus(self): - def do_arxiv_urlize(txt): - return do_dois_id_urls_to_tags(lambda x: x, lambda x:x, txt) - - assert_that(do_arxiv_urlize('http://www.fkf.mpg.de/andersen/docs/pub/abstract2004+/pavarini_02.pdf'), - equal_to("this http URL"), - "Should deal with plus in URL correctly") - - def anchors_with_slash(self): - def do_arxiv_urlize(txt): - return do_dois_id_urls_to_tags(lambda x: x, lambda x:x, txt) - - assert_that(do_arxiv_urlize('https://dms.sztaki.hu/ecml-pkkd-2016/#/app/privateleaderboard'), - equal_to(Markup("this https URL")), - "Should deal with slash in URL anchor correctly") - - def ftp_test(self): - cmt = "7 Pages; ftp://ftp%40micrognu%2Ecom:anon%40anon@ftp.micrognu.com/pnenp/conclusion.pdf" - def do_arxiv_urlize(txt): - return do_dois_id_urls_to_tags(lambda x: x, lambda x:x, txt) - - assert_that(do_arxiv_urlize(cmt), - equal_to(Markup('7 Pages; 
this ftp URL'))) - - - def arxiv_prefix_test(self): - - def do_arxiv_urlize(txt): - return do_dois_id_urls_to_tags(lambda x: x, lambda x:x, txt) - - cmt = "see arxiv:1201.12345" - assert_that(do_arxiv_urlize(cmt), - equal_to(Markup('see arXiv:1201.12345'))) diff --git a/tests/test_list_page.py b/tests/test_list_page.py new file mode 100644 index 000000000..8ecee2f9c --- /dev/null +++ b/tests/test_list_page.py @@ -0,0 +1,485 @@ +import unittest +import re +from hamcrest import * +from unittest.mock import MagicMock + +from bs4 import BeautifulSoup + +from tests.test_abs_parser import ABS_FILES +from browse.services.document.metadata import AbsMetaSession +from browse.domain.license import ASSUMED_LICENSE_URI +from browse.services.listing.fake_listings import FakeListingFilesService +from browse.services.listing import ListingService, get_listing_service +import os + +from app import app + + +class ListPageTest(unittest.TestCase): + + def setUp(self): + app.testing = True + app.config['APPLICATION_ROOT'] = '' + self.app = app + self.client = app.test_client() + + def test_basic_lists(self): + rv = self.client.get('/list/hep-ph/0901') + self.assertEqual(rv.status_code, 200) + self.assertNotEqual(rv.headers.get('Expires', None), None) + + rv = self.client.get('/list/hep-ph/09') + self.assertEqual(rv.status_code, 200) + self.assertNotEqual(rv.headers.get('Expires', None), None) + + rv = self.client.get('/list/hep-ph/new') + self.assertEqual(rv.status_code, 200) + self.assertNotEqual(rv.headers.get('Expires', None), None) + + rv = self.client.get('/list/hep-ph/current') + self.assertEqual(rv.status_code, 200) + self.assertNotEqual(rv.headers.get('Expires', None), None) + + rv = self.client.get('/list/hep-ph/pastweek') + self.assertEqual(rv.status_code, 200) + self.assertNotEqual(rv.headers.get('Expires', None), None) + + rv = self.client.get('/list/hep-ph/recent') + self.assertEqual(rv.status_code, 200) + self.assertNotEqual(rv.headers.get('Expires', None), None) + + rv 
= self.client.get('/list/hep-ph/0901?skip=925&show=25') + self.assertEqual(rv.status_code, 200) + self.assertNotEqual(rv.headers.get('Expires', None), None) + + rv = self.client.get('/list/astro-ph/04') + self.assertEqual(rv.status_code, 200) + self.assertNotEqual(rv.headers.get('Expires', None), None) + + rv = self.client.get('/list/math/92') + self.assertEqual(rv.status_code, 200) + self.assertNotEqual(rv.headers.get('Expires', None), None) + + rv = self.client.get('/list/math/9201') + self.assertEqual(rv.status_code, 200) + + rv = self.client.get('/list/math/0101') + self.assertEqual(rv.status_code, 200) + + rv = self.client.get('/list/math/0102') + self.assertEqual(rv.status_code, 200) + + rv = self.client.get('/list/math/0103') + self.assertEqual(rv.status_code, 200) + + rv = self.client.get('/list/math/0104') + self.assertEqual(rv.status_code, 200) + + rv = self.client.get('/list/math/0105') + self.assertEqual(rv.status_code, 200) + + rv = self.client.get('/list/math/0106') + self.assertEqual(rv.status_code, 200) + + rv = self.client.get('/list/math/0107') + self.assertEqual(rv.status_code, 200) + + rv = self.client.get('/list/math/0108') + self.assertEqual(rv.status_code, 200) + + rv = self.client.get('/list/math/0109') + self.assertEqual(rv.status_code, 200) + + rv = self.client.get('/list/math/0110') + self.assertEqual(rv.status_code, 200) + + rv = self.client.get('/list/math/0111') + self.assertEqual(rv.status_code, 200) + + rv = self.client.get('/list/math/0112') + self.assertEqual(rv.status_code, 200) + + rv = self.client.get('/list/math/01') + self.assertEqual(rv.status_code, 200) + + rv = self.client.get('/list/math/18') + self.assertEqual(rv.status_code, 200) + + rv = self.client.get('/list/math/20') # year 2020 + self.assertEqual(rv.status_code, 200) + + rv = self.client.get('/list/math/30') # year 2030 + self.assertEqual(rv.status_code, 200) + + rv = self.client.get('/list/math/200101') + self.assertEqual(rv.status_code, 200) + + def 
test_listing_authors(self): + rv = self.client.get('/list/hep-ph/0901') + self.assertEqual(rv.status_code, 200) + au = b'Eqab M. Rabei' + assert au in rv.data, f'Simple check for author {au} in response.' + + html = BeautifulSoup(rv.data.decode('utf-8'), 'html.parser') + + auDivs = html.find_all('div', 'list-authors') + assert_that(auDivs, not_none()) + assert_that(len(auDivs), greater_than( + 5), 'Should have some .list-author divs') + + first_aus = auDivs[0].find_all('a') + assert_that(first_aus, has_length(4), + 'expect 4 tags for first artcile "Fractional WKB Approximation"') + + assert_that(first_aus[0].get_text(), equal_to('Eqab M. Rabei')) + assert_that(first_aus[1].get_text(), + equal_to('Ibrahim M. A. Altarazi')) + assert_that(first_aus[2].get_text(), equal_to('Sami I. Muslih')) + assert_that(first_aus[3].get_text(), equal_to('Dumitru Baleanu')) + + assert_that(auDivs[0].get_text(), is_not(contains_string(' ,')), + 'Should not have a comma with a space in front of it') + + def test_paging_first(self): + rv = self.client.get('/list/hep-ph/0901') + self.assertEqual(rv.status_code, 200) + + rvdata = rv.data.decode('utf-8') + html = BeautifulSoup(rvdata, 'html.parser') + + paging = html.find(id='dlpage').find_all('div')[0] + assert_that(paging, not_none()) + tgs = paging.find_all(['span', 'a']) + assert_that(tgs, not_none()) + assert_that(len(tgs), 6) + + assert_that(tgs[0].name, equal_to('span')) + assert_that(tgs[0].get_text(), equal_to('1-25')) + + assert_that(tgs[1].name, equal_to('a')) + assert_that(tgs[1].get_text(), equal_to('26-50')) + + assert_that(tgs[2].name, equal_to('a')) + assert_that(tgs[2].get_text(), equal_to('51-75')) + + assert_that(tgs[3].name, equal_to('a')) + assert_that(tgs[3].get_text(), equal_to('76-100')) + + assert_that(tgs[4].name, equal_to('span')) + assert_that(tgs[4].get_text(), equal_to('...')) + + assert_that(tgs[5].name, equal_to('a')) + assert_that(tgs[5].get_text(), equal_to('1001-1001')) + + # find the first article index 
tag + first_index_atag = html.find(id='articles').find_all( + 'dt')[0].find('a', string=re.compile(r'\[\d*\]')) + assert_that(first_index_atag, not_none()) + assert_that(first_index_atag['name'], equal_to('item1')) + assert_that(first_index_atag.string, equal_to('[1]')) + + def test_paging_second(self): + rv = self.client.get('/list/hep-ph/0901?skip=25&show=25') + self.assertEqual(rv.status_code, 200) + + rvdata = rv.data.decode('utf-8') + html = BeautifulSoup(rvdata, 'html.parser') + + paging = html.find(id='dlpage').find_all('div')[0] + assert_that(paging, not_none()) + tgs = paging.find_all(['span', 'a']) + assert_that(tgs, not_none()) + assert_that(len(tgs), 7) + + assert_that(tgs[0].name, equal_to('a')) + assert_that(tgs[0].get_text(), equal_to('1-25')) + + assert_that(tgs[1].name, equal_to('span')) + assert_that(tgs[1].get_text(), equal_to('26-50')) + + assert_that(tgs[2].name, equal_to('a')) + assert_that(tgs[2].get_text(), equal_to('51-75')) + + assert_that(tgs[3].name, equal_to('a')) + assert_that(tgs[3].get_text(), equal_to('76-100')) + + assert_that(tgs[4].name, equal_to('a')) + assert_that(tgs[4].get_text(), equal_to('101-125')) + + assert_that(tgs[5].name, equal_to('span')) + assert_that(tgs[5].get_text(), equal_to('...')) + + assert_that(tgs[6].name, equal_to('a')) + assert_that(tgs[6].get_text(), equal_to('1001-1001')) + + # find the first article index tag + first_index_atag = html.find(id='articles').find_all( + 'dt')[0].find('a', string=re.compile(r'\[\d*\]')) + assert_that(first_index_atag, not_none()) + assert_that(first_index_atag['name'], is_not( + 'item1'), 'first item index should not be 1') + assert_that(first_index_atag.string, equal_to('[26]')) + + def test_paging_middle(self): + rv = self.client.get('/list/hep-ph/0901?skip=175&show=25') + self.assertEqual(rv.status_code, 200) + + rvdata = rv.data.decode('utf-8') + html = BeautifulSoup(rvdata, 'html.parser') + + paging = html.find(id='dlpage').find_all('div')[0] + assert_that(paging, 
not_none()) + tgs = paging.find_all(['span', 'a']) + assert_that(tgs, not_none()) + assert_that(len(tgs), 7) + + assert_that(tgs[0].name, equal_to('a')) + assert_that(tgs[0].get_text(), equal_to('1-25')) + + assert_that(tgs[1].name, equal_to('span')) + assert_that(tgs[1].get_text(), equal_to('...')) + + assert_that(tgs[2].name, equal_to('a')) + assert_that(tgs[2].get_text(), equal_to('101-125')) + + assert_that(tgs[3].name, equal_to('a')) + assert_that(tgs[3].get_text(), equal_to('126-150')) + + assert_that(tgs[4].name, equal_to('a')) + assert_that(tgs[4].get_text(), equal_to('151-175')) + + assert_that(tgs[5].name, equal_to('span')) + assert_that(tgs[5].get_text(), equal_to('176-200')) + + assert_that(tgs[6].name, equal_to('a')) + assert_that(tgs[6].get_text(), equal_to('201-225')) + + assert_that(tgs[7].name, equal_to('a')) + assert_that(tgs[7].get_text(), equal_to('226-250')) + + assert_that(tgs[8].name, equal_to('a')) + assert_that(tgs[8].get_text(), equal_to('251-275')) + + assert_that(tgs[9].name, equal_to('span')) + assert_that(tgs[9].get_text(), equal_to('...')) + + assert_that(tgs[10].name, equal_to('a')) + assert_that(tgs[10].get_text(), equal_to('1001-1001')) + + # find the first article index tag + first_index_atag = html.find(id='articles').find_all( + 'dt')[0].find('a', string=re.compile(r'\[\d*\]')) + assert_that(first_index_atag, not_none()) + assert_that(first_index_atag['name'], is_not( + 'item1'), 'first item index should not be 1') + assert_that(first_index_atag.string, equal_to('[176]')) + + def test_paging_last(self): + rv = self.client.get('/list/hep-ph/0901?skip=1000&show=25') + self.assertEqual(rv.status_code, 200) + + rvdata = rv.data.decode('utf-8') + html = BeautifulSoup(rvdata, 'html.parser') + + paging = html.find(id='dlpage').find_all('div')[0] + assert_that(paging, not_none()) + tgs = paging.find_all(['span', 'a']) + assert_that(tgs, not_none()) + assert_that(len(tgs), 7) + + assert_that(tgs[0].name, equal_to('a')) + 
assert_that(tgs[0].get_text(), equal_to('1-25')) + + assert_that(tgs[1].name, equal_to('span')) + assert_that(tgs[1].get_text(), equal_to('...')) + + assert_that(tgs[2].name, equal_to('a')) + assert_that(tgs[2].get_text(), equal_to('926-950')) + + assert_that(tgs[3].name, equal_to('a')) + assert_that(tgs[3].get_text(), equal_to('951-975')) + + assert_that(tgs[4].name, equal_to('a')) + assert_that(tgs[4].get_text(), equal_to('976-1000')) + + assert_that(tgs[5].name, equal_to('span')) + assert_that(tgs[5].get_text(), equal_to('1001-1001')) + + # find the first article index tag + first_index_atag = html.find(id='articles').find_all( + 'dt')[0].find('a', string=re.compile(r'\[\d*\]')) + assert_that(first_index_atag, not_none()) + assert_that(first_index_atag['name'], is_not( + 'item1'), 'first item index should not be 1') + assert_that(first_index_atag.string, equal_to('[1001]')) + + def test_paging_penultimate(self): + rv = self.client.get('/list/hep-ph/0901?skip=975&show=25') + self.assertEqual(rv.status_code, 200) + + rvdata = rv.data.decode('utf-8') + html = BeautifulSoup(rvdata, 'html.parser') + + paging = html.find(id='dlpage').find_all('div')[0] + assert_that(paging, not_none()) + tgs = paging.find_all(['span', 'a']) + assert_that(tgs, not_none()) + assert_that(len(tgs), 7) + + assert_that(tgs[0].name, equal_to('a')) + assert_that(tgs[0].get_text(), equal_to('1-25')) + + assert_that(tgs[1].name, equal_to('span')) + assert_that(tgs[1].get_text(), equal_to('...')) + + assert_that(tgs[2].name, equal_to('a')) + assert_that(tgs[2].get_text(), equal_to('901-925')) + + assert_that(tgs[3].name, equal_to('a')) + assert_that(tgs[3].get_text(), equal_to('926-950')) + + assert_that(tgs[4].name, equal_to('a')) + assert_that(tgs[4].get_text(), equal_to('951-975')) + + assert_that(tgs[5].name, equal_to('span')) + assert_that(tgs[5].get_text(), equal_to('976-1000')) + + assert_that(tgs[6].name, equal_to('a')) + assert_that(tgs[6].get_text(), equal_to('1001-1001')) + + # find 
the first article index tag + first_index_atag = html.find(id='articles').find_all( + 'dt')[0].find('a', string=re.compile(r'\[\d*\]')) + assert_that(first_index_atag, not_none()) + assert_that(first_index_atag['name'], is_not( + 'item1'), 'first item index should not be 1') + assert_that(first_index_atag.string, equal_to('[976]')) + + def test_paging_925(self): + rv = self.client.get('/list/hep-ph/0901?skip=925&show=25') + self.assertEqual(rv.status_code, 200) + + rvdata = rv.data.decode('utf-8') + html = BeautifulSoup(rvdata, 'html.parser') + + paging = html.find(id='dlpage').find_all('div')[0] + assert_that(paging, not_none()) + tgs = paging.find_all(['span', 'a']) + assert_that(tgs, not_none()) + assert_that(len(tgs), 7) + + assert_that(tgs[0].name, equal_to('a')) + assert_that(tgs[0].get_text(), equal_to('1-25')) + + assert_that(tgs[1].name, equal_to('span')) + assert_that(tgs[1].get_text(), equal_to('...')) + + assert_that(tgs[2].name, equal_to('a')) + assert_that(tgs[2].get_text(), equal_to('851-875')) + + assert_that(tgs[3].name, equal_to('a')) + assert_that(tgs[3].get_text(), equal_to('876-900')) + + assert_that(tgs[4].name, equal_to('a')) + assert_that(tgs[4].get_text(), equal_to('901-925')) + + assert_that(tgs[5].name, equal_to('span')) + assert_that(tgs[5].get_text(), equal_to('926-950')) + + assert_that(tgs[6].name, equal_to('a')) + assert_that(tgs[6].get_text(), equal_to('951-975')) + + assert_that(tgs[7].name, equal_to('a')) + assert_that(tgs[7].get_text(), equal_to('976-1000')) + + assert_that(tgs[8].name, equal_to('a')) + assert_that(tgs[8].get_text(), equal_to('1001-1001')) + + # find the first article index tag + first_index_atag = html.find(id='articles').find_all( + 'dt')[0].find('a', string=re.compile(r'\[\d*\]')) + assert_that(first_index_atag, not_none()) + assert_that(first_index_atag['name'], is_not( + 'item1'), 'first item index should not be 1') + assert_that(first_index_atag.string, equal_to('[926]')) + + def test_odd_requests(self): + 
rv = self.client.get('/list/hep-ph/0901?skip=925&show=1000000') + self.assertEqual(rv.status_code, 200) + + rv = self.client.get('/list/hep-ph/bogusTimePeriod') + self.assertNotEqual(rv.status_code, 200) + + rv = self.client.get('/list/junkarchive') + self.assertNotEqual(rv.status_code, 200) + + rv = self.client.get('/list/ao-si/0901?skip=925&show=25') + self.assertNotEqual(rv.status_code, 200) + + rv = self.client.get('/list/math/0100') + self.assertNotEqual(rv.status_code, 200) + + rv = self.client.get('/list/math/0113') + self.assertNotEqual(rv.status_code, 200) + + rv = self.client.get('/list/math/0199') + self.assertNotEqual(rv.status_code, 200) + + rv = self.client.get('/list/math/200199') + self.assertNotEqual(rv.status_code, 200) + + rv = self.client.get('/list/math/2') + self.assertNotEqual(rv.status_code, 200) + + rv = self.client.get('/list/math/2001999999') + self.assertNotEqual(rv.status_code, 200) + + def test_abs_service(self): + service = ListingService() + assert_that(calling(service.list_articles_by_year).with_args('a', 1, 1, 1, 1), + raises(NotImplementedError)) + assert_that(calling(service.list_articles_by_month).with_args('a', 1, 1, 1, 1), + raises(NotImplementedError)) + assert_that(calling(service.list_new_articles).with_args('a', 1, 1), + raises(NotImplementedError)) + assert_that(calling(service.list_pastweek_articles).with_args('a', 1, 1), + raises(NotImplementedError)) + + assert_that(service.version(), is_not(None)) + + def test_not_modified_from_listing_service(self): + with self.app.app_context(): + flservice = get_listing_service() + flservice.list_new_articles = MagicMock(return_value={'not_modified': True, + 'expires': 'Wed, 21 Oct 2015 07:28:00 GMT'}) + rv = self.client.get('/list/hep-ph/new') + self.assertEqual( + rv.status_code, 304, '/list controller should return 304 when service indicates not-modified') + + flservice.list_pastweek_articles = MagicMock(return_value={'not_modified': True, + 'expires': 'Wed, 21 Oct 2015 07:28:00 
GMT'}) + rv = self.client.get('/list/hep-ph/recent') + self.assertEqual( + rv.status_code, 304, '/list controller should return 304 when service indicates not-modified') + rv = self.client.get('/list/hep-ph/pastweek') + self.assertEqual( + rv.status_code, 304, '/list controller should return 304 when service indicates not-modified') + + flservice.list_articles_by_month = MagicMock(return_value={'not_modified': True, + 'expires': 'Wed, 21 Oct 2015 07:28:00 GMT'}) + rv = self.client.get('/list/hep-ph/1801') + self.assertEqual( + rv.status_code, 304, '/list controller should return 304 when service indicates not-modified') + + flservice.list_articles_by_year = MagicMock(return_value={'not_modified': True, + 'expires': 'Wed, 21 Oct 2015 07:28:00 GMT'}) + rv = self.client.get('/list/hep-ph/18') + self.assertEqual( + rv.status_code, 304, '/list controller should return 304 when service indicates not-modified') + + def test_list_called_from_archive(self): + rv = self.client.get('/list/?archive=hep-ph&year=08&month=03&submit=Go') + self.assertEqual(rv.status_code, 200) + + rv = self.client.get('/list/?archive=hep-ph&year=08&month=all&submit=Go') + self.assertEqual(rv.status_code, 200) diff --git a/tests/test_search_authors.py b/tests/test_search_authors.py index 8fed019dc..b46be69c1 100644 --- a/tests/test_search_authors.py +++ b/tests/test_search_authors.py @@ -3,14 +3,14 @@ from unittest import TestCase from browse.domain import metadata -from browse.services.document.author_affil import split_authors +from arxiv.util.authors import split_authors from browse.services.document.metadata import AbsMetaSession from browse.services.search.search_authors import queries_for_authors, split_long_author_list from tests import path_of_for_test class TestAuthorLinkCreation(TestCase): - + def test_basic(self): out = queries_for_authors('') self.assertIsInstance(out, list) diff --git a/tests/test_tex2utf.py b/tests/test_tex2utf.py deleted file mode 100644 index 5ec27cd24..000000000 
--- a/tests/test_tex2utf.py +++ /dev/null @@ -1,204 +0,0 @@ -"""Tests for Tex to UTF8 parsing.""" -from unittest import TestCase - -from browse.services.util.tex2utf import tex2utf - - -class TextTex2Utf(TestCase): - - def test_tex2utf(self): - test_str = "abc def ghijk lmnop qrs tuv wxyz 1234567890 !@# $%^ &* () _-=+" - utf_out = tex2utf(test_str) - self.assertEqual(utf_out, test_str) - - utf_out = tex2utf(test_str, letters=False) - self.assertEqual(utf_out, test_str) - - test_str = "\\'e" - utf_out = tex2utf(test_str) - self.assertEqual(utf_out, chr(0xe9)) - #\'etex2UTFé - - test_str = "\\'E" - utf_out = tex2utf(test_str) - self.assertEqual(utf_out, chr( - 0xc9)) - - test_str = "\\'E" - utf_out = tex2utf(test_str,letters=True) - self.assertEqual(utf_out, chr( - 0xc9)) - - test_str = "\\'E" - utf_out = tex2utf(test_str,letters=False) - self.assertEqual(utf_out, chr(0xc9)) - - - test_str = "\\'E" - utf_out = tex2utf(test_str,False) - self.assertEqual(utf_out, chr(0xc9)) - - # single textsymbol - test_str = '\\OE' - utf_out = tex2utf(test_str) - self.assertEqual(utf_out, chr(0x0152)) - - # single textsymbol followed by newline - test_str = "\\OE\n" - utf_out = tex2utf(test_str) - self.assertEqual(utf_out, chr(0x0152) + "\n") - - # test_string of textsymbols - test_str = "\\OE\\S" - utf_out = tex2utf(test_str) - self.assertEqual(utf_out, chr(0x0152) + chr(0x00a7)) - - # test_string of textsymbols followed by newline - test_str = "\\OE\\S\n" - utf_out = tex2utf(test_str) - self.assertEqual(utf_out, chr(0x0152) + chr( - 0x00a7) + "\n") - - # combination of textlet and textsymbols with whitespace as separator - test_str = "\\ddag \\OE\\S" - utf_out = tex2utf(test_str) - self.assertEqual(utf_out, chr(0x2021) + chr(0x0152) + chr( - 0x00a7)) - - # single greek textlet - test_str = '\\alpha' - utf_out = tex2utf(test_str) - self.assertEqual(utf_out, chr( - 0x03b1)) - - test_str = '\\alpha' - utf_out = tex2utf(test_str,True) - self.assertEqual(utf_out, chr( - 0x03b1)) - - 
test_str = '\\alpha' - utf_out = tex2utf(test_str,False) - self.assertEqual(utf_out, r'\alpha') - - # simple test_string of greek - test_str = '\\alpha\\beta\gamma' - utf_out = tex2utf(test_str) - self.assertEqual(utf_out, chr(0x03b1) + chr(0x03b2) + chr( - 0x03b3)) - - # test_string of greek textlet with nested curlies - test_str = '\\alpha{\\beta{\gamma}}' - utf_out = tex2utf(test_str) - self.assertEqual(utf_out, chr( - 0x03b1) + '{' + chr(0x03b2) + chr(0x03b3) + '}') - - # another test_string of greek with nested curlies - test_str = '\\alpha{\\beta{\gamma}\macro}' - utf_out = tex2utf(test_str) - self.assertEqual(utf_out, chr( - 0x03b1) + '{' + chr(0x03b2) + chr(0x03b3) + '\macro}') - - # use "\ " as textlet delimiter - test_str = 'foo \\alpha\ bar' - utf_out = tex2utf(test_str) - self.assertEqual(utf_out, 'foo ' + chr(0x03b1) + - ' bar') - - # use "\ " as textlet delimiter - test_str = '\\alpha\ \\beta{something}' - utf_out = tex2utf(test_str) - self.assertEqual(utf_out, chr(0x03b1) + ' ' + - chr(0x03b2) + '{something}') - - # use "\ " as textlet delimiter - test_str = 'foo \OE\ bar' - utf_out = tex2utf(test_str) - self.assertEqual(utf_out, 'foo ' + chr(0x0152) + - ' bar') - - # use " " as textlet delimiter - test_str = 'foo \\alpha bar' - utf_out = tex2utf(test_str) - self.assertEqual(utf_out, 'foo ' + chr(0x03b1) + 'bar') - - # use empty "{}" as textlet delimiter - test_str = 'foo \\alpha{}bar' - utf_out = tex2utf(test_str) - self.assertEqual(utf_out, 'foo ' + chr(0x03b1) + 'bar') - - # textlet followed by non-empty "{ + + }" - test_str = 'foo \\alpha{-}bar' - utf_out = tex2utf(test_str) - self.assertEqual(utf_out, 'foo ' + chr(0x03b1) + - '{-}bar') - - # textlet followed by underscore (for subscript) - test_str = 'foo \\alpha_7' - utf_out = tex2utf(test_str) - self.assertEqual(utf_out, 'foo ' + chr(0x03b1) + - '_7') - - def test_tex2utf_underscore(self): - # textlet followed by underscore (for subscript) - test_str = 'foo \\alpha_\\beta_\\gamma_7' - utf_out 
= tex2utf(test_str) - self.assertEqual(utf_out, 'foo ' + chr(0x03b1) + '_' + chr(0x03b2) + '_' + chr(0x03b3) + '_7') - - # \'Etex2UTFÉ - def test_tex2utf_curly(self): - test_str = "\\'{e}" - utf_out = tex2utf(test_str) - self.assertEqual(utf_out, chr( - 0xe9)) - - #\'{e}tex2UTFé - #{\'e}tex2UTF{é} - - def test_ARXIVDEV2322fixes(self): - test_str = "ARXIVDEV-2322 \\u{A} fix" - utf_out = tex2utf(test_str) - self.assertEqual(utf_out, 'ARXIVDEV-2322 ' + chr(0x102) + ' fix') - - test_str = "ARXIVDEV-2322 \\u{a} fix" - utf_out = tex2utf(test_str) - self.assertEqual(utf_out, 'ARXIVDEV-2322 ' + chr(0x103) + ' fix') - - test_str = "ARXIVDEV-2322 \\u{O} fix" - utf_out = tex2utf(test_str) - self.assertEqual(utf_out, 'ARXIVDEV-2322 ' + chr(0x14e) + ' fix') - - test_str = "ARXIVDEV-2322 \\u{o} fix" - utf_out = tex2utf(test_str) - self.assertEqual(utf_out, 'ARXIVDEV-2322 ' + chr(0x14f) + ' fix') - - test_str = "ARXIVDEV-2322 \\k{i} fix" - utf_out = tex2utf(test_str) - self.assertEqual(utf_out, 'ARXIVDEV-2322 ' + chr(0x12f) + ' fix') - - test_str = "ARXIVDEV-2322 \\v{g} fix" - utf_out = tex2utf(test_str) - self.assertEqual(utf_out, 'ARXIVDEV-2322 ' + chr(0x1e7) + ' fix') - - test_str = "ARXIVDEV-2322 \\c{g} fix" - utf_out = tex2utf(test_str) - self.assertEqual(utf_out, 'ARXIVDEV-2322 ' + chr(0x123) + ' fix') - - test_str = "ARXIVDEV-2322 \\DJ fix" - utf_out = tex2utf(test_str) - self.assertEqual(utf_out, 'ARXIVDEV-2322 ' + chr(0x110) + ' fix') - - # (tex, err) = arXiv:: Filters: : Tex2UTF: : UTF2tex("Test Test_String. \x{03bb}" ) - # self.assertEqual(tex, - # "Test Test_String. 
{\\lambda}", - # "arXiv::Filters::Tex2UTF::UTF2tex()") - # - # self.assertEqual(arXiv:: Filters: : Tex2UTF: : escapeUTF8("\x{03bb}"), - # "λ", - # 'arXiv::Filters::Tex2UTF::escapeUTF8("\x{03bb}")') - # - # latin1 = "\x{91}\x{92}\x{93}\x{94}\x{96}\x{97}\x{98}\x{A0}\x{A6}\x{B1}\x{B2}\x{B3}\x{B5}\x{BC}\x{BD}\x{BE}" - # latin1expected = "`'\"\"----~ |{\\pm}^2^3{\\mu}1/41/23/4" - # is (arXiv: : Filters: : Tex2UTF: : latin2tex("Test Test_String. \x{03bb} latin1" ), - # "Test Test_String. {\\lambda} latin1expected", - # "arXiv::Filters::Tex2UTF::latin2tex(`'\"\"----~ |{\\pm}^2^3{\\mu}1/41/23/4)") - # done_testing diff --git a/tests/test_typings.py b/tests/test_typings.py index 64880a542..7dccbf3ab 100644 --- a/tests/test_typings.py +++ b/tests/test_typings.py @@ -11,6 +11,7 @@ class MyPyTest(TestCase): """Class for testing modules with mypy.""" + @unittest.skip("Type checkes skipped due to failing with out of memory on travis") def test_run_mypy_module(self) -> None: """Run mypy on all module sources.""" mypy_call: List[str] = ["mypy"] + self.mypy_opts + ["-p", self.pkgname] @@ -18,6 +19,7 @@ def test_run_mypy_module(self) -> None: mypy_call, env=os.environ, cwd=self.pypath) self.assertEqual(result, 0, f'mypy on {self.pkgname}') + @unittest.skip("Type checkes skipped due to failing with out of memory on travis") def test_run_mypy_tests(self) -> None: """Run mypy on all tests in module under the tests directory.""" diff --git a/update-docs.sh b/update-docs.sh new file mode 100644 index 000000000..4ca2623c4 --- /dev/null +++ b/update-docs.sh @@ -0,0 +1,32 @@ +#!/bin/bash +SRCDOCS=`pwd`/docs/source/_build/html +REPO=arXiv/arxiv-browse +echo $SRCDOCS + +cd `pwd`/docs +make html + +cd $SRCDOCS +MSG="Adding gh-pages docs for `git log -1 --pretty=short --abbrev-commit`" + +TMPREPO=/tmp/docs/$REPO +rm -rf $TMPREPO +mkdir -p -m 0755 $TMPREPO +echo $MSG + +git clone git@github.com:$REPO.git $TMPREPO +cd $TMPREPO + +## checkout the branch if it exists, if not then create it and 
detach it from the history +if ! git checkout gh-pages; then + git checkout --orphan gh-pages + git rm -rf . + touch .nojekyll + git add .nojekyll +else + git checkout gh-pages ###gh-pages has previously one off been set to be nothing but html +fi + +cp -r $SRCDOCS/* $TMPREPO +git add -A +git commit -m "$MSG" && git push origin gh-pages diff --git a/wsgi.py b/wsgi.py index 201fc11c0..fb10c6d6e 100644 --- a/wsgi.py +++ b/wsgi.py @@ -1,12 +1,53 @@ """Web Server Gateway Interface (WSGI) entry-point.""" +import os from browse.factory import create_web_app -import os +# We need someplace to keep the flask app around between requests. +# Double underscores exclude this from * imports. +__flask_app__ = None def application(environ, start_response): - """WSGI application factory.""" - for key, value in environ.items(): - os.environ[key] = str(value) - app = create_web_app() - return app(environ, start_response) + """WSGI application, called once for each HTTP request. + + application() will be called once for each HTTP request. WSGI has + no initialization lifecycle phase. This code will only get run + with an HTTP request in the environ. + + The Flask app should be reused across requests. Creating the + Flask app for each request showed up as a problem in 2019 where + SQLAlchemy connection pooling seemed to be disabled because a new + SQLAlchemy DB was created for each request. + + Apache httpd passes config from SetEnv directives via the request + environ. We currently have a use case of running apache HTTPD + with mod_wsgi and setting environment variables for Flask apps + with apache's SetEnv directive. SetEnv does not seem to set an OS + environment variable that is preserved in the WSGI daemon + process. SetEnv values are passed to WSGI application() in the + environ argument. + + This will not be needed once each app is on docker+nginx. + """ + + # Copy string WSGI environ to os.environ. This is to get apache + # SetEnv vars. 
It needs to be done before the call to + # create_web_app() due to how config is setup from os in + # browse/config.py. + for key, value in environ.items(): + if type(value) is str: + os.environ[key] = value + + # 'global' actually means module scope, and that is exactly what + # we want here. + # + # Python docs are thin. I'm seeing this sort of thing on + # stackoverflow: "In Python there is no such thing as absolute + # globals automatically defined across all namespaces + # (thankfully). As you correctly pointed out, a global is bound to + # a namespace within a module..." + global __flask_app__ + if __flask_app__ is None: + __flask_app__ = create_web_app() + + return __flask_app__(environ, start_response)