From 8f10873fa3e6d91bbb2396485d159a27c8e8e775 Mon Sep 17 00:00:00 2001 From: Jeremy Singer-Vine Date: Thu, 3 Oct 2019 08:24:32 -0400 Subject: [PATCH] Initial commit --- .gitignore | 63 + Pipfile | 17 + Pipfile.lock | 495 ++ README.md | 117 + data/.keep | 0 notebooks/analyze-fcc-comments.ipynb | 5491 ++++++++++++++++++ notebooks/analyze-mb-comment-structure.ipynb | 2267 ++++++++ 7 files changed, 8450 insertions(+) create mode 100644 .gitignore create mode 100644 Pipfile create mode 100644 Pipfile.lock create mode 100644 README.md create mode 100644 data/.keep create mode 100644 notebooks/analyze-fcc-comments.ipynb create mode 100644 notebooks/analyze-mb-comment-structure.ipynb diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..d6e556f --- /dev/null +++ b/.gitignore @@ -0,0 +1,63 @@ +# Custom list: +.ipynb_checkpoints +.DS_Store + +#### joe made this: http://goel.io/joe + +#####=== Python ===##### + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] + +# C extensions +*.so + +# Distribution / packaging +.Python +env/ +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +*.egg-info/ +.installed.cfg +*.egg + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.cache +nosetests.xml +coverage.xml + +# Translations +*.mo +*.pot + +# Django stuff: +*.log + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + diff --git a/Pipfile b/Pipfile new file mode 100644 index 0000000..8778c61 --- /dev/null +++ b/Pipfile @@ -0,0 +1,17 @@ +[[source]] +name = "pypi" +url = "https://pypi.org/simple" +verify_ssl = true + +[dev-packages] + +[packages] +requests = "*" +pandas = "*" +jupyter = "*" +requests-cache = "*" +tqdm = "*" +nbexec = "*" + +[requires] +python_version = "3.6" diff --git a/Pipfile.lock b/Pipfile.lock new file mode 100644 index 0000000..3b9bf3c --- /dev/null +++ b/Pipfile.lock @@ -0,0 +1,495 @@ +{ + "_meta": { + "hash": { + "sha256": "682395d97dfd62d238e9bd70cf5d6cab49754a43ce3d5acac41efd94b6c1ac6e" + }, + "pipfile-spec": 6, + "requires": { + "python_version": "3.6" + }, + "sources": [ + { + "name": "pypi", + "url": "https://pypi.org/simple", + "verify_ssl": true + } + ] + }, + "default": { + "appnope": { + "hashes": [ + "sha256:5b26757dc6f79a3b7dc9fab95359328d5747fcb2409d331ea66d0272b90ab2a0", + "sha256:8b995ffe925347a2138d7ac0fe77155e4311a0ea6d6da4f5128fe4b3cbe5ed71" + ], + "markers": "sys_platform == 'darwin'", + "version": "==0.1.0" + }, + "attrs": { + "hashes": [ + "sha256:69c0dbf2ed392de1cb5ec704444b08a5ef81680a61cb899dc08127123af36a79", + "sha256:f0b870f674851ecbfbbbd364d6b5cbdff9dcedbc7f3f5e18a6891057f21fe399" + ], + "version": "==19.1.0" + }, + "backcall": { + "hashes": [ + "sha256:38ecd85be2c1e78f77fd91700c76e14667dc21e2713b63876c0eb901196e01e4", + "sha256:bbbf4b1e5cd2bdb08f915895b51081c041bac22394fdfcfdfbe9f14b77c08bf2" + ], + "version": "==0.1.0" + }, + "bleach": { + "hashes": [ + "sha256:213336e49e102af26d9cde77dd2d0397afabc5a6bf2fed985dc35b5d1e285a16", + "sha256:3fdf7f77adcf649c9911387df51254b813185e32b2c6619f690b593a617e19fa" + ], + "version": 
"==3.1.0" + }, + "certifi": { + "hashes": [ + "sha256:e4f3620cfea4f83eedc95b24abd9cd56f3c4b146dd0177e83a21b4eb49e21e50", + "sha256:fd7c7c74727ddcf00e9acd26bba8da604ffec95bf1c2144e67aff7a8b50e6cef" + ], + "version": "==2019.9.11" + }, + "chardet": { + "hashes": [ + "sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae", + "sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691" + ], + "version": "==3.0.4" + }, + "decorator": { + "hashes": [ + "sha256:86156361c50488b84a3f148056ea716ca587df2f0de1d34750d35c21312725de", + "sha256:f069f3a01830ca754ba5258fde2278454a0b5b79e0d7f5c13b3b97e57d4acff6" + ], + "version": "==4.4.0" + }, + "defusedxml": { + "hashes": [ + "sha256:6687150770438374ab581bb7a1b327a847dd9c5749e396102de3fad4e8a3ef93", + "sha256:f684034d135af4c6cbb949b8a4d2ed61634515257a67299e5f940fbaa34377f5" + ], + "version": "==0.6.0" + }, + "entrypoints": { + "hashes": [ + "sha256:589f874b313739ad35be6e0cd7efde2a4e9b6fea91edcc34e58ecbb8dbe56d19", + "sha256:c70dd71abe5a8c85e55e12c19bd91ccfeec11a6e99044204511f9ed547d48451" + ], + "version": "==0.3" + }, + "idna": { + "hashes": [ + "sha256:c357b3f628cf53ae2c4c05627ecc484553142ca23264e593d327bcde5e9c3407", + "sha256:ea8b7f6188e6fa117537c3df7da9fc686d485087abf6ac197f9c46432f7e4a3c" + ], + "version": "==2.8" + }, + "ipykernel": { + "hashes": [ + "sha256:167c3ef08450f5e060b76c749905acb0e0fbef9365899377a4a1eae728864383", + "sha256:b503913e0b4cce7ed2de965457dfb2edd633e8234161a60e23f2fe2161345d12" + ], + "version": "==5.1.2" + }, + "ipython": { + "hashes": [ + "sha256:c4ab005921641e40a68e405e286e7a1fcc464497e14d81b6914b4fd95e5dee9b", + "sha256:dd76831f065f17bddd7eaa5c781f5ea32de5ef217592cf019e34043b56895aa1" + ], + "markers": "python_version >= '3.3'", + "version": "==7.8.0" + }, + "ipython-genutils": { + "hashes": [ + "sha256:72dd37233799e619666c9f639a9da83c34013a73e8bbc79a7a6348d93c61fab8", + "sha256:eb2e116e75ecef9d4d228fdc66af54269afa26ab4463042e33785b887c628ba8" + ], + "version": 
"==0.2.0" + }, + "ipywidgets": { + "hashes": [ + "sha256:13ffeca438e0c0f91ae583dc22f50379b9d6b28390ac7be8b757140e9a771516", + "sha256:e945f6e02854a74994c596d9db83444a1850c01648f1574adf144fbbabe05c97" + ], + "version": "==7.5.1" + }, + "jedi": { + "hashes": [ + "sha256:786b6c3d80e2f06fd77162a07fed81b8baa22dde5d62896a790a331d6ac21a27", + "sha256:ba859c74fa3c966a22f2aeebe1b74ee27e2a462f56d3f5f7ca4a59af61bfe42e" + ], + "version": "==0.15.1" + }, + "jinja2": { + "hashes": [ + "sha256:065c4f02ebe7f7cf559e49ee5a95fb800a9e4528727aec6f24402a5374c65013", + "sha256:14dd6caf1527abb21f08f86c784eac40853ba93edb79552aa1e4b8aef1b61c7b" + ], + "version": "==2.10.1" + }, + "jsonschema": { + "hashes": [ + "sha256:5f9c0a719ca2ce14c5de2fd350a64fd2d13e8539db29836a86adc990bb1a068f", + "sha256:8d4a2b7b6c2237e0199c8ea1a6d3e05bf118e289ae2b9d7ba444182a2959560d" + ], + "version": "==3.0.2" + }, + "jupyter": { + "hashes": [ + "sha256:3e1f86076bbb7c8c207829390305a2b1fe836d471ed54be66a3b8c41e7f46cc7", + "sha256:5b290f93b98ffbc21c0c7e749f054b3267782166d72fa5e3ed1ed4eaf34a2b78", + "sha256:d9dc4b3318f310e34c82951ea5d6683f67bed7def4b259fafbfe4f1beb1d8e5f" + ], + "index": "pypi", + "version": "==1.0.0" + }, + "jupyter-client": { + "hashes": [ + "sha256:73a809a2964afa07adcc1521537fddb58c2ffbb7e84d53dc5901cf80480465b3", + "sha256:98e8af5edff5d24e4d31e73bc21043130ae9d955a91aa93fc0bc3b1d0f7b5880" + ], + "version": "==5.3.1" + }, + "jupyter-console": { + "hashes": [ + "sha256:308ce876354924fb6c540b41d5d6d08acfc946984bf0c97777c1ddcb42e0b2f5", + "sha256:cc80a97a5c389cbd30252ffb5ce7cefd4b66bde98219edd16bf5cb6f84bb3568" + ], + "version": "==6.0.0" + }, + "jupyter-core": { + "hashes": [ + "sha256:2c6e7c1e9f2ac45b5c2ceea5730bc9008d92fe59d0725eac57b04c0edfba24f7", + "sha256:f4fa22d6cf25f34807c995f22d2923693575c70f02557bcbfbe59bd5ec8d8b84" + ], + "version": "==4.5.0" + }, + "markupsafe": { + "hashes": [ + "sha256:00bc623926325b26bb9605ae9eae8a215691f33cae5df11ca5424f06f2d1f473", + 
"sha256:09027a7803a62ca78792ad89403b1b7a73a01c8cb65909cd876f7fcebd79b161", + "sha256:09c4b7f37d6c648cb13f9230d847adf22f8171b1ccc4d5682398e77f40309235", + "sha256:1027c282dad077d0bae18be6794e6b6b8c91d58ed8a8d89a89d59693b9131db5", + "sha256:24982cc2533820871eba85ba648cd53d8623687ff11cbb805be4ff7b4c971aff", + "sha256:29872e92839765e546828bb7754a68c418d927cd064fd4708fab9fe9c8bb116b", + "sha256:43a55c2930bbc139570ac2452adf3d70cdbb3cfe5912c71cdce1c2c6bbd9c5d1", + "sha256:46c99d2de99945ec5cb54f23c8cd5689f6d7177305ebff350a58ce5f8de1669e", + "sha256:500d4957e52ddc3351cabf489e79c91c17f6e0899158447047588650b5e69183", + "sha256:535f6fc4d397c1563d08b88e485c3496cf5784e927af890fb3c3aac7f933ec66", + "sha256:62fe6c95e3ec8a7fad637b7f3d372c15ec1caa01ab47926cfdf7a75b40e0eac1", + "sha256:6dd73240d2af64df90aa7c4e7481e23825ea70af4b4922f8ede5b9e35f78a3b1", + "sha256:717ba8fe3ae9cc0006d7c451f0bb265ee07739daf76355d06366154ee68d221e", + "sha256:79855e1c5b8da654cf486b830bd42c06e8780cea587384cf6545b7d9ac013a0b", + "sha256:7c1699dfe0cf8ff607dbdcc1e9b9af1755371f92a68f706051cc8c37d447c905", + "sha256:88e5fcfb52ee7b911e8bb6d6aa2fd21fbecc674eadd44118a9cc3863f938e735", + "sha256:8defac2f2ccd6805ebf65f5eeb132adcf2ab57aa11fdf4c0dd5169a004710e7d", + "sha256:98c7086708b163d425c67c7a91bad6e466bb99d797aa64f965e9d25c12111a5e", + "sha256:9add70b36c5666a2ed02b43b335fe19002ee5235efd4b8a89bfcf9005bebac0d", + "sha256:9bf40443012702a1d2070043cb6291650a0841ece432556f784f004937f0f32c", + "sha256:ade5e387d2ad0d7ebf59146cc00c8044acbd863725f887353a10df825fc8ae21", + "sha256:b00c1de48212e4cc9603895652c5c410df699856a2853135b3967591e4beebc2", + "sha256:b1282f8c00509d99fef04d8ba936b156d419be841854fe901d8ae224c59f0be5", + "sha256:b2051432115498d3562c084a49bba65d97cf251f5a331c64a12ee7e04dacc51b", + "sha256:ba59edeaa2fc6114428f1637ffff42da1e311e29382d81b339c1817d37ec93c6", + "sha256:c8716a48d94b06bb3b2524c2b77e055fb313aeb4ea620c8dd03a105574ba704f", + "sha256:cd5df75523866410809ca100dc9681e301e3c27567cf498077e8551b6d20e42f", 
+ "sha256:e249096428b3ae81b08327a63a485ad0878de3fb939049038579ac0ef61e17e7" + ], + "version": "==1.1.1" + }, + "mistune": { + "hashes": [ + "sha256:59a3429db53c50b5c6bcc8a07f8848cb00d7dc8bdb431a4ab41920d201d4756e", + "sha256:88a1051873018da288eee8538d476dffe1262495144b33ecb586c4ab266bb8d4" + ], + "version": "==0.8.4" + }, + "nbconvert": { + "hashes": [ + "sha256:427a468ec26e7d68a529b95f578d5cbf018cb4c1f889e897681c2b6d11897695", + "sha256:48d3c342057a2cf21e8df820d49ff27ab9f25fc72b8f15606bd47967333b2709" + ], + "version": "==5.6.0" + }, + "nbexec": { + "hashes": [ + "sha256:e367bac4a5c7cbd12e5b73b5e1011a4227b11d9e8e5a3811d8b8b9987eb235d0" + ], + "index": "pypi", + "version": "==0.0.2" + }, + "nbformat": { + "hashes": [ + "sha256:b9a0dbdbd45bb034f4f8893cafd6f652ea08c8c1674ba83f2dc55d3955743b0b", + "sha256:f7494ef0df60766b7cabe0a3651556345a963b74dbc16bc7c18479041170d402" + ], + "version": "==4.4.0" + }, + "notebook": { + "hashes": [ + "sha256:660976fe4fe45c7aa55e04bf4bccb9f9566749ff637e9020af3422f9921f9a5d", + "sha256:b0a290f5cc7792d50a21bec62b3c221dd820bf00efa916ce9aeec4b5354bde20" + ], + "version": "==6.0.1" + }, + "numpy": { + "hashes": [ + "sha256:05dbfe72684cc14b92568de1bc1f41e5f62b00f714afc9adee42f6311738091f", + "sha256:0d82cb7271a577529d07bbb05cb58675f2deb09772175fab96dc8de025d8ac05", + "sha256:10132aa1fef99adc85a905d82e8497a580f83739837d7cbd234649f2e9b9dc58", + "sha256:12322df2e21f033a60c80319c25011194cd2a21294cc66fee0908aeae2c27832", + "sha256:16f19b3aa775dddc9814e02a46b8e6ae6a54ed8cf143962b4e53f0471dbd7b16", + "sha256:3d0b0989dd2d066db006158de7220802899a1e5c8cf622abe2d0bd158fd01c2c", + "sha256:438a3f0e7b681642898fd7993d38e2bf140a2d1eafaf3e89bb626db7f50db355", + "sha256:5fd214f482ab53f2cea57414c5fb3e58895b17df6e6f5bca5be6a0bb6aea23bb", + "sha256:73615d3edc84dd7c4aeb212fa3748fb83217e00d201875a47327f55363cef2df", + "sha256:7bd355ad7496f4ce1d235e9814ec81ee3d28308d591c067ce92e49f745ba2c2f", + 
"sha256:7d077f2976b8f3de08a0dcf5d72083f4af5411e8fddacd662aae27baa2601196", + "sha256:a4092682778dc48093e8bda8d26ee8360153e2047826f95a3f5eae09f0ae3abf", + "sha256:b458de8624c9f6034af492372eb2fee41a8e605f03f4732f43fc099e227858b2", + "sha256:e70fc8ff03a961f13363c2c95ef8285e0cf6a720f8271836f852cc0fa64e97c8", + "sha256:ee8e9d7cad5fe6dde50ede0d2e978d81eafeaa6233fb0b8719f60214cf226578", + "sha256:f4a4f6aba148858a5a5d546a99280f71f5ee6ec8182a7d195af1a914195b21a2" + ], + "version": "==1.17.2" + }, + "pandas": { + "hashes": [ + "sha256:18d91a9199d1dfaa01ad645f7540370ba630bdcef09daaf9edf45b4b1bca0232", + "sha256:3f26e5da310a0c0b83ea50da1fd397de2640b02b424aa69be7e0784228f656c9", + "sha256:4182e32f4456d2c64619e97c58571fa5ca0993d1e8c2d9ca44916185e1726e15", + "sha256:426e590e2eb0e60f765271d668a30cf38b582eaae5ec9b31229c8c3c10c5bc21", + "sha256:5eb934a8f0dc358f0e0cdf314072286bbac74e4c124b64371395e94644d5d919", + "sha256:717928808043d3ea55b9bcde636d4a52d2236c246f6df464163a66ff59980ad8", + "sha256:8145f97c5ed71827a6ec98ceaef35afed1377e2d19c4078f324d209ff253ecb5", + "sha256:8744c84c914dcc59cbbb2943b32b7664df1039d99e834e1034a3372acb89ea4d", + "sha256:c1ac1d9590d0c9314ebf01591bd40d4c03d710bfc84a3889e5263c97d7891dee", + "sha256:cb2e197b7b0687becb026b84d3c242482f20cbb29a9981e43604eb67576da9f6", + "sha256:d4001b71ad2c9b84ff18b182cea22b7b6cbf624216da3ea06fb7af28d1f93165", + "sha256:d8930772adccb2882989ab1493fa74bd87d47c8ac7417f5dd3dd834ba8c24dc9", + "sha256:dfbb0173ee2399bc4ed3caf2d236e5c0092f948aafd0a15fbe4a0e77ee61a958", + "sha256:eebfbba048f4fa8ac711b22c78516e16ff8117d05a580e7eeef6b0c2be554c18", + "sha256:f1b21bc5cf3dbea53d33615d1ead892dfdae9d7052fa8898083bec88be20dcd2" + ], + "index": "pypi", + "version": "==0.25.1" + }, + "pandocfilters": { + "hashes": [ + "sha256:b3dd70e169bb5449e6bc6ff96aea89c5eea8c5f6ab5e207fc2f521a2cf4a0da9" + ], + "version": "==1.4.2" + }, + "parso": { + "hashes": [ + "sha256:63854233e1fadb5da97f2744b6b24346d2750b85965e7e399bec1620232797dc", + 
"sha256:666b0ee4a7a1220f65d367617f2cd3ffddff3e205f3f16a0284df30e774c2a9c" + ], + "version": "==0.5.1" + }, + "pexpect": { + "hashes": [ + "sha256:2094eefdfcf37a1fdbfb9aa090862c1a4878e5c7e0e7e7088bdb511c558e5cd1", + "sha256:9e2c1fd0e6ee3a49b28f95d4b33bc389c89b20af6a1255906e90ff1262ce62eb" + ], + "markers": "sys_platform != 'win32'", + "version": "==4.7.0" + }, + "pickleshare": { + "hashes": [ + "sha256:87683d47965c1da65cdacaf31c8441d12b8044cdec9aca500cd78fc2c683afca", + "sha256:9649af414d74d4df115d5d718f82acb59c9d418196b7b4290ed47a12ce62df56" + ], + "version": "==0.7.5" + }, + "prometheus-client": { + "hashes": [ + "sha256:71cd24a2b3eb335cb800c7159f423df1bd4dcd5171b234be15e3f31ec9f622da" + ], + "version": "==0.7.1" + }, + "prompt-toolkit": { + "hashes": [ + "sha256:11adf3389a996a6d45cc277580d0d53e8a5afd281d0c9ec71b28e6f121463780", + "sha256:2519ad1d8038fd5fc8e770362237ad0364d16a7650fb5724af6997ed5515e3c1", + "sha256:977c6583ae813a37dc1c2e1b715892461fcbdaa57f6fc62f33a528c4886c8f55" + ], + "version": "==2.0.9" + }, + "ptyprocess": { + "hashes": [ + "sha256:923f299cc5ad920c68f2bc0bc98b75b9f838b93b599941a6b63ddbc2476394c0", + "sha256:d7cc528d76e76342423ca640335bd3633420dc1366f258cb31d05e865ef5ca1f" + ], + "markers": "os_name != 'nt'", + "version": "==0.6.0" + }, + "pygments": { + "hashes": [ + "sha256:71e430bc85c88a430f000ac1d9b331d2407f681d6f6aec95e8bcfbc3df5b0127", + "sha256:881c4c157e45f30af185c1ffe8d549d48ac9127433f2c380c24b84572ad66297" + ], + "version": "==2.4.2" + }, + "pyrsistent": { + "hashes": [ + "sha256:34b47fa169d6006b32e99d4b3c4031f155e6e68ebcc107d6454852e8e0ee6533" + ], + "version": "==0.15.4" + }, + "python-dateutil": { + "hashes": [ + "sha256:7e6584c74aeed623791615e26efd690f29817a27c73085b78e4bad02493df2fb", + "sha256:c89805f6f4d64db21ed966fda138f8a5ed7a4fdbc1a8ee329ce1b74e3c74da9e" + ], + "version": "==2.8.0" + }, + "pytz": { + "hashes": [ + "sha256:26c0b32e437e54a18161324a2fca3c4b9846b74a8dccddd843113109e1116b32", + 
"sha256:c894d57500a4cd2d5c71114aaab77dbab5eabd9022308ce5ac9bb93a60a6f0c7" + ], + "version": "==2019.2" + }, + "pyzmq": { + "hashes": [ + "sha256:01636e95a88d60118479041c6aaaaf5419c6485b7b1d37c9c4dd424b7b9f1121", + "sha256:021dba0d1436516092c624359e5da51472b11ba8edffa334218912f7e8b65467", + "sha256:0463bd941b6aead494d4035f7eebd70035293dd6caf8425993e85ad41de13fa3", + "sha256:05fd51edd81eed798fccafdd49c936b6c166ffae7b32482e4d6d6a2e196af4e6", + "sha256:1fadc8fbdf3d22753c36d4172169d184ee6654f8d6539e7af25029643363c490", + "sha256:22efa0596cf245a78a99060fe5682c4cd00c58bb7614271129215c889062db80", + "sha256:260c70b7c018905ec3659d0f04db735ac830fe27236e43b9dc0532cf7c9873ef", + "sha256:2762c45e289732d4450406cedca35a9d4d71e449131ba2f491e0bf473e3d2ff2", + "sha256:2fc6cada8dc53521c1189596f1898d45c5f68603194d3a6453d6db4b27f4e12e", + "sha256:343b9710a61f2b167673bea1974e70b5dccfe64b5ed10626798f08c1f7227e72", + "sha256:41bf96d5f554598a0632c3ec28e3026f1d6591a50f580df38eff0b8067efb9e7", + "sha256:856b2cdf7a1e2cbb84928e1e8db0ea4018709b39804103d3a409e5584f553f57", + "sha256:85b869abc894672de9aecdf032158ea8ad01e2f0c3b09ef60e3687fb79418096", + "sha256:93f44739db69234c013a16990e43db1aa0af3cf5a4b8b377d028ff24515fbeb3", + "sha256:98fa3e75ccb22c0dc99654e3dd9ff693b956861459e8c8e8734dd6247b89eb29", + "sha256:9a22c94d2e93af8bebd4fcf5fa38830f5e3b1ff0d4424e2912b07651eb1bafb4", + "sha256:a7d3f4b4bbb5d7866ae727763268b5c15797cbd7b63ea17f3b0ec1067da8994b", + "sha256:b645a49376547b3816433a7e2d2a99135c8e651e50497e7ecac3bd126e4bea16", + "sha256:cf0765822e78cf9e45451647a346d443f66792aba906bc340f4e0ac7870c169c", + "sha256:dc398e1e047efb18bfab7a8989346c6921a847feae2cad69fedf6ca12fb99e2c", + "sha256:dd5995ae2e80044e33b5077fb4bc2b0c1788ac6feaf15a6b87a00c14b4bdd682", + "sha256:e03fe5e07e70f245dc9013a9d48ae8cc4b10c33a1968039c5a3b64b5d01d083d", + "sha256:ea09a306144dff2795e48439883349819bef2c53c0ee62a3c2fae429451843bb", + "sha256:f4e37f33da282c3c319849877e34f97f0a3acec09622ec61b7333205bdd13b52", + 
"sha256:fa4bad0d1d173dee3e8ef3c3eb6b2bb6c723fc7a661eeecc1ecb2fa99860dd45" + ], + "version": "==18.1.0" + }, + "qtconsole": { + "hashes": [ + "sha256:40d5d8e00d070ea266dbf6f0da74c4b9597b8b8d67cd8233c3ffd8debf923703", + "sha256:b91e7412587e6cfe1644696538f73baf5611e837be5406633218443b2827c6d9" + ], + "version": "==4.5.5" + }, + "requests": { + "hashes": [ + "sha256:11e007a8a2aa0323f5a921e9e6a2d7e4e67d9877e85773fba9ba6419025cbeb4", + "sha256:9cf5292fcd0f598c671cfc1e0d7d1a7f13bb8085e9a590f48c010551dc6c4b31" + ], + "index": "pypi", + "version": "==2.22.0" + }, + "requests-cache": { + "hashes": [ + "sha256:813023269686045f8e01e2289cc1e7e9ae5ab22ddd1e2849a9093ab3ab7270eb", + "sha256:81e13559baee64677a7d73b85498a5a8f0639e204517b5d05ff378e44a57831a" + ], + "index": "pypi", + "version": "==0.5.2" + }, + "send2trash": { + "hashes": [ + "sha256:60001cc07d707fe247c94f74ca6ac0d3255aabcb930529690897ca2a39db28b2", + "sha256:f1691922577b6fa12821234aeb57599d887c4900b9ca537948d2dac34aea888b" + ], + "version": "==1.5.0" + }, + "six": { + "hashes": [ + "sha256:3350809f0555b11f552448330d0b52d5f24c91a322ea4a15ef22629740f3761c", + "sha256:d16a0141ec1a18405cd4ce8b4613101da75da0e9a7aec5bdd4fa804d0e0eba73" + ], + "version": "==1.12.0" + }, + "terminado": { + "hashes": [ + "sha256:d9d012de63acb8223ac969c17c3043337c2fcfd28f3aea1ee429b345d01ef460", + "sha256:de08e141f83c3a0798b050ecb097ab6259c3f0331b2f7b7750c9075ced2c20c2" + ], + "version": "==0.8.2" + }, + "testpath": { + "hashes": [ + "sha256:46c89ebb683f473ffe2aab0ed9f12581d4d078308a3cb3765d79c6b2317b0109", + "sha256:b694b3d9288dbd81685c5d2e7140b81365d46c29f5db4bc659de5aa6b98780f8" + ], + "version": "==0.4.2" + }, + "tornado": { + "hashes": [ + "sha256:349884248c36801afa19e342a77cc4458caca694b0eda633f5878e458a44cb2c", + "sha256:398e0d35e086ba38a0427c3b37f4337327231942e731edaa6e9fd1865bbd6f60", + "sha256:4e73ef678b1a859f0cb29e1d895526a20ea64b5ffd510a2307b5998c7df24281", + 
"sha256:559bce3d31484b665259f50cd94c5c28b961b09315ccd838f284687245f416e5", + "sha256:abbe53a39734ef4aba061fca54e30c6b4639d3e1f59653f0da37a0003de148c7", + "sha256:c845db36ba616912074c5b1ee897f8e0124df269468f25e4fe21fe72f6edd7a9", + "sha256:c9399267c926a4e7c418baa5cbe91c7d1cf362d505a1ef898fde44a07c9dd8a5" + ], + "version": "==6.0.3" + }, + "tqdm": { + "hashes": [ + "sha256:1be3e4e3198f2d0e47b928e9d9a8ec1b63525db29095cec1467f4c5a4ea8ebf9", + "sha256:7e39a30e3d34a7a6539378e39d7490326253b7ee354878a92255656dc4284457" + ], + "index": "pypi", + "version": "==4.35.0" + }, + "traitlets": { + "hashes": [ + "sha256:9c4bd2d267b7153df9152698efb1050a5d84982d3384a37b2c1f7723ba3e7835", + "sha256:c6cb5e6f57c5a9bdaa40fa71ce7b4af30298fbab9ece9815b5d995ab6217c7d9" + ], + "version": "==4.3.2" + }, + "urllib3": { + "hashes": [ + "sha256:b246607a25ac80bedac05c6f282e3cdaf3afb65420fd024ac94435cabe6e18d1", + "sha256:dbe59173209418ae49d485b87d1681aefa36252ee85884c31346debd19463232" + ], + "version": "==1.25.3" + }, + "wcwidth": { + "hashes": [ + "sha256:3df37372226d6e63e1b1e1eda15c594bca98a22d33a23832a90998faa96bc65e", + "sha256:f4ebe71925af7b40a864553f761ed559b43544f8f71746c2d756c7fe788ade7c" + ], + "version": "==0.1.7" + }, + "webencodings": { + "hashes": [ + "sha256:a0af1213f3c2226497a97e2b3aa01a7e4bee4f403f95be16fc9acd2947514a78", + "sha256:b36a1c245f2d304965eb4e0a82848379241dc04b865afcc4aab16748587e1923" + ], + "version": "==0.5.1" + }, + "widgetsnbextension": { + "hashes": [ + "sha256:079f87d87270bce047512400efd70238820751a11d2d8cb137a5a5bdbaf255c7", + "sha256:bd314f8ceb488571a5ffea6cc5b9fc6cba0adaf88a9d2386b93a489751938bcd" + ], + "version": "==3.5.1" + } + }, + "develop": {} +} diff --git a/README.md b/README.md new file mode 100644 index 0000000..d59aabe --- /dev/null +++ b/README.md @@ -0,0 +1,117 @@ +# Analysis of comments submitted to three FCC public dockets + +This repository contains data, code, and methodology supporting [BuzzFeed News' analysis of comments submitted to three 
Federal Communications Commission (FCC) dockets](https://www.buzzfeednews.com/article/jsvine/net-neutrality-fcc-fake-comments-impersonation), published October 3, 2019: + +- 17-108 ("Restoring Internet Freedom") +- 16-42 ("Expanding Consumers' Video Navigation Choices") +- 14-28 ("Protecting and Promoting the Open Internet") + +Please see below for further details. + +## Data Sources + +The data in this repository comes from several sources: + +### The FCC's Electronic Comment Filing System (ECFS) + +The ECFS is the FCC's public portal for searching and accessing comments submitted to the commission's dockets. BuzzFeed News used the website to download each individually-listed comment, for two of the dockets: [14-28](https://www.fcc.gov/ecfs/search/filings?date_disseminated=%5Bgte%5D2014-02-21%5Blte%5D2016-01-01&proceedings_name=14-28&sort=date_disseminated,ASC&submissiontype_description=COMMENT) and [16-42](https://www.fcc.gov/ecfs/search/filings?date_disseminated=%5Bgte%5D2016-02-23%5Blte%5D2018-10-01&proceedings_name=16-42&sort=date_disseminated,ASC&submissiontype_description=COMMENT). __Note__: Not all comments submitted to the FCC are individually listed; in some cases, an organization will submit a consolidated set of comments as a PDF, with signatures and/or commenters' information listed in that PDF. Because of the extraordinary variety and inconsistency of those files, BuzzFeed News did not disaggregate those comments. + +### The FCC's bulk download of Docket 17-108 comments + +On November 7, 2017, [the FCC released](https://ecfsapi.fcc.gov/file/11073095518421/DA-17-1089A1_Rcd.pdf) a "complete set of [Docket 17-108] filings submitted as of November 3, 2017"; BuzzFeed News used this download to examine docket-wide trends. 
+ +### Bulk uploads to Docket 17-108, via FOIA + +In response to two FOIA requests, the FCC provided to BuzzFeed News the files submitted to the agency's [bulk-upload system for Docket 17-108](https://www.fcc.gov/restoring-internet-freedom-comments-wc-docket-no-17-108), plus associated metadata indicating the uploader's Box.com account and the time of the upload. According to the FCC, it provided all such files submitted. Although the agency provided a template for the uploads, some of the files — typically the smallest ones, containing just one comment each — do not conform to it and could not be incorporated easily. Those comments, which represent an exceedingly small percentage of all bulk-uploaded comments, have not been included in this repository's data; in many cases, the corresponding comments appear also not to have been added to the FCC's public comment portal. In certain other cases, the upload files use non-standard column names. In cases where the intention appeared to be clear, BuzzFeed News fixed the column names and included the data. + +### haveibeenpwned.com + +[Have I Been Pwned](https://haveibeenpwned.com/) is a website and service that identifies whether any given email address has been exposed in any of hundreds of major data breaches. BuzzFeed News used [HIBP's application programming interface](https://haveibeenpwned.com/API/v3) to determine the most common breaches associated with various groups of email addresses. + +## Personal Information Minimization + +Because it appears that many of the comments in the data above were submitted without the consent of the named commenters, we have taken the following steps: + +- Removing all raw personal-information columns (name, physical address, etc.). + +- Replacing each distinct email address with a randomly-assigned unique identifier. (Specifically, a [version 4 UUID](https://www.cryptosys.net/pki/uuid-rfc4122.html).) 
+ +- Replacing each distinct email domain with a similar randomly-assigned unique identifier, except for very common domains. (Specifically the 36 domains that are associated with 10,000 or more unique email addresses in the Docket 17-108 comments.) + +- Replacing each distinct combination of name + location (first line of street address, city, state, ZIP code) with another UUID. Before converting to UUIDs, ZIP codes are converted to zero-padded five-digit representations, and all strings are lowercased. For instance: `John Doe, 123 Smith Street, New York, NY 01111` will receive the same UUID as `john doe, 123 SMITH STREET, New York, ny 1111`, but neither will match submissions that put him at `123 Smith St.` (with the abbreviation). + +## Data Files + +The process above produces the files listed below. Several are too large to host on GitHub, so BuzzFeed News has [uploaded them here](https://archive.org/details/fcc-comments-and-bulk-uploads). + +### Comment data + +These files contain selected fields from the comment data listed above: + +- `bulk-uploads-17-108-with-uuids.csv`: Docket 17-108 bulk uploads, via FOIA +- `comments-17-108-with-uuids.csv`: Docket 17-108, via FCC official download +- `comments-14-28-with-uuids.csv`: Docket 14-28, via FCC online portal +- `comments-16-42-with-uuids.csv`: Docket 16-42, via FCC online portal + +They contain the following columns: + +- `date`: The date of submission. +- `id_submission`: The ID the FCC has assigned to the comment. __Note__: Not available in `bulk-uploads-17-108-with-uuids.csv`, because the FCC assigns the IDs *after* they are uploaded. +- `comments`: The text of the comment. __Note__: This is sometimes modified by the FCC, for example by adding a filename or, as appears to be the case for some Docket 14-28 comments, removing boilerplate language. __Note__: Not included in `comments-17-108-with-uuids.csv` for file-size considerations, because this file is mainly used for domain-counts. 
+- `name_and_location`: The UUID (see above) corresponding to the name and address information provided with the comment. __Note__: Not included in `comments-17-108-with-uuids.csv`. +- `email_address`: The UUID (see above) corresponding to the email address provided with the comment. __Note__: In the FCC's commenting system, you don't have to control an email address to list it as the author of a comment. +- `email_address_nonstandard`: If the email address contains nonstandard characters (such as `%`) or formatting (such as lacking an `@` symbol), this value will be `1`; otherwise, it will be `0`. This is used to filter out likely-invalid addresses before checking them on Have I Been Pwned. +- `email_domain`: The domain of the email address, as a UUID unless it is one of the 36 domains described above. + +Additionally, `bulk-uploads-17-108-with-uuids.csv` contains the following columns: + +- `file`: The name of the file in which the comment was uploaded. +- `uploader`: The email address associated with the Box.com account that uploaded the file. + +### Breach data + +These files list the breaches, per Have I Been Pwned, for email addresses in randomized samples of the comments bulk-uploaded to Docket 17-108: + +- `breaches-17-108-bulk-uploads-sample.csv`: 1,000-address sample of each of the eight bulk-uploaders whose Docket 17-108 uploads contained at least 10,000 unique email addresses. +- `breaches-17-108-mb-sample.csv`: 10,000-address sample of Media Bridge's Docket 17-108 bulk-uploads. + + +They contain the following columns: + +- `email_address`: The UUID (see above) corresponding to the email address examined. +- `breach`: The name of the breach, [as returned by Have I Been Pwned](https://haveibeenpwned.com/API/v3). + +## Analysis + +The [`analyze-fcc-comments` notebook](notebooks/analyze-fcc-comments.ipynb) examines comments submitted to the three FCC dockets described above, the language used in them, and the timing of their submission. 
For Docket 17-108, the notebook also examines the email domains associated with the comments, as well as rates at which the email addresses in the bulk uploads overlap with those exposed in major data breaches. The notebook also examines the overlap between the contact information in Docket 16-42 and Docket 17-108. + +The [`analyze-mb-comment-structure` notebook](notebooks/analyze-mb-comment-structure.ipynb) examines the phrasing of the comments that Media Bridge submitted to Docket 17-108, and attempts to reverse-engineer the comments that use randomly-generated text. + +## Reproducibility + +The code running the analysis is written in Python 3, and requires the following Python libraries: + +- [jupyter](https://jupyter.org/) to run the notebook infrastructure +- [pandas](https://pandas.pydata.org/) for data loading and analysis + +If you would like to reuse the code for fetching data from Have I Been Pwned's API, you will also need these Python libraries: + +- [requests](https://2.python-requests.org/en/master/) for HTTP requests +- [requests-cache](https://requests-cache.readthedocs.io/en/latest/) for caching HTTP requests +- [tqdm](https://tqdm.github.io) for progress bars + +If you use Pipenv, you can install all required libraries with `pipenv install`. + +As noted above, you will need to download the source data separately. Save the folder as this repository's `/data` directory. + +Execute the notebooks in the `notebooks/` directory to reproduce the findings. + +## Licensing + +All code in this repository is available under the [MIT License](https://opensource.org/licenses/MIT). + +## Questions / Feedback + +Contact Jeremy Singer-Vine at [jeremy.singer-vine@buzzfeed.com](mailto:jeremy.singer-vine@buzzfeed.com). + +Looking for more from BuzzFeed News? 
[Click here for a list of our open-sourced projects, data, and code.](https://github.com/BuzzFeedNews/everything) diff --git a/data/.keep b/data/.keep new file mode 100644 index 0000000..e69de29 diff --git a/notebooks/analyze-fcc-comments.ipynb b/notebooks/analyze-fcc-comments.ipynb new file mode 100644 index 0000000..b51f557 --- /dev/null +++ b/notebooks/analyze-fcc-comments.ipynb @@ -0,0 +1,5491 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Analysis of comments in three FCC dockets\n", + "\n", + "This notebook contains Python code that runs the following steps:\n", + "\n", + "- Loading the four comment datasets under analysis (published comments for FCC dockets 14-28, 16-42, 17-108, plus bulk-uploaded comments for docket 17-108).\n", + "\n", + "\n", + "- Classifying the comments for dockets 14-28 and 16-42, based on the language used in them.\n", + "\n", + "\n", + "- Examining:\n", + "\n", + " - How often email addresses in the 17-108 bulk uploads appear in data breaches identified by [Have I Been Pwned](https://haveibeenpwned.com/)\n", + "\n", + " - The overlap between comments in docket 16-42 and bulk-uploaded comments in docket 17-108\n", + "\n", + " - The comments attributed to Annie Reeves vis-a-vis the timing and language used in American Commitment's docket 14-28 and docket 16-42 mass-comment campaigns.\n", + " \n", + "__Please see this repository's landing page and associated BuzzFeed News article (linked on the landing page) for context before continuing.__" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Import Python libraries and set key variables" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# Standard libraries\n", + "import os\n", + "import sys\n", + "import time\n", + "import re\n", + "\n", + "# External libraries\n", + "import pandas as 
pd" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# Change this to True if you plan to reuse this notebook\n", + "# and want to make HTTP requests to Have I Been Pwned's API\n", + "\n", + "MAKE_HTTP_REQUESTS = False\n", + "\n", + "if MAKE_HTTP_REQUESTS:\n", + " from tqdm.auto import tqdm\n", + " import requests\n", + " import requests_cache\n", + " \n", + " # This is the API key for Have I Been Pwned\n", + " HIBP_KEY = open(\"../hibp-key.txt\").read().strip()\n", + " \n", + " # Enables graphical progress bars when fetching HIPB data\n", + " tqdm.pandas()\n", + " \n", + " # For caching HTTP requests\n", + " requests_cache.install_cache(\n", + " \"../hibp-requests-cache\",\n", + " allowable_codes = (200, 404),\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "BASE_PATH = \"../data/\"\n", + "\n", + "# In the sampling procedures below, we use this \"random state\"\n", + "# to make the samples reproducible. \n", + "RANDOM_STATE = 0" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Load comments" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "def load_comments(path, **kwargs):\n", + " return (\n", + " pd.read_csv(\n", + " path,\n", + " dtype = str,\n", + " **kwargs\n", + " )\n", + " .astype({\n", + " \"email_address_nonstandard\": int\n", + " })\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Docket 17-108, bulk uploads" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
datecommentsfileuploaderemail_address_nonstandardemail_addressemail_domainname_and_location
05/8/2017Dear FCC, I am am writing today to SUPPORT net...ecfs-input-template-17-108 (209).csvkathleenkintz@gmail.com09f664e24-96aa-4d96-b453-24d926658b47gmail.com5100f64f-b025-467f-9aa6-0100fa615ae6
112/31/2017Dear FCC, I am writing you today because I spe...ecfs-input-template-17-108 (120).csvvgboy522@gmail.com0818761bf-4c51-4970-95e6-11b01bac631fgmail.comdda3bd6b-9ad2-42d0-af15-f12c0b8a9354
25/16/17Obama's Federal Communications Commission (FCC...TPA_3911_2017526.csvesmisc@mac.com0f2cf802f-0c01-4d1f-b28f-0efef2a053bahotmail.comd9b96c36-796e-45d1-97d8-00647ae09d89
35/16/17Obama's Federal Communications Commission (FCC...TPA_3911_2017526.csvesmisc@mac.com06966ae39-6da6-4a47-a1ec-7dc854030634gmail.comf6d75f39-e952-41ff-b7a9-3d86da811496
45/16/17Obama's Federal Communications Commission (FCC...TPA_3911_2017526.csvesmisc@mac.com0610afa24-f0df-44ff-b621-f545d371efabgmail.com1b3050d5-6f3a-495e-a67f-b3b61040fe02
\n", + "
" + ], + "text/plain": [ + " date comments \\\n", + "0 5/8/2017 Dear FCC, I am am writing today to SUPPORT net... \n", + "1 12/31/2017 Dear FCC, I am writing you today because I spe... \n", + "2 5/16/17 Obama's Federal Communications Commission (FCC... \n", + "3 5/16/17 Obama's Federal Communications Commission (FCC... \n", + "4 5/16/17 Obama's Federal Communications Commission (FCC... \n", + "\n", + " file uploader \\\n", + "0 ecfs-input-template-17-108 (209).csv kathleenkintz@gmail.com \n", + "1 ecfs-input-template-17-108 (120).csv vgboy522@gmail.com \n", + "2 TPA_3911_2017526.csv esmisc@mac.com \n", + "3 TPA_3911_2017526.csv esmisc@mac.com \n", + "4 TPA_3911_2017526.csv esmisc@mac.com \n", + "\n", + " email_address_nonstandard email_address \\\n", + "0 0 9f664e24-96aa-4d96-b453-24d926658b47 \n", + "1 0 818761bf-4c51-4970-95e6-11b01bac631f \n", + "2 0 f2cf802f-0c01-4d1f-b28f-0efef2a053ba \n", + "3 0 6966ae39-6da6-4a47-a1ec-7dc854030634 \n", + "4 0 610afa24-f0df-44ff-b621-f545d371efab \n", + "\n", + " email_domain name_and_location \n", + "0 gmail.com 5100f64f-b025-467f-9aa6-0100fa615ae6 \n", + "1 gmail.com dda3bd6b-9ad2-42d0-af15-f12c0b8a9354 \n", + "2 hotmail.com d9b96c36-796e-45d1-97d8-00647ae09d89 \n", + "3 gmail.com f6d75f39-e952-41ff-b7a9-3d86da811496 \n", + "4 gmail.com 1b3050d5-6f3a-495e-a67f-b3b61040fe02 " + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "bulk_uploads_17_108 = load_comments(BASE_PATH + \"bulk-uploads-17-108-with-uuids.csv\")\n", + "\n", + "bulk_uploads_17_108.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Examine bulk-uploader metrics for 17-108" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
submissionsunique_emailsprop_with_email
uploader
esmisc@mac.com434797939660161.0000
shane@mediabridgellc.com185655315011451.0000
mike@fightforthefuture.org14644231296820.2464
karen@momsrising.org1069368178700.0362
dutch@freepress.net52860730.0000
kurt@demandprogress.org4127922903721.0000
fccfreedom@hmamail.com2070071222521.0000
advocacy@mozilla.com8292600.0000
action@aclu.org4873300.0000
meaghan@mandatemedia.com17317162671.0000
ncatalano@ofa.us12230122301.0000
\n", + "
" + ], + "text/plain": [ + " submissions unique_emails prop_with_email\n", + "uploader \n", + "esmisc@mac.com 4347979 3966016 1.0000\n", + "shane@mediabridgellc.com 1856553 1501145 1.0000\n", + "mike@fightforthefuture.org 1464423 129682 0.2464\n", + "karen@momsrising.org 1069368 17870 0.0362\n", + "dutch@freepress.net 528607 3 0.0000\n", + "kurt@demandprogress.org 412792 290372 1.0000\n", + "fccfreedom@hmamail.com 207007 122252 1.0000\n", + "advocacy@mozilla.com 82926 0 0.0000\n", + "action@aclu.org 48733 0 0.0000\n", + "meaghan@mandatemedia.com 17317 16267 1.0000\n", + "ncatalano@ofa.us 12230 12230 1.0000" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "uploader_metrics = (\n", + " bulk_uploads_17_108\n", + " .assign(\n", + " prop_with_email = lambda df: df[\"email_address\"].notnull()\n", + " )\n", + " .groupby(\"uploader\")\n", + " .pipe(lambda grp: pd.DataFrame({\n", + " \"submissions\": grp.size(),\n", + " \"unique_emails\": grp[\"email_address\"].nunique(), \n", + " \"prop_with_email\": grp[\"prop_with_email\"].mean().round(4),\n", + " }))\n", + ")\n", + " \n", + "(\n", + " uploader_metrics\n", + " .sort_values(\"submissions\", ascending = False)\n", + " .loc[lambda df: df[\"submissions\"] >= 10000]\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Docket 17-108, all comments" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
id_submissiondateemail_address_nonstandardemail_addressemail_domain
0042729726191492017-04-270NaNNaN
104275479249542017-04-270NaNNaN
2104279181179872017-04-2701f4830aa-726c-4206-9bef-cb3f2a57bb20gmail.com
3104270805306672017-04-270f10d9c2b-2c98-44c2-9c7a-fe57b96930d8gmail.com
410427091100342017-04-270a6609a29-4b4c-4857-9e42-a886f61b8aaad6b158e4-d116-4944-ab88-73091f1fc465
\n", + "
" + ], + "text/plain": [ + " id_submission date email_address_nonstandard \\\n", + "0 04272972619149 2017-04-27 0 \n", + "1 0427547924954 2017-04-27 0 \n", + "2 10427918117987 2017-04-27 0 \n", + "3 10427080530667 2017-04-27 0 \n", + "4 1042709110034 2017-04-27 0 \n", + "\n", + " email_address email_domain \n", + "0 NaN NaN \n", + "1 NaN NaN \n", + "2 1f4830aa-726c-4206-9bef-cb3f2a57bb20 gmail.com \n", + "3 f10d9c2b-2c98-44c2-9c7a-fe57b96930d8 gmail.com \n", + "4 a6609a29-4b4c-4857-9e42-a886f61b8aaa d6b158e4-d116-4944-ab88-73091f1fc465 " + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "comments_17_108 = (\n", + " load_comments(BASE_PATH + \"comments-17-108-with-uuids.csv\")\n", + " .assign(date = lambda df: df[\"date\"].str.slice(0, 10))\n", + ")\n", + "\n", + "comments_17_108.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Examine email domains attributed to these comments" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
countunique_addresses
email_domain
gmail.com50006874160788
yahoo.com25368922126544
hotmail.com673018571156
aol.com632971508087
pornhub.com1030003233516
comcast.net208512158939
icloud.com10644291091
msn.com11005689398
hurra.de36335788571
outlook.com7941167890
att.net7982360640
live.com7013959210
sbcglobal.net7012651206
yahoo.fr9338948034
ymail.com4503637515
bellsouth.net4043232155
cox.net4013731260
verizon.net4193329236
yahoo.de9797728310
mail.ru6060824570
me.com2600019559
charter.net2442518487
einrot.com79314817091
gustr.com76901016813
rhyta.com77375716756
jourrapide.com78265016746
armyspy.com78066416741
dayrep.com77002316733
superrito.com76749516684
teleworm.us76548816673
cuvox.de77590416623
fleckens.hu77609216600
mail.com1639214657
rocketmail.com1711214266
windstream.net1349611107
earthlink.net1806811088
\n", + "
" + ], + "text/plain": [ + " count unique_addresses\n", + "email_domain \n", + "gmail.com 5000687 4160788\n", + "yahoo.com 2536892 2126544\n", + "hotmail.com 673018 571156\n", + "aol.com 632971 508087\n", + "pornhub.com 1030003 233516\n", + "comcast.net 208512 158939\n", + "icloud.com 106442 91091\n", + "msn.com 110056 89398\n", + "hurra.de 363357 88571\n", + "outlook.com 79411 67890\n", + "att.net 79823 60640\n", + "live.com 70139 59210\n", + "sbcglobal.net 70126 51206\n", + "yahoo.fr 93389 48034\n", + "ymail.com 45036 37515\n", + "bellsouth.net 40432 32155\n", + "cox.net 40137 31260\n", + "verizon.net 41933 29236\n", + "yahoo.de 97977 28310\n", + "mail.ru 60608 24570\n", + "me.com 26000 19559\n", + "charter.net 24425 18487\n", + "einrot.com 793148 17091\n", + "gustr.com 769010 16813\n", + "rhyta.com 773757 16756\n", + "jourrapide.com 782650 16746\n", + "armyspy.com 780664 16741\n", + "dayrep.com 770023 16733\n", + "superrito.com 767495 16684\n", + "teleworm.us 765488 16673\n", + "cuvox.de 775904 16623\n", + "fleckens.hu 776092 16600\n", + "mail.com 16392 14657\n", + "rocketmail.com 17112 14266\n", + "windstream.net 13496 11107\n", + "earthlink.net 18068 11088" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "email_domains_17_108 = (\n", + " comments_17_108\n", + " .groupby([ \"email_domain\" ])\n", + " .pipe(lambda grp: pd.DataFrame({\n", + " \"count\": grp.size(),\n", + " \"unique_addresses\": grp[\"email_address\"].nunique()\n", + " }))\n", + " .sort_values([ \"count\", \"unique_addresses\" ], ascending = False)\n", + ")\n", + "\n", + "(\n", + " email_domains_17_108\n", + " .loc[lambda df: df[\"unique_addresses\"] >= 10000]\n", + " .sort_values(\"unique_addresses\", ascending = False)\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here we count the comments and unique email addresses associated with FakeMailGenerator.com:" + ] + }, + { + "cell_type": "code", + 
"execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "FAKEMAIL_DOMAINS = [\n", + " \"einrot.com\",\n", + " \"jourrapide.com\",\n", + " \"armyspy.com\",\n", + " \"fleckens.hu\",\n", + " \"cuvox.de\",\n", + " \"rhyta.com\",\n", + " \"dayrep.com\",\n", + " \"gustr.com\",\n", + " \"superrito.com\",\n", + " \"teleworm.us\",\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
count
count7754231
unique_addresses167460
\n", + "
" + ], + "text/plain": [ + " count\n", + "count 7754231\n", + "unique_addresses 167460" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "(\n", + " email_domains_17_108\n", + " .loc[FAKEMAIL_DOMAINS]\n", + " [[\"count\", \"unique_addresses\"]]\n", + " .sum()\n", + " .to_frame(\"count\")\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Docket 16-42" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
dateid_submissioncommentsemail_address_nonstandardemail_addressemail_domainname_and_location
02016-02-196000148370260001515146.txtThank you! Very pleased to see...0NaNNaN8ad10c4e-1354-42ba-83f1-be6b3c89f331
12016-02-226000148431760001843102.txt[5/23/2016 7:55:30 PM]The excha...0310d8308-43a0-4b84-93dc-6662acdef829gmail.comd984ccab-11bb-4994-bcfe-f0d407fd03b5
22016-02-256000148687660001518518.txtPlease eliminate the cable TV b...07e6087df-a7a2-414f-8ebb-3be229805becyahoo.com298bb4d9-8130-4ced-86e9-6a5d0c740c66
32016-02-2760001489444I?support?the?FCC?allowing?homeowners?to?be?fr...0NaNNaN12614861-1a8f-4313-aeff-2366bcf18ca8
42016-02-296000149208360001523826.txtAs a consumer, I agree with the...0NaNNaN10268573-9386-42c7-ab31-4d76641e76ed
\n", + "
" + ], + "text/plain": [ + " date id_submission \\\n", + "0 2016-02-19 60001483702 \n", + "1 2016-02-22 60001484317 \n", + "2 2016-02-25 60001486876 \n", + "3 2016-02-27 60001489444 \n", + "4 2016-02-29 60001492083 \n", + "\n", + " comments \\\n", + "0 60001515146.txtThank you! Very pleased to see... \n", + "1 60001843102.txt[5/23/2016 7:55:30 PM]The excha... \n", + "2 60001518518.txtPlease eliminate the cable TV b... \n", + "3 I?support?the?FCC?allowing?homeowners?to?be?fr... \n", + "4 60001523826.txtAs a consumer, I agree with the... \n", + "\n", + " email_address_nonstandard email_address \\\n", + "0 0 NaN \n", + "1 0 310d8308-43a0-4b84-93dc-6662acdef829 \n", + "2 0 7e6087df-a7a2-414f-8ebb-3be229805bec \n", + "3 0 NaN \n", + "4 0 NaN \n", + "\n", + " email_domain name_and_location \n", + "0 NaN 8ad10c4e-1354-42ba-83f1-be6b3c89f331 \n", + "1 gmail.com d984ccab-11bb-4994-bcfe-f0d407fd03b5 \n", + "2 yahoo.com 298bb4d9-8130-4ced-86e9-6a5d0c740c66 \n", + "3 NaN 12614861-1a8f-4313-aeff-2366bcf18ca8 \n", + "4 NaN 10268573-9386-42c7-ab31-4d76641e76ed " + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "comments_16_42 = (\n", + " load_comments(BASE_PATH + \"comments-16-42-with-uuids.csv\")\n", + " .assign(date = lambda df: df[\"date\"].str.slice(0, 10))\n", + ") \n", + "\n", + "comments_16_42.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Docket 14-28" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
dateid_submissioncommentsemail_address_nonstandardemail_addressemail_domainname_and_location
02014-02-2160175898537521074305.txt Reclassify The Internet As A Co...0NaNNaNa0fad65b-1482-427d-b300-da8e63d14272
12014-02-2160175898667521074318.txt Reclassify The Internet As A Co...0NaNNaN45954c7c-d52d-48f0-a252-343b4f82e509
22014-02-2160175899037521074355.txt Reclassify The Internet As A Co...0NaNNaN0a1dea8a-3ae6-434f-be03-4b6447c3190c
32014-02-2160175899047521074356.txt Reclassify The Internet As A Co...0NaNNaN5cb3b14f-71a4-4763-8bd1-40ad440c5eb8
42014-02-2160175899247521074376.txt Reclassify The Internet As A Co...0NaNNaN0de47b78-a256-4e36-93ad-1f4830b07c48
\n", + "
" + ], + "text/plain": [ + " date id_submission \\\n", + "0 2014-02-21 6017589853 \n", + "1 2014-02-21 6017589866 \n", + "2 2014-02-21 6017589903 \n", + "3 2014-02-21 6017589904 \n", + "4 2014-02-21 6017589924 \n", + "\n", + " comments \\\n", + "0 7521074305.txt Reclassify The Internet As A Co... \n", + "1 7521074318.txt Reclassify The Internet As A Co... \n", + "2 7521074355.txt Reclassify The Internet As A Co... \n", + "3 7521074356.txt Reclassify The Internet As A Co... \n", + "4 7521074376.txt Reclassify The Internet As A Co... \n", + "\n", + " email_address_nonstandard email_address email_domain \\\n", + "0 0 NaN NaN \n", + "1 0 NaN NaN \n", + "2 0 NaN NaN \n", + "3 0 NaN NaN \n", + "4 0 NaN NaN \n", + "\n", + " name_and_location \n", + "0 a0fad65b-1482-427d-b300-da8e63d14272 \n", + "1 45954c7c-d52d-48f0-a252-343b4f82e509 \n", + "2 0a1dea8a-3ae6-434f-be03-4b6447c3190c \n", + "3 5cb3b14f-71a4-4763-8bd1-40ad440c5eb8 \n", + "4 0de47b78-a256-4e36-93ad-1f4830b07c48 " + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "comments_14_28 = (\n", + " load_comments(BASE_PATH + \"comments-14-28-with-uuids.csv\")\n", + " .assign(date = lambda df: df[\"date\"].str.slice(0, 10))\n", + ") \n", + "\n", + "comments_14_28.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Classify comments\n", + "\n", + "In this step, we create derivative dataframes that classify each comment based on the language used in them. (Note: Because the formatting of comments can be inconsistent, the classification approach ignores whitespace.)\n", + "\n", + "The classifier takes a series of texts and a series of patterns to look for. Each text is labeled based on the __first__ pattern it matches, based on the sequential order of the patterns; if the text matches no pattern, it is labeled `[other]`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "def classify(texts, patterns):\n", + " # Create a copy of the texts and remove whitespace\n", + " s = texts.copy().str.replace(r\"\\s+\", \"\")\n", + " \n", + " # Remove whitespace from classification patterns\n", + " without_whitespace = [ (re.sub(r\"\\s+\", \"\", pat), val)\n", + " for pat, val in patterns ]\n", + " \n", + " # An empty series, indexed identically to the original texts.\n", + " ix = pd.Series(None, index = texts.index)\n", + "\n", + " # As we progress through the matching, we will gradually\n", + " # fill `ix` in with the matches we've found.\n", + " \n", + " # Iterate through the classification patterns\n", + " for pat, val in without_whitespace:\n", + " # Determine which texts match\n", + " search_result = s.str.contains(pat, na = False)\n", + " matches = search_result.loc[lambda x: x == True]\n", + " \n", + " # For matches, update `ix` to indicate the pattern ID/description\n", + " ix.loc[matches.index] = val\n", + " \n", + " # Subset `s` so that it only contains unmatched texts\n", + " s = s.loc[s.index.difference(matches.index)]\n", + "\n", + " return ix.fillna(\"[other]\")" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "def add_classification(df, patterns):\n", + " return (\n", + " df\n", + " .assign(group = lambda df: (\n", + " df[\"comments\"]\n", + " .pipe(classify, patterns)\n", + " ))\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "def print_example_comments(df, n = 3, max_chars = 500):\n", + " for grp, subdf in df.groupby(\"group\"):\n", + " print(f\"=== {grp} ===\\n\")\n", + " \n", + " examples = (\n", + " subdf[\"comments\"]\n", + " .sample(n, random_state = RANDOM_STATE)\n", + " .pipe(lambda x: pd.np.where(\n", + " x.apply(len) > max_chars,\n", + " x.str.slice(0, max_chars) + 
\"[...]\",\n", + " x\n", + " ))\n", + " )\n", + " \n", + " print(\"\\n\\n\".join(examples) + \"\\n\\n\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Docket 14-28\n", + "\n", + "BuzzFeed News identified the phrases below based on extensive examination of the 14-28 docket, and by cross-referencing them with [this December 2014 Sunlight Foundation analysis](http://web.archive.org/web/20150301070951/http://sunlightfoundation.com/blog/2014/12/16/one-group-dominates-the-second-round-of-net-neutrality-comments/).\n", + "\n", + "The `AC-` comments use language from American Commitment's comment campaign. It is possible that entities other than American Commitment submitted comments that used the same language. \n", + "\n", + "Note: The final phrase in the list below also appears alongside some of the other permutations; but because it is the final phrase in the list, only comments that don't match the other phrasings receive this classification.\n", + "\n", + "Please see this repository's landing page, and the associated BuzzFeed News article, for additional context. 
(E.g., not all comments are individually retrievable from the FCC's public portal.)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "ac_patterns_main = [\n", + " # Earlier set of comments\n", + " \"The federal government can use their power over the internet to direct content\", \n", + " \"because of high barriers to entry and a disastrous lack of competition\", \n", + " \"federal bureaucrats will slow down the process and protect prevailing interests first\", \n", + " \"blossoming in America today, largely due to the internet\", \n", + " \"Government will make it impossible for internet providers to upgrade service\", \n", + " \"government will naturally favor entrenched special interests, rather than upstart companies\", \n", + " \"will begin to be mismanaged, like many other government-run industries\", \n", + " \"put directly in the hands of bureaucrats, instead of the free market\", \n", + " \"will result in worse service, even as costs continue to skyrocket\", \n", + " \"Our options for cheap, high-speed, high-performing internet providers\", \n", + "\n", + " # Later set of comments\n", + " \"Left-wing extremists have been crying wolf\", \n", + " \"The federal government needs to keep its hands off the Internet\", \n", + " \"Before the FCC places regulatory handcuffs on Internet providers\", \n", + " \"The notion that the internet is broken and needs repair is simply not true\", \n", + " \"will send the crown jewel of the US economy into an economic tailspin\", \n", + " \"no longer acting in the interests of the American people\", \n", + " \"just another slow-moving government-controlled mess\", \n", + " \"defend ourselves against power-hungry bureaucrats\", \n", + " \"simply another attempt by the federal government to take control of another sector of the economy\", \n", + " \"Millions of liberal fools demanding you reduce the Internet\", \n", + " \"FCC is clearly ignoring the will of the
American people\", \n", + " \"devastate private investment with the force of an atomic bomb\", \n", + " \"without being slowed by bureaucratic inertia\", \n", + " \"A small fringe of the extremist left has been demanding\", \n", + " \"a tiny minority of far-left political activists\", \n", + " \"ultimate goal is to get rid of the media capitalists\", \n", + " \"created economic and human wreckage in their wake\", \n", + " \"increase its own power at the expense of the free people\", \n", + " \"it will have proven itself to be an unaccountable agency\", \n", + " \"subjecting it to 1930s-style regulations meant for telephone monopolies\", \n", + " \"Government regulation of Internet services would chase investment\", \n", + " \"it will seriously degrade the Internet we have\", \n", + " \"it can and should suffer the consequences\", \n", + " \"taking such reckless actions to gain control over the Internet\", \n", + " \"simply is no evidence to back up the dire claims of disaster\", \n", + "]\n", + "\n", + "ac_patterns_other = [\n", + " \"Like many Americans, I believe that the internet should remain free of government\",\n", + " \"As an American citizen, I wanted to voice my opposition to the FCC\",\n", + "]\n", + "\n", + "ac_pattern_desciptions = (\n", + " [ (p, f\"AC-{i:02d}\") for i, p in enumerate(ac_patterns_main) ] +\n", + " [ (p, f\"AC-other\") for p in ac_patterns_other ]\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Notes:\n", + "\n", + "- The `AC-XX` classification names below are based simply on the order in which they appear above. The numbers have no independent meaning.\n", + "\n", + "- The `AC-other` classification indicates that key language (the two phrases in `ac_patterns_other` above) from American Commitment appears in the comment, but not any of the other phrases." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "AC-00 1261\n", + "AC-01 1233\n", + "AC-02 1246\n", + "AC-03 1232\n", + "AC-04 1269\n", + "AC-05 1208\n", + "AC-06 1210\n", + "AC-07 1207\n", + "AC-08 1202\n", + "AC-09 1186\n", + "AC-10 25801\n", + "AC-11 25781\n", + "AC-12 25951\n", + "AC-13 26012\n", + "AC-14 25667\n", + "AC-15 25879\n", + "AC-16 25727\n", + "AC-17 26009\n", + "AC-18 25658\n", + "AC-19 25788\n", + "AC-20 25924\n", + "AC-21 25914\n", + "AC-22 25950\n", + "AC-23 25865\n", + "AC-24 25864\n", + "AC-25 25620\n", + "AC-26 26044\n", + "AC-27 25932\n", + "AC-28 25745\n", + "AC-29 26024\n", + "AC-30 25624\n", + "AC-31 25880\n", + "AC-32 25691\n", + "AC-33 25615\n", + "AC-34 25836\n", + "AC-other 6\n", + "[other] 1396620\n", + "Name: group, dtype: int64" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "comments_14_28_classified = (\n", + " comments_14_28\n", + " .pipe(\n", + " add_classification,\n", + " ac_pattern_desciptions\n", + " )\n", + ")\n", + "\n", + "(\n", + " comments_14_28_classified\n", + " [\"group\"]\n", + " .value_counts()\n", + " .sort_index()\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The total number of comments and unique email addresses for all `AC-`-classified comments above:" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
count
comments658061
unique_email_addresses551855
\n", + "
" + ], + "text/plain": [ + " count\n", + "comments 658061\n", + "unique_email_addresses 551855" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "(\n", + " comments_14_28_classified\n", + " .loc[lambda df: df[\"group\"] != \"[other]\"]\n", + " .pipe(lambda df: pd.Series({\n", + " \"comments\": len(df),\n", + " \"unique_email_addresses\": df[\"email_address\"].nunique()\n", + " }))\n", + " .to_frame(\"count\")\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Dates submitted\n", + "\n", + "The analysis below demonstrates the following:\n", + "\n", + "- Comments `AC-00`-`AC-09` share a similar distribution of dates submitted\n", + "- Comments `AC-10`-`AC-34` also share a similar distribution of dates submitted, but distinct from `AC-00`-`AC-09`\n", + "\n", + "Additional notes:\n", + "\n", + "- Dates below are `MM-DD`, for 2014\n", + "\n", + "- Dates include only those with at least 200 total `AC-` classified comments (overall), to reduce noise of stray dates that contain relatively few matching comments" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
date07-1407-1607-1709-1109-1209-1309-1409-1509-1609-1709-1809-1909-2209-2309-24
group
AC-0045909305000000000000
AC-0139905287000000000000
AC-0252888304000000000000
AC-0345893293000000000000
AC-0480902284000000000000
AC-0536872300000000000000
AC-0647906257000000000000
AC-0748859299000000000000
AC-0845854303000000000000
AC-0944878263000000000000
AC-100002478389645913510645821672143225631128011
AC-110002483391146133574636522042097231741187512
AC-120002481389745773550652722072197224761127912
AC-130002488390946043605643822522180241721238212
AC-140002470391145283541627822342168256711197511
AC-150002470389345723540645122692144254751177711
AC-160002477391745623541638521962144231691088312
AC-170002496390745093558651122492290231641057411
AC-180002498390345693584635221902067229551178012
AC-19000247539244525349165232183216525858947714
AC-200002481391045683572656221532150236861137912
AC-210002476390045733463649322972188239751138312
AC-220002486388645573522650722252240241821108113
AC-230002484391746123492644121712246235681087712
AC-240002469390846373511650721752169222751007812
AC-25000247239004562348264262099219124064957811
AC-260002491391645813575646222572210248821317813
AC-27000248338964599363064402195219224170957613
AC-280002475391445663542636622332140233621237511
AC-290002477389646513489656022132228244641127712
AC-300002501389645853554633221842100220561017813
AC-310002490390746173526643422072179245711078313
AC-320002477389646553462636721752161233561197811
AC-33000248339004544349463512205216121868987912
AC-340002477388645883572644121822173243741078112
AC-other000000000000000
\n", + "
" + ], + "text/plain": [ + "date 07-14 07-16 07-17 09-11 09-12 09-13 09-14 09-15 09-16 \\\n", + "group \n", + "AC-00 45 909 305 0 0 0 0 0 0 \n", + "AC-01 39 905 287 0 0 0 0 0 0 \n", + "AC-02 52 888 304 0 0 0 0 0 0 \n", + "AC-03 45 893 293 0 0 0 0 0 0 \n", + "AC-04 80 902 284 0 0 0 0 0 0 \n", + "AC-05 36 872 300 0 0 0 0 0 0 \n", + "AC-06 47 906 257 0 0 0 0 0 0 \n", + "AC-07 48 859 299 0 0 0 0 0 0 \n", + "AC-08 45 854 303 0 0 0 0 0 0 \n", + "AC-09 44 878 263 0 0 0 0 0 0 \n", + "AC-10 0 0 0 2478 3896 4591 3510 6458 2167 \n", + "AC-11 0 0 0 2483 3911 4613 3574 6365 2204 \n", + "AC-12 0 0 0 2481 3897 4577 3550 6527 2207 \n", + "AC-13 0 0 0 2488 3909 4604 3605 6438 2252 \n", + "AC-14 0 0 0 2470 3911 4528 3541 6278 2234 \n", + "AC-15 0 0 0 2470 3893 4572 3540 6451 2269 \n", + "AC-16 0 0 0 2477 3917 4562 3541 6385 2196 \n", + "AC-17 0 0 0 2496 3907 4509 3558 6511 2249 \n", + "AC-18 0 0 0 2498 3903 4569 3584 6352 2190 \n", + "AC-19 0 0 0 2475 3924 4525 3491 6523 2183 \n", + "AC-20 0 0 0 2481 3910 4568 3572 6562 2153 \n", + "AC-21 0 0 0 2476 3900 4573 3463 6493 2297 \n", + "AC-22 0 0 0 2486 3886 4557 3522 6507 2225 \n", + "AC-23 0 0 0 2484 3917 4612 3492 6441 2171 \n", + "AC-24 0 0 0 2469 3908 4637 3511 6507 2175 \n", + "AC-25 0 0 0 2472 3900 4562 3482 6426 2099 \n", + "AC-26 0 0 0 2491 3916 4581 3575 6462 2257 \n", + "AC-27 0 0 0 2483 3896 4599 3630 6440 2195 \n", + "AC-28 0 0 0 2475 3914 4566 3542 6366 2233 \n", + "AC-29 0 0 0 2477 3896 4651 3489 6560 2213 \n", + "AC-30 0 0 0 2501 3896 4585 3554 6332 2184 \n", + "AC-31 0 0 0 2490 3907 4617 3526 6434 2207 \n", + "AC-32 0 0 0 2477 3896 4655 3462 6367 2175 \n", + "AC-33 0 0 0 2483 3900 4544 3494 6351 2205 \n", + "AC-34 0 0 0 2477 3886 4588 3572 6441 2182 \n", + "AC-other 0 0 0 0 0 0 0 0 0 \n", + "\n", + "date 09-17 09-18 09-19 09-22 09-23 09-24 \n", + "group \n", + "AC-00 0 0 0 0 0 0 \n", + "AC-01 0 0 0 0 0 0 \n", + "AC-02 0 0 0 0 0 0 \n", + "AC-03 0 0 0 0 0 0 \n", + "AC-04 0 0 0 0 0 0 \n", + "AC-05 0 0 0 0 0 0 \n", + "AC-06 
0 0 0 0 0 0 \n", + "AC-07 0 0 0 0 0 0 \n", + "AC-08 0 0 0 0 0 0 \n", + "AC-09 0 0 0 0 0 0 \n", + "AC-10 2143 225 63 112 80 11 \n", + "AC-11 2097 231 74 118 75 12 \n", + "AC-12 2197 224 76 112 79 12 \n", + "AC-13 2180 241 72 123 82 12 \n", + "AC-14 2168 256 71 119 75 11 \n", + "AC-15 2144 254 75 117 77 11 \n", + "AC-16 2144 231 69 108 83 12 \n", + "AC-17 2290 231 64 105 74 11 \n", + "AC-18 2067 229 55 117 80 12 \n", + "AC-19 2165 258 58 94 77 14 \n", + "AC-20 2150 236 86 113 79 12 \n", + "AC-21 2188 239 75 113 83 12 \n", + "AC-22 2240 241 82 110 81 13 \n", + "AC-23 2246 235 68 108 77 12 \n", + "AC-24 2169 222 75 100 78 12 \n", + "AC-25 2191 240 64 95 78 11 \n", + "AC-26 2210 248 82 131 78 13 \n", + "AC-27 2192 241 70 95 76 13 \n", + "AC-28 2140 233 62 123 75 11 \n", + "AC-29 2228 244 64 112 77 12 \n", + "AC-30 2100 220 56 101 78 13 \n", + "AC-31 2179 245 71 107 83 13 \n", + "AC-32 2161 233 56 119 78 11 \n", + "AC-33 2161 218 68 98 79 12 \n", + "AC-34 2173 243 74 107 81 12 \n", + "AC-other 0 0 0 0 0 0 " + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "(\n", + " comments_14_28_classified\n", + " .loc[lambda df: df[\"group\"] != \"[other]\"]\n", + " .assign(\n", + " date = lambda df: df[\"date\"].str.slice(5, 10)\n", + " )\n", + " .groupby([\"group\", \"date\"])\n", + " .size()\n", + " .unstack()\n", + " .fillna(0)\n", + " .astype(int)\n", + " .loc[:, lambda df: df.sum() >= 200]\n", + " \n", + " # Order columns by date\n", + " .pipe(lambda df: df[[c for c in sorted(df.columns)]])\n", + " \n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Example comments" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "=== AC-10 ===\n", + "\n", + "The Internet is not broken, and does not need to be fixed. 
Left-wing extremists have been crying wolf for the past decade about the harm to the Internet if the Federal government didn?t regulate it. Not only were they wrong, but the Internet has exploded with innovation. Do not regulate the Internet. The best way to keep it open and free is what has kept it open and free all along ? no government intervention.\n", + "\n", + "The Internet is not broken, and does not need to be fixed. Left-wing extremists have been crying wolf for the past decade about the harm to the Internet if the Federal government didn?t regulate it. Not only were they wrong, but the Internet has exploded with innovation. Do not regulate the Internet. The best way to keep it open and free is what has kept it open and free all along ? no government intervention.\n", + "\n", + "The Internet is not broken, and does not need to be fixed. Left-wing extremists have been crying wolf for the past decade about the harm to the Internet if the Federal government didn?t regulate it. Not only were they wrong, but the Internet has exploded with innovation. Do not regulate the Internet. The best way to keep it open and free is what has kept it open and free all along ? no government intervention.\n", + "\n", + "\n", + "=== AC-27 ===\n", + "\n", + "The American people are watching a Federal Communications Commission (FCC) that is not seeking to make the Internet better, but instead seeking to regulate it. The FCC could improve broadband delivery by auctioning off much-needed spectrum. Or it could get rid of some of its own burdensome rules that prevent companies from attracting investors and innovating. But instead of doing these things that would improve the Internet, the FCC is wasting its time in an obsessive drive to regulate. This [...]\n", + "\n", + "The American people are watching a Federal Communications Commission (FCC) that is not seeking to make the Internet better, but instead seeking to regulate it. 
The FCC could improve broadband delivery by auctioning off much-needed spectrum. Or it could get rid of some of its own burdensome rules that prevent companies from attracting investors and innovating. But instead of doing these things that would improve the Internet, the FCC is wasting its time in an obsessive drive to regulate. This[...]\n", + "\n", + "7522706506.txtThe American people are watching a Federal Communications Commission (FCC) that is not seeking to make the Internet better, but instead seeking to regulate it. The FCC could improve broadband delivery by auctioning off much-needed spectrum. Or it could get rid of some of its own burdensome rules that prevent companies from attracting investors and innovating. But instead of doing these things that would improve the Internet, the FCC is wasting its time in an obsessive drive to r[...]\n", + "\n", + "\n", + "=== [other] ===\n", + "\n", + "Dear Chairman Wheeler:We are writing to urge you to implement strong and unambiguous net neutrality rules that protect the Internet from discrimination and other practices that will impede its ability to serve our democracy, empower consumers, and fuel economic growth. Erecting toll booths or designating fast lanes on the information superhighway wouldstifle free speech, limit consumer choice, and thwart innovation. The FCC must act in a clear and decisive way to ensure the Internet does not be[...]\n", + "\n", + "Dear Chairman Wheeler:We are writing to urge you to implement strong and unambiguous net neutrality rules that protect the Internet from discrimination and other practices that will impede its ability to serve our democracy, empower consumers, and fuel economic growth. Erecting toll booths or designating fast lanes on the information superhighway wouldstifle free speech, limit consumer choice, and thwart innovation. 
The FCC must act in a clear and decisive way to ensure the Internet does not be[...]\n", + "\n", + "7522187451.txtDear Chairman Wheeler:We are writing to urge you to implement strong and unambiguous net neutrality rules that protect the Internet from discrimination and other practices that will impede its ability to serve our democracy, empower consumers, and fuel economic growth. Erecting toll booths or designating fast lanes on the information superhighway wouldstifle free speech, limit consumer choice, and thwart innovation. The FCC must act in a clear and decisive way to ensure the Intern[...]\n", + "\n", + "\n" + ] + } + ], + "source": [ + "print_example_comments(\n", + " comments_14_28_classified\n", + " .loc[lambda df: df[\"group\"].isin([\"AC-10\", \"AC-27\", \"[other]\"])],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Compare the above timing and language for group `AC-27` to the [comment attributed to Annie Reeves](https://www.fcc.gov/ecfs/filing/6019076835), received by the FCC on September 15, 2014:\n", + "\n", + "> The American people are watching a Federal Communications Commission (FCC) that is not seeking to make the Internet better, but instead seeking to regulate it. The FCC could improve broadband delivery by auctioning off much-needed spectrum. Or it could get rid of some of its own burdensome rules that prevent companies from attracting investors and innovating. But instead of doing these things that would improve the Internet, the FCC is wasting its time in an obsessive drive to regulate. This tells the American people that once again, a Washington agency is working in aself-interested way to increase its own power at the expense of the free people it is meant to serve." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Docket 16-42\n", + "\n", + "Here, we identify two very large sets of comments in this docket, by searching for the short phrases below. 
Please see the associated BuzzFeed News article for context.\n", + "\n", + "The \"American Commitment\" set of comments is labeled as such because it uses language from [that organization's comment campaign](http://web.archive.org/web/20160403182941/https://www.americancommitment.org/cablebox-petition). (The text of the comments appears to be generated algorithmically, selecting randomly from sets of pre-selected words and phrases, but the phrase used here for classification is static — it does not change across the comments.) To be sure, it is possible the comments were submitted by entities other than American Commitment, using the same language; the FCC's public portal does not specify who submitted these comments. " + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'over $200' 104816\n", + "American Commitment 101783\n", + "[other] 75175\n", + "Name: group, dtype: int64" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "comments_16_42_classified = (\n", + " comments_16_42\n", + " .pipe(\n", + " add_classification,\n", + " [\n", + " (\"cloud-based video on demand, and apps providing news\", \"American Commitment\"),\n", + " (\"A cable subscriber pays over \\$200\", \"'over $200'\"),\n", + " ]\n", + " )\n", + ")\n", + "\n", + "(\n", + " comments_16_42_classified\n", + " [\"group\"]\n", + " .value_counts()\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Dates submitted, by two main groups of comments" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "2016-04-19 10499\n", + "2016-04-20 59247\n", + "2016-04-21 35070\n", + "Name: date, dtype: int64" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "(\n", + " comments_16_42_classified\n", + "
.loc[lambda df: df[\"group\"] == \"'over $200'\"]\n", + " [\"date\"]\n", + " .value_counts()\n", + " .sort_index()\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "2016-02-22 1\n", + "2016-05-16 12293\n", + "2016-05-17 55852\n", + "2016-05-18 33637\n", + "Name: date, dtype: int64" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "(\n", + " comments_16_42_classified\n", + " .loc[lambda df: df[\"group\"] == \"American Commitment\"]\n", + " [\"date\"]\n", + " .value_counts()\n", + " .sort_index()\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "*Note: The 2016-02-22 comment above appears to stem from a data-entry mistake on the FCC's website. There, [the comment](https://www.fcc.gov/ecfs/filing/60001484317)'s text seems to suggest that the language actually came from a [comment with ID 60001843102](https://www.fcc.gov/ecfs/filing/60001843102); that comment, in turn, says it was received on May 18, 2016.*" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Example comments" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "=== 'over $200' ===\n", + "\n", + "60001650840.txtA cable subscriber pays over $200 per year to rent a box from the cable companies that are already protected by government. This kills competition, limits consumer choice, and lifts up cable profits that are already excessive.Allow the free market to work and unlock the box to open competition and end the monopoly that cable companies have over our televisions.Page 1\n", + "\n", + "60001633497.txtA cable subscriber pays over $200 per year to rent a box from the cable companies that are already protected by government.
This kills competition, limits consumer choice, and lifts up cable profits that are already excessive.Allow the free market to work and unlock the box to open competition and end the monopoly that cable companies have over our televisions.Page 1\n", + "\n", + "60001621406.txtA cable subscriber pays over $200 per year to rent a box from the cable companies that are already protected by government. This kills competition, limits consumer choice, and lifts up cable profits that are already excessive.Allow the free market to work and unlock the box to open competition and end the monopoly that cable companies have over our televisions.Page 1\n", + "\n", + "\n", + "=== American Commitment ===\n", + "\n", + "60001870988.txtThe marketplace for video content is thriving and extremely competitive, offering a vast array of video streaming services, cloud-based video on demand, and apps providing news, cinema and programming. This market is rapidly innovating beyond thetraditional set-top box to new applications and devices with more choices than ever. Past government attempts to control set-top boxes have been a complete failure. Yet another failed attempt at top-down government regulation will only pu[...]\n", + "\n", + "60001888486.txtThe exchange for video content is booming and incredibly competitive, offering a wide array of video streaming services, cloud-based video on demand, and apps providing news, cinema and programming. This market is swiftly innovating beyond thetraditional set-top box to new applications and devices with more options than ever. Past Commission attempts to control set-top boxes have been a complete failure. Yet another failed attempt at one-size-fits-all government regulation will o[...]\n", + "\n", + "60001883996.txtThe exchange for video content is roaring and incredibly competitive, offering a vast array of video streaming services, cloud-based video on demand, and apps providing news, cinema and programming. 
This market is quickly innovating beyond thetraditional set-top box to new applications and devices with more options than ever. Past FCC attempts to regulate set-top boxes have been a complete failure. Yet another failed attempt at heavy-handed government regulation will only put the[...]\n", + "\n", + "\n", + "=== [other] ===\n", + "\n", + "60001976192.txtI oppose unnecessary set-top box regulations that will mean higher bills, fewer choices, and less privacy on TV. The television and video market today is full of great choices, why put such a healthy market at risk with complex and unnecessary new mandates?Page 1\n", + "\n", + "60001962194.txtI oppose unnecessary set-top box regulations that will mean higher bills, fewer choices, and less privacy on TV. The television and video market today is full of great choices, why put such a healthy market at risk with complex and unnecessary new mandates?Page 1\n", + "\n", + "60001991447.txtI oppose unnecessary set-top box regulations that will mean higher bills, fewer choices, and less privacy on TV. The television and video market today is full of great choices, why put such a healthy market at risk with complex and unnecessary new mandates?Page 1\n", + "\n", + "\n" + ] + } + ], + "source": [ + "print_example_comments(comments_16_42_classified)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Compare the above timing and language to the [comment attributed to Annie Reeves](https://www.fcc.gov/ecfs/filing/60001803771), received by the FCC on May 17, 2016:\n", + "\n", + "> The market for video content is booming and extremely competitive, offering a vast array of video streaming services, cloud-based video on demand, and apps providing news, cinema and programming. This market is swiftly innovating beyond the traditional set-top box to new applications and devices with more options than ever. Past Commission attempts to regulate set-top boxes have been a complete failure. 
Yet another failed attempt at heavy-handed government regulation will only stifle innovation and benefit companies with political influence rather than companies thatprovide what consumers want. We don't need the federal government to fix what isn'tbroken -- I urge you to reject the proposed rule." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Examine email address crossover between 16-42 and bulk-uploaded 17-108 comments\n", + "\n", + "Here, we calculate the proportion of commenters from docket 16-42 that later appeared in comments bulk-uploaded to docket 17-108, and observe a very high rate of overlap between the email addresses associated with comments that used American Commitment's language in docket 16-42 and the email addresses listed in comments bulk-uploaded by Media Bridge. We find the same for commenters' full names plus physical addresses." + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
num_emailsemail_isin_17_108_nonmbemail_isin_17_108_mbname_and_location_isin_17_108_mb
group
American Commitment1002520.02310.99870.9987
'over $200'1004820.02430.06010.0566
\n", + "
" + ], + "text/plain": [ + " num_emails email_isin_17_108_nonmb \\\n", + "group \n", + "American Commitment 100252 0.0231 \n", + "'over $200' 100482 0.0243 \n", + "\n", + " email_isin_17_108_mb name_and_location_isin_17_108_mb \n", + "group \n", + "American Commitment 0.9987 0.9987 \n", + "'over $200' 0.0601 0.0566 " + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "(\n", + " comments_16_42_classified\n", + " [[\n", + " \"email_address\",\n", + " \"name_and_location\",\n", + " \"group\",\n", + " ]]\n", + " .drop_duplicates()\n", + " .dropna()\n", + " .assign(\n", + " email_isin_17_108_nonmb = lambda df: (\n", + " df[\"email_address\"].notnull() & df[\"email_address\"].isin(\n", + " bulk_uploads_17_108\n", + " .loc[lambda df: df[\"uploader\"] != \"shane@mediabridgellc.com\"]\n", + " [\"email_address\"]\n", + " )\n", + " ),\n", + " email_isin_17_108_mb = lambda df: (\n", + " df[\"email_address\"].notnull() & df[\"email_address\"].isin(\n", + " bulk_uploads_17_108\n", + " .loc[lambda df: df[\"uploader\"] == \"shane@mediabridgellc.com\"]\n", + " [\"email_address\"]\n", + " )\n", + " ),\n", + " name_and_location_isin_17_108_mb = lambda df: (\n", + " df[\"name_and_location\"].isin(\n", + " bulk_uploads_17_108\n", + " .loc[lambda df: df[\"uploader\"] == \"shane@mediabridgellc.com\"]\n", + " [\"name_and_location\"]\n", + " )\n", + " ),\n", + " )\n", + " .groupby(\"group\")\n", + " .pipe(lambda grp: pd.DataFrame({\n", + " \"num_emails\": grp.size(),\n", + " \"email_isin_17_108_nonmb\": grp[\"email_isin_17_108_nonmb\"].mean().round(4),\n", + " \"email_isin_17_108_mb\": grp[\"email_isin_17_108_mb\"].mean().round(4),\n", + " \"name_and_location_isin_17_108_mb\": grp[\"name_and_location_isin_17_108_mb\"].mean().round(4),\n", + " }))\n", + " .loc[lambda df: df[\"num_emails\"] >= 1000]\n", + " .sort_values(\"email_isin_17_108_mb\", ascending = False)\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, +
"source": [ + "# Analyze 17-108 bulk-uploads vis-a-vis Have I Been Pwned\n", + "\n", + "In this section, we take random samples of email addresses from the comments bulk-uploaded to Docket 17-108, and calculate the rates at which they have appeared in the data breaches tracked by Have I Been Pwned. We focus on the accounts that uploaded comments containing 10,000+ distinct email addresses.\n", + "\n", + "*Note: The HIBP data has already been fetched and saved, but the code used to fetch the data is included here for reference, and for reuse by other researchers.*" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [], + "source": [ + "try:\n", + " uuid_lookup = pd.read_csv(\n", + " BASE_PATH + \"bulk-uploads-17-108-uuid-lookup.csv\",\n", + " dtype = str,\n", + " )\n", + " \n", + " assert uuid_lookup[\"email_address_uuid\"].value_counts().max() == 1\n", + " print(f\"{len(uuid_lookup):,d}\")\n", + "except:\n", + " uuid_lookup = pd.DataFrame(None, columns = [ \"email_address\", \"email_address_uuid\" ])" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [], + "source": [ + "BASE_HIBP_URL = \"https://haveibeenpwned.com/api/v3/breachedaccount/\"\n", + "from json import JSONDecodeError\n", + "\n", + "def fetch_hipb_results(email_address):\n", + " while True:\n", + " try:\n", + " res = requests.get(\n", + " f\"{BASE_HIBP_URL}{email_address.strip()}\",\n", + " headers = {\n", + " 'hibp-api-key': HIBP_KEY,\n", + " },\n", + " )\n", + " if res.from_cache == False:\n", + " time.sleep(1.5)\n", + "\n", + " # Check that JSON is parseable\n", + " if res.content != b\"\":\n", + " res.json()\n", + " if \"message\" in res.json():\n", + " raise Exception(\"HIPB error: {res.json()['message']}\")\n", + "\n", + " except requests.RequestException:\n", + " sys.stderr.write(f\"\\nException; sleeping for 10 seconds\\n\")\n", + " time.sleep(10) \n", + " continue\n", + " \n", + " except JSONDecodeError as e:\n",
+ " sys.stderr.write(f\"\\nERROR: <{email_address}>\\n\")\n", + " sys.stderr.write(f\"{e}\\n\")\n", + " sys.stderr.write(f\"{res.content}\\n\")\n", + " return [ { \"email_address\": email_address, \"breach\": \"[error]\" } ]\n", + "\n", + " if res.status_code == 429:\n", + " sleep_int = int(res.headers[\"Retry-After\"])\n", + " sys.stderr.write(f\"\\nSleeping for {sleep_int + 1} seconds\")\n", + " time.sleep(sleep_int)\n", + " continue\n", + " \n", + " if res.content == b\"\" or res.status_code == 404:\n", + " return [ { \"email_address\": email_address, \"breach\": \"[none]\" } ]\n", + "\n", + " else:\n", + " return [ { \"email_address\": email_address, \"breach\": x[\"Name\"] } for x in res.json() ]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The following function creates a sample — or a grouped set of samples — from a given set of comments. Before sampling, the code removes blank email addresses and those with non-standard characters." + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [], + "source": [ + "def create_sample(df, grouping = [], n = 1000, random_state = RANDOM_STATE):\n", + " clean = (\n", + " df\n", + " .loc[lambda df: df[\"email_address\"].notnull()]\n", + " .loc[lambda df: df[\"email_address_nonstandard\"] == 0]\n", + " .drop_duplicates(subset = [ \"email_address\" ] + grouping)\n", + " )\n", + " \n", + " sampler = lambda df: df.sample(n, random_state = random_state)\n", + " \n", + " if len(grouping):\n", + " return (\n", + " clean\n", + " .groupby(grouping)\n", + " .apply(sampler)\n", + " .reset_index(drop = True)\n", + " )\n", + " else:\n", + " return clean.pipe(sampler)" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [], + "source": [ + "def get_breaches(df, save_path, save = True, use_saved = True):\n", + " if use_saved and os.path.exists(save_path):\n", + " return pd.read_csv(save_path, dtype = str)\n", + " else:\n", + " 
breaches_raw = pd.concat(map(pd.DataFrame, (\n", + " df\n", + " .rename(columns = {\n", + " \"email_address\": \"email_address_uuid\"\n", + " })\n", + " .merge(\n", + " uuid_lookup,\n", + " how = \"left\",\n", + " on = [ \"email_address_uuid\" ]\n", + " )\n", + " [\"email_address\"]\n", + " .progress_apply(fetch_hipb_results)\n", + " ))).drop_duplicates()\n", + " \n", + " breaches = (\n", + " breaches_raw\n", + " .merge(\n", + " uuid_lookup,\n", + " how = \"left\",\n", + " on = [ \"email_address\" ]\n", + " )\n", + " .drop(columns = [ \"email_address\" ])\n", + " .rename(columns = {\n", + " \"email_address_uuid\": \"email_address\",\n", + " })\n", + " )\n", + " \n", + " if save:\n", + " breaches.to_csv(save_path, index = False)\n", + "\n", + " return breaches" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The function below calculates the breach rates for groups of sampled comments, for each breach found." + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [], + "source": [ + "def calculate_breach_rates(sample, breaches):\n", + " return (\n", + " sample\n", + " [[\n", + " \"email_address\",\n", + " \"uploader\",\n", + " ]]\n", + " \n", + " .merge(\n", + " breaches,\n", + " how = \"left\",\n", + " on = [ \"email_address\" ],\n", + " )\n", + " .assign(breached = 1)\n", + " .set_index([\n", + " \"uploader\",\n", + " \"email_address\",\n", + " \"breach\",\n", + " ])\n", + " [\"breached\"]\n", + " .unstack()\n", + " .fillna(0)\n", + " .astype(int)\n", + " # At this point, we have a matrix of uploader+email x breach\n", + " # where the values are 1 if breached and 0 if not\n", + " \n", + " # Now, we group by uploader and calculate the proportion of\n", + " # emails breached\n", + " .groupby([ \"uploader\" ])\n", + " .mean()\n", + " \n", + " # Then we return the data frame to a \"tidy\" format:\n", + " # uploader|breach|rate\n", + " .stack()\n", + " .sort_values(ascending = False)\n", + " 
.to_frame(\"rate\")\n", + " .reset_index()\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 17-108 by bulk uploader\n", + "\n", + "Limited here to the accounts that uploaded comments containing 10,000+ distinct email addresses." + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
datecommentsfileuploaderemail_address_nonstandardemail_addressemail_domainname_and_location
05/15/17In 2015, wealthy leftist billionaires and powe...FOI-14090-2017527.csvesmisc@mac.com0939bfae2-62d1-47de-b009-c2abc6b681f5yahoo.com8930069a-021b-4263-9c3b-a3923af9a9dc
18/5/17Before leaving office, the Obama Administratio...CFIF_1_25000_08052017_4 (1).csvesmisc@mac.com0f9a12339-56cb-4540-9adc-fc6238428f49gmail.com6c65ec31-5135-4500-99c5-bba309b415fb
28/6/17Before leaving office, the Obama Administratio...CFIF_1_25000_08062017_2.csvesmisc@mac.com0fcf0991a-0ed7-408b-8e52-4735baccd906yahoo.com6dfa9546-ad61-404a-bec7-48464be021b4
37/29/17Before leaving office, the Obama Administratio...CFIF_1_40000_07292017.csvesmisc@mac.com0ee33e2a5-854f-471b-adb1-1ff62d69bf46gmail.com6d99eb3f-9242-440d-be2a-c7f7ae3b4e91
45/9/17Obama's Federal Communications Commission (FCC...T2017510-2.csvesmisc@mac.com04d294840-8365-4d34-a5c4-c09f6b8bc01dicloud.comd132203a-a146-4043-b097-d6606498309f
\n", + "
" + ], + "text/plain": [ + " date comments \\\n", + "0 5/15/17 In 2015, wealthy leftist billionaires and powe... \n", + "1 8/5/17 Before leaving office, the Obama Administratio... \n", + "2 8/6/17 Before leaving office, the Obama Administratio... \n", + "3 7/29/17 Before leaving office, the Obama Administratio... \n", + "4 5/9/17 Obama's Federal Communications Commission (FCC... \n", + "\n", + " file uploader email_address_nonstandard \\\n", + "0 FOI-14090-2017527.csv esmisc@mac.com 0 \n", + "1 CFIF_1_25000_08052017_4 (1).csv esmisc@mac.com 0 \n", + "2 CFIF_1_25000_08062017_2.csv esmisc@mac.com 0 \n", + "3 CFIF_1_40000_07292017.csv esmisc@mac.com 0 \n", + "4 T2017510-2.csv esmisc@mac.com 0 \n", + "\n", + " email_address email_domain \\\n", + "0 939bfae2-62d1-47de-b009-c2abc6b681f5 yahoo.com \n", + "1 f9a12339-56cb-4540-9adc-fc6238428f49 gmail.com \n", + "2 fcf0991a-0ed7-408b-8e52-4735baccd906 yahoo.com \n", + "3 ee33e2a5-854f-471b-adb1-1ff62d69bf46 gmail.com \n", + "4 4d294840-8365-4d34-a5c4-c09f6b8bc01d icloud.com \n", + "\n", + " name_and_location \n", + "0 8930069a-021b-4263-9c3b-a3923af9a9dc \n", + "1 6c65ec31-5135-4500-99c5-bba309b415fb \n", + "2 6dfa9546-ad61-404a-bec7-48464be021b4 \n", + "3 6d99eb3f-9242-440d-be2a-c7f7ae3b4e91 \n", + "4 d132203a-a146-4043-b097-d6606498309f " + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sample_17_108_bulk_uploads = (\n", + " bulk_uploads_17_108\n", + " .loc[lambda df: df[\"uploader\"].isin(\n", + " uploader_metrics\n", + " .loc[lambda df: df[\"unique_emails\"] >= 10000]\n", + " .index\n", + " )]\n", + " .pipe(\n", + " create_sample,\n", + " grouping = [ \"uploader\" ],\n", + " n = 1000\n", + " )\n", + ")\n", + "\n", + "sample_17_108_bulk_uploads.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "esmisc@mac.com 1000\n", + "fccfreedom@hmamail.com 1000\n", + 
"karen@momsrising.org 1000\n", + "kurt@demandprogress.org 1000\n", + "meaghan@mandatemedia.com 1000\n", + "mike@fightforthefuture.org 1000\n", + "ncatalano@ofa.us 1000\n", + "shane@mediabridgellc.com 1000\n", + "Name: uploader, dtype: int64" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sample_17_108_bulk_uploads[\"uploader\"].value_counts().sort_index()" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
breachemail_address
08tracks939bfae2-62d1-47de-b009-c2abc6b681f5
1Animoto939bfae2-62d1-47de-b009-c2abc6b681f5
2MindJolt939bfae2-62d1-47de-b009-c2abc6b681f5
3ModernBusinessSolutions939bfae2-62d1-47de-b009-c2abc6b681f5
4RiverCityMedia939bfae2-62d1-47de-b009-c2abc6b681f5
\n", + "
" + ], + "text/plain": [ + " breach email_address\n", + "0 8tracks 939bfae2-62d1-47de-b009-c2abc6b681f5\n", + "1 Animoto 939bfae2-62d1-47de-b009-c2abc6b681f5\n", + "2 MindJolt 939bfae2-62d1-47de-b009-c2abc6b681f5\n", + "3 ModernBusinessSolutions 939bfae2-62d1-47de-b009-c2abc6b681f5\n", + "4 RiverCityMedia 939bfae2-62d1-47de-b009-c2abc6b681f5" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "breaches_17_108_bulk_uploads = (\n", + " sample_17_108_bulk_uploads\n", + " .pipe(get_breaches, \"../data/breaches-17-108-bulk-uploads-sample.csv\")\n", + ")\n", + "\n", + "breaches_17_108_bulk_uploads.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Most common breach-uploader combinations:" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
uploaderbreachrate
0shane@mediabridgellc.comModernBusinessSolutions0.942
1shane@mediabridgellc.comRiverCityMedia0.807
2fccfreedom@hmamail.comVerificationsIO0.782
3shane@mediabridgellc.comVerificationsIO0.743
4fccfreedom@hmamail.comRiverCityMedia0.645
5esmisc@mac.comVerificationsIO0.625
6esmisc@mac.comRiverCityMedia0.565
7fccfreedom@hmamail.comModernBusinessSolutions0.466
8ncatalano@ofa.usVerificationsIO0.463
9karen@momsrising.orgVerificationsIO0.459
10mike@fightforthefuture.orgVerificationsIO0.435
11meaghan@mandatemedia.comVerificationsIO0.435
12kurt@demandprogress.orgVerificationsIO0.412
13karen@momsrising.orgRiverCityMedia0.377
14meaghan@mandatemedia.comRiverCityMedia0.364
15ncatalano@ofa.usRiverCityMedia0.345
16esmisc@mac.comModernBusinessSolutions0.345
17mike@fightforthefuture.orgRiverCityMedia0.344
18ncatalano@ofa.usLinkedIn0.339
19kurt@demandprogress.orgRiverCityMedia0.323
\n", + "
" + ], + "text/plain": [ + " uploader breach rate\n", + "0 shane@mediabridgellc.com ModernBusinessSolutions 0.942\n", + "1 shane@mediabridgellc.com RiverCityMedia 0.807\n", + "2 fccfreedom@hmamail.com VerificationsIO 0.782\n", + "3 shane@mediabridgellc.com VerificationsIO 0.743\n", + "4 fccfreedom@hmamail.com RiverCityMedia 0.645\n", + "5 esmisc@mac.com VerificationsIO 0.625\n", + "6 esmisc@mac.com RiverCityMedia 0.565\n", + "7 fccfreedom@hmamail.com ModernBusinessSolutions 0.466\n", + "8 ncatalano@ofa.us VerificationsIO 0.463\n", + "9 karen@momsrising.org VerificationsIO 0.459\n", + "10 mike@fightforthefuture.org VerificationsIO 0.435\n", + "11 meaghan@mandatemedia.com VerificationsIO 0.435\n", + "12 kurt@demandprogress.org VerificationsIO 0.412\n", + "13 karen@momsrising.org RiverCityMedia 0.377\n", + "14 meaghan@mandatemedia.com RiverCityMedia 0.364\n", + "15 ncatalano@ofa.us RiverCityMedia 0.345\n", + "16 esmisc@mac.com ModernBusinessSolutions 0.345\n", + "17 mike@fightforthefuture.org RiverCityMedia 0.344\n", + "18 ncatalano@ofa.us LinkedIn 0.339\n", + "19 kurt@demandprogress.org RiverCityMedia 0.323" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "(\n", + " calculate_breach_rates(\n", + " sample_17_108_bulk_uploads,\n", + " breaches_17_108_bulk_uploads,\n", + " )\n", + " .head(20)\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Modern Business Solutions breaches only:" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
uploaderbreachrate
0shane@mediabridgellc.comModernBusinessSolutions0.942
7fccfreedom@hmamail.comModernBusinessSolutions0.466
16esmisc@mac.comModernBusinessSolutions0.345
116meaghan@mandatemedia.comModernBusinessSolutions0.114
121ncatalano@ofa.usModernBusinessSolutions0.106
130karen@momsrising.orgModernBusinessSolutions0.099
145kurt@demandprogress.orgModernBusinessSolutions0.087
150mike@fightforthefuture.orgModernBusinessSolutions0.086
\n", + "
" + ], + "text/plain": [ + " uploader breach rate\n", + "0 shane@mediabridgellc.com ModernBusinessSolutions 0.942\n", + "7 fccfreedom@hmamail.com ModernBusinessSolutions 0.466\n", + "16 esmisc@mac.com ModernBusinessSolutions 0.345\n", + "116 meaghan@mandatemedia.com ModernBusinessSolutions 0.114\n", + "121 ncatalano@ofa.us ModernBusinessSolutions 0.106\n", + "130 karen@momsrising.org ModernBusinessSolutions 0.099\n", + "145 kurt@demandprogress.org ModernBusinessSolutions 0.087\n", + "150 mike@fightforthefuture.org ModernBusinessSolutions 0.086" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "(\n", + " calculate_breach_rates(\n", + " sample_17_108_bulk_uploads,\n", + " breaches_17_108_bulk_uploads,\n", + " )\n", + " .loc[lambda df: df[\"breach\"] == \"ModernBusinessSolutions\"] \n", + " .head(20)\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Larger 17-108 Media Bridge sample (10,000 addresses), for more precise rates" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
datecommentsfileuploaderemail_address_nonstandardemail_addressemail_domainname_and_location
32365335/14/17Dear Chairman Pai, I am concerned about Inter...Batch-A4.csvshane@mediabridgellc.com044a8867c-3332-403f-9b34-560c054bd728gmail.com058cbe31-9c92-4509-a8d9-50f84a1cf1ae
27233705/15/17Dear Mr. Pai, Regarding the Obama takeover of...Batch-A2.csvshane@mediabridgellc.com0f9ad6e74-115c-4ccd-8a4e-f1e408423942icloud.com99c87532-c65e-42fa-97bd-0438d3ff504c
563375/14/17Chairman Pai: Hi, I'd like to comment on Titl...file-i.csvshane@mediabridgellc.com0ad374006-bbed-4c8b-932d-e7dfacce1a29aol.com6752ca9d-c848-4f3d-a66c-49073afe2458
59261805/16/2017The Title II order created a gaping gap in pri...batch-d-4.csvshane@mediabridgellc.com0c16e61f3-bd1f-4e50-95dd-830f5a219543gmail.comebccbc49-fa14-404f-bf83-b0ef00d48e78
101416905/15/2017Dear Chairman Pai, I'm very worried about Net...batch-b-5.csvshane@mediabridgellc.com0b252ca16-b5a2-4c61-9034-6e365bec0bebgmail.combc102bc2-d454-44a4-974d-d0b1a377f392
\n", + "
" + ], + "text/plain": [ + " date comments \\\n", + "3236533 5/14/17 Dear Chairman Pai, I am concerned about Inter... \n", + "2723370 5/15/17 Dear Mr. Pai, Regarding the Obama takeover of... \n", + "56337 5/14/17 Chairman Pai: Hi, I'd like to comment on Titl... \n", + "592618 05/16/2017 The Title II order created a gaping gap in pri... \n", + "1014169 05/15/2017 Dear Chairman Pai, I'm very worried about Net... \n", + "\n", + " file uploader email_address_nonstandard \\\n", + "3236533 Batch-A4.csv shane@mediabridgellc.com 0 \n", + "2723370 Batch-A2.csv shane@mediabridgellc.com 0 \n", + "56337 file-i.csv shane@mediabridgellc.com 0 \n", + "592618 batch-d-4.csv shane@mediabridgellc.com 0 \n", + "1014169 batch-b-5.csv shane@mediabridgellc.com 0 \n", + "\n", + " email_address email_domain \\\n", + "3236533 44a8867c-3332-403f-9b34-560c054bd728 gmail.com \n", + "2723370 f9ad6e74-115c-4ccd-8a4e-f1e408423942 icloud.com \n", + "56337 ad374006-bbed-4c8b-932d-e7dfacce1a29 aol.com \n", + "592618 c16e61f3-bd1f-4e50-95dd-830f5a219543 gmail.com \n", + "1014169 b252ca16-b5a2-4c61-9034-6e365bec0beb gmail.com \n", + "\n", + " name_and_location \n", + "3236533 058cbe31-9c92-4509-a8d9-50f84a1cf1ae \n", + "2723370 99c87532-c65e-42fa-97bd-0438d3ff504c \n", + "56337 6752ca9d-c848-4f3d-a66c-49073afe2458 \n", + "592618 ebccbc49-fa14-404f-bf83-b0ef00d48e78 \n", + "1014169 bc102bc2-d454-44a4-974d-d0b1a377f392 " + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sample_17_108_mb = (\n", + " bulk_uploads_17_108\n", + " .loc[lambda df: df[\"uploader\"] == \"shane@mediabridgellc.com\"]\n", + " .pipe(\n", + " create_sample,\n", + " n = 10000,\n", + " random_state = RANDOM_STATE + 1, # +1 so that we have an independent sample \n", + " )\n", + ")\n", + "\n", + "sample_17_108_mb.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
breachemail_address
0Edmodo44a8867c-3332-403f-9b34-560c054bd728
1ModernBusinessSolutions44a8867c-3332-403f-9b34-560c054bd728
2RiverCityMedia44a8867c-3332-403f-9b34-560c054bd728
3SpecialKSpamList44a8867c-3332-403f-9b34-560c054bd728
4VerificationsIO44a8867c-3332-403f-9b34-560c054bd728
\n", + "
" + ], + "text/plain": [ + " breach email_address\n", + "0 Edmodo 44a8867c-3332-403f-9b34-560c054bd728\n", + "1 ModernBusinessSolutions 44a8867c-3332-403f-9b34-560c054bd728\n", + "2 RiverCityMedia 44a8867c-3332-403f-9b34-560c054bd728\n", + "3 SpecialKSpamList 44a8867c-3332-403f-9b34-560c054bd728\n", + "4 VerificationsIO 44a8867c-3332-403f-9b34-560c054bd728" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "breaches_17_108_mb = (\n", + " sample_17_108_mb\n", + " .pipe(get_breaches, \"../data/breaches-17-108-mb-sample.csv\")\n", + ")\n", + "\n", + "breaches_17_108_mb.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
uploaderbreachrate
0shane@mediabridgellc.comModernBusinessSolutions0.9388
1shane@mediabridgellc.comRiverCityMedia0.8277
2shane@mediabridgellc.comVerificationsIO0.7651
3shane@mediabridgellc.comCollection10.2574
4shane@mediabridgellc.comExactis0.2571
5shane@mediabridgellc.comMySpace0.1968
6shane@mediabridgellc.comAntiPublic0.1956
7shane@mediabridgellc.comSpecialKSpamList0.1946
8shane@mediabridgellc.comOnlinerSpambot0.1941
9shane@mediabridgellc.comExploitIn0.1826
\n", + "
" + ], + "text/plain": [ + " uploader breach rate\n", + "0 shane@mediabridgellc.com ModernBusinessSolutions 0.9388\n", + "1 shane@mediabridgellc.com RiverCityMedia 0.8277\n", + "2 shane@mediabridgellc.com VerificationsIO 0.7651\n", + "3 shane@mediabridgellc.com Collection1 0.2574\n", + "4 shane@mediabridgellc.com Exactis 0.2571\n", + "5 shane@mediabridgellc.com MySpace 0.1968\n", + "6 shane@mediabridgellc.com AntiPublic 0.1956\n", + "7 shane@mediabridgellc.com SpecialKSpamList 0.1946\n", + "8 shane@mediabridgellc.com OnlinerSpambot 0.1941\n", + "9 shane@mediabridgellc.com ExploitIn 0.1826" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "(\n", + " calculate_breach_rates(\n", + " sample_17_108_mb,\n", + " breaches_17_108_mb,\n", + " )\n", + " .head(10)\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Comparing MBS breach status to Docket 16-42 overlap" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
isin_mbsisin_16
3236533TrueFalse
2723370TrueFalse
56337TrueFalse
592618TrueFalse
1014169TrueFalse
\n", + "
" + ], + "text/plain": [ + " isin_mbs isin_16\n", + "3236533 True False\n", + "2723370 True False\n", + "56337 True False\n", + "592618 True False\n", + "1014169 True False" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sample_17_108_mb_comparison = (\n", + " sample_17_108_mb \n", + " .assign(\n", + " isin_mbs = lambda df: (\n", + " df\n", + " [\"email_address\"].isin(\n", + " breaches_17_108_mb\n", + " .loc[lambda df: df[\"breach\"] == \"ModernBusinessSolutions\"]\n", + " [\"email_address\"]\n", + " )\n", + " ),\n", + " isin_16 = lambda df: (\n", + " df\n", + " [\"name_and_location\"]\n", + " .isin(\n", + " comments_16_42_classified\n", + " .loc[lambda df: df[\"group\"] == \"American Commitment\"]\n", + " [\"name_and_location\"]\n", + " )\n", + " )\n", + " )\n", + " [[\n", + " \"isin_mbs\",\n", + " \"isin_16\",\n", + " ]]\n", + ")\n", + "\n", + "sample_17_108_mb_comparison.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Matrix of the 10,000-comment sample, by whether the email address exists in the Modern Business Solutions breach and whether the exact contact information shows up in the Docket 16-42 comments that used American Commitment's language:" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
isin_16FalseTrue
isin_mbs
False35577
True9287101
\n", + "
" + ], + "text/plain": [ + "isin_16 False True \n", + "isin_mbs \n", + "False 35 577\n", + "True 9287 101" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "(\n", + " sample_17_108_mb_comparison\n", + " .groupby([\n", + " \"isin_mbs\",\n", + " \"isin_16\",\n", + " ])\n", + " .size()\n", + " .unstack()\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Among comments whose email addresses do *not* appear in MBS, this is the proportion that use exactly the same contact information as in the Docket 16-42 comments using American Commitment's language:" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.9428" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "(\n", + " sample_17_108_mb_comparison\n", + " .loc[lambda df: df[\"isin_mbs\"] == False]\n", + " [\"isin_16\"]\n", + " .mean()\n", + " .round(4)\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Among comments whose email addresses use exactly the same contact information as in the Docket 16-42 comments using American Commitment's language, this is the proportion of email addresses that appear in MBS:" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.149" + ] + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "(\n", + " sample_17_108_mb_comparison\n", + " .loc[lambda df: df[\"isin_16\"] == True]\n", + " [\"isin_mbs\"]\n", + " .mean()\n", + " .round(4)\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This is the proportion of comments that *either* are attributed to email addresses that appear in the Modern Business Solutions breach *or* use exactly the same contact 
information as in the Docket 16-42 comments using American Commitment's language:" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.9965" + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "(\n", + " sample_17_108_mb_comparison\n", + " [[\n", + " \"isin_mbs\",\n", + " \"isin_16\",\n", + " ]]\n", + " .sum(axis = 1)\n", + " .pipe(lambda x: x > 0)\n", + " .mean()\n", + " .round(4)\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "\n", + "---\n", + "\n", + "---" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.4" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "state": { + "04f19153d45345bea122f0226fd113c0": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_838c964bbd7f4c8d9f2c6c74101b7910", + "IPY_MODEL_559e9ca4781549da93712e2012813f5e" + ], + "layout": "IPY_MODEL_337a0b4574d94a47ab32d58f8a7a5c61" + } + }, + "0b009166dfde4b37950925f2a90cb56f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "IntProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + 
"_model_module_version": "1.5.0", + "_model_name": "IntProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_57f0be729dfb476c82ef0deb5203147f", + "max": 10000, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_0d53cbe8cfc742329b594e489cce0177", + "value": 10000 + } + }, + "0d53cbe8cfc742329b594e489cce0177": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "0d59382dc9a741a0877f290ffae90364": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "23b58dd86771493a96c5a267be77f946": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + 
"flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "285330ed3c324f5d912879140bfa7e4e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_8468b9602d9f45eb8fac9eafaf2cc596", + "IPY_MODEL_bdf6f69aeeb145dd8ce850b2624b1ca7" + ], + "layout": "IPY_MODEL_6af89b4e9c64425593860aecdde16787" + } + }, + "2a49898891a14de69657f79c88ec06a0": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_31d73c99639d4336a842a3711be1de7b", + "IPY_MODEL_bc97f74081834de09453bb2fbc424e53" + ], + "layout": "IPY_MODEL_2e816c2f00c241599adbc09f90286f2f" + } + }, + 
"2cf0ec36c28146b7b7911d8115fd9c08": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "2e816c2f00c241599adbc09f90286f2f": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": 
null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "31d73c99639d4336a842a3711be1de7b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "IntProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "IntProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_afa3bc7dc3ee45758b0d9edbae514f5a", + "max": 8000, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_96183dd411c941fe95ee9fdf27b53e34", + "value": 8000 + } + }, + "337a0b4574d94a47ab32d58f8a7a5c61": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + 
"grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "3f112798212e438c835992489e901c0c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_23b58dd86771493a96c5a267be77f946", + "placeholder": "​", + "style": "IPY_MODEL_44767ed751bf416e972ed642a3a5244c", + "value": "100% 10000/10000 [5:22:12<00:00, 1.93s/it]" + } + }, + "44767ed751bf416e972ed642a3a5244c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "559e9ca4781549da93712e2012813f5e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": 
"@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_87ae1a7a3c9b48f79f7a1cb1652f613c", + "placeholder": "​", + "style": "IPY_MODEL_c2a3af8a55c14d5fac09fd5d3670b9d6", + "value": "100% 2000/2000 [00:11<00:00, 169.13it/s]" + } + }, + "57f0be729dfb476c82ef0deb5203147f": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "6af89b4e9c64425593860aecdde16787": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + 
"align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "77429d4856194b3083b80bf4479caea2": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": 
null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "7c6c63382e934b509d42b2f9c6f14540": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "805573d19962489a9ae73b224640c32d": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "838c964bbd7f4c8d9f2c6c74101b7910": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "IntProgressModel", + "state": { + "_dom_classes": [], + 
"_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "IntProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_edc354a8cb6b4c93ab80301bbbebffa6", + "max": 2000, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_bf299a0f514048aa9b25df89637fa3bb", + "value": 2000 + } + }, + "8468b9602d9f45eb8fac9eafaf2cc596": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "IntProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "IntProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_9163fc6bd9c3420bb8bc911c78eb317c", + "max": 10000, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_e8c99f52be4b41e2b9b0421d46941707", + "value": 10000 + } + }, + "87ae1a7a3c9b48f79f7a1cb1652f613c": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + 
"grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "8cb5cf5fa780495ca62e9dd284a39539": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "9163fc6bd9c3420bb8bc911c78eb317c": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + 
"_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "96183dd411c941fe95ee9fdf27b53e34": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "af021a898708423b88f8418f69fad55e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_0b009166dfde4b37950925f2a90cb56f", + 
"IPY_MODEL_3f112798212e438c835992489e901c0c" + ], + "layout": "IPY_MODEL_fd06e30d7c514c66a23889f9cee4c7f6" + } + }, + "afa3bc7dc3ee45758b0d9edbae514f5a": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "b39529fe3149414a8643320754f56f5f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_b7bd33b176c04f8fb4dc339a4d7ba001", + "IPY_MODEL_ea365edc947e4ffaafe4ade2c060d3a8" + ], + "layout": "IPY_MODEL_d44549eeeb85491fb80605c67ad820a4" + } + }, + 
"b7bd33b176c04f8fb4dc339a4d7ba001": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "IntProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "IntProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_77429d4856194b3083b80bf4479caea2", + "max": 10000, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_0d59382dc9a741a0877f290ffae90364", + "value": 10000 + } + }, + "bc97f74081834de09453bb2fbc424e53": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_2cf0ec36c28146b7b7911d8115fd9c08", + "placeholder": "​", + "style": "IPY_MODEL_7c6c63382e934b509d42b2f9c6f14540", + "value": "100% 8000/8000 [00:48<00:00, 163.55it/s]" + } + }, + "bdf6f69aeeb145dd8ce850b2624b1ca7": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_805573d19962489a9ae73b224640c32d", + "placeholder": "​", + "style": 
"IPY_MODEL_f4dc764a214645daad6e3a8bc0ca5db3", + "value": "100% 10000/10000 [00:58<00:00, 169.89it/s]" + } + }, + "bf299a0f514048aa9b25df89637fa3bb": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "c2a3af8a55c14d5fac09fd5d3670b9d6": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "d44549eeeb85491fb80605c67ad820a4": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + 
"max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e8c99f52be4b41e2b9b0421d46941707": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "ea365edc947e4ffaafe4ade2c060d3a8": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_8cb5cf5fa780495ca62e9dd284a39539", + "placeholder": "​", + "style": "IPY_MODEL_eda854ec06cc44fea21cab1c9b6e0e16", + "value": "100% 10000/10000 [00:59<00:00, 168.73it/s]" + } + }, + "eda854ec06cc44fea21cab1c9b6e0e16": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "edc354a8cb6b4c93ab80301bbbebffa6": { + 
"model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "f4dc764a214645daad6e3a8bc0ca5db3": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "fd06e30d7c514c66a23889f9cee4c7f6": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": 
"@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + } + }, + "version_major": 2, + "version_minor": 0 + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/analyze-mb-comment-structure.ipynb b/notebooks/analyze-mb-comment-structure.ipynb new file mode 100644 index 0000000..99aaf69 --- /dev/null +++ b/notebooks/analyze-mb-comment-structure.ipynb @@ -0,0 +1,2267 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Analyzing the structure of Media Bridge–submitted comments\n", + "\n", + "This notebook analyzes the comments uploaded by Media Bridge to FCC Docket 17-108, with a focus on understanding the structure behind the algorithmically-generated ones." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Load the comments" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import re\n", + "import json\n", + "import math\n", + "from functools import reduce" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Media Bridge uploaded 1.9 million comments in total:" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1856553" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "mb_comments = (\n", + " pd.read_csv(\n", + " \"../data/bulk-uploads-17-108-with-uuids.csv\",\n", + " usecols = [ \"uploader\", \"comments\", \"email_address\" ],\n", + " dtype = str,\n", + " )\n", + " .loc[lambda df: df[\"uploader\"] == \"shane@mediabridgellc.com\"]\n", + " .assign(\n", + " comments = lambda df: df[\"comments\"].str.replace(u\"\\xa0\", \" \")\n", + " )\n", + ")\n", + "\n", + "len(mb_comments)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Some, however, are duplicates. There are 1.5 million unique comments, where uniqueness is defined as the combination of the comment text and the email address associated with the comment:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1501759" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "mb_deduped = (\n", + " mb_comments\n", + " .drop_duplicates(subset = [ \"comments\", \"email_address\" ])\n", + ")\n", + "\n", + "len(mb_deduped)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Separate randomized vs. 
non-randomized comments\n", + "\n", + "About 472,000 of the comments have no internal randomization; they come from one of five pre-written variations. (One of those five has two sub-variations that differ only in formatting; as a result, there are six strings listed below.)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "non_randomized = [\n", + " \"The Title II order created a gaping gap in privacy protections by taking the best cop, the FTC, off the beat. That is reason enough to support Chairman Pai's proposal to restore Internet freedom. Restore privacy by repealing Net Neutrality.\",\n", + " \"Title II is a Depression-era regulatory framework designed for a telephone monopoly that no longer exists. It was wrong to apply it to the Internet and the FCC should repeal it and go back to the free-market approach that worked so well.\",\n", + " \"The free-market Internet was an incredible engine of economic growth, innovation, and job creation since the 1990s and has already been substantially slowed by the 2015 Net Neutrality rules. The slowdown in investment is destroying jobs and risks a big future tax hike to make up for lost private investment. Save American jobs by repealing Net Neutrality.\",\n", + " \"The FCC's Net Neutrality rules were written in the Obama White House by political staff and Tech Industry special interests who overruled the FCC's own experts. The FCC's own chief economist Tim Brennan called the rules \\\"an economics-free zone.\\\" They should be repealed.\",\n", + " \"Obama's Net Neutrality order was the corrupt result of a corrupt process controlled by Silicon Valley special interests. It gives some of the biggest companies in the world a free ride at the expense of consumers and should be immediately repealed!\",\n", + " ' \"Obama\\'s Net Neutrality order was the corrupt result of a corrupt process controlled by Silicon Valley special interests. 
It gives some of the biggest companies in the world a free ride at the expense of consumers and should be immediately repealed!\"',\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "471677" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "mb_deduped_nonrandom = (\n", + " mb_deduped\n", + " .loc[lambda df: df[\"comments\"].isin(non_randomized)]\n", + ")\n", + "\n", + "len(mb_deduped_nonrandom)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
count
Title II is a Depression-era regulatory framework designed for a telephone monopoly that no longer exists. It was wrong to apply it to the Internet and the FCC should repeal it and go back to the free-market approach that worked so well.127501
The Title II order created a gaping gap in privacy protections by taking the best cop, the FTC, off the beat. That is reason enough to support Chairman Pai's proposal to restore Internet freedom. Restore privacy by repealing Net Neutrality.92884
The free-market Internet was an incredible engine of economic growth, innovation, and job creation since the 1990s and has already been substantially slowed by the 2015 Net Neutrality rules. The slowdown in investment is destroying jobs and risks a big future tax hike to make up for lost private investment. Save American jobs by repealing Net Neutrality.83072
Obama's Net Neutrality order was the corrupt result of a corrupt process controlled by Silicon Valley special interests. It gives some of the biggest companies in the world a free ride at the expense of consumers and should be immediately repealed!74809
The FCC's Net Neutrality rules were written in the Obama White House by political staff and Tech Industry special interests who overruled the FCC's own experts. The FCC's own chief economist Tim Brennan called the rules \"an economics-free zone.\" They should be repealed.62635
\"Obama's Net Neutrality order was the corrupt result of a corrupt process controlled by Silicon Valley special interests. It gives some of the biggest companies in the world a free ride at the expense of consumers and should be immediately repealed!\"30776
\n", + "
" + ], + "text/plain": [ + " count\n", + "Title II is a Depression-era regulatory framewo... 127501\n", + "The Title II order created a gaping gap in priv... 92884\n", + "The free-market Internet was an incredible engi... 83072\n", + "Obama's Net Neutrality order was the corrupt re... 74809\n", + "The FCC's Net Neutrality rules were written in ... 62635\n", + " \"Obama's Net Neutrality order was the corrupt ... 30776" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "(\n", + " mb_deduped_nonrandom\n", + " [\"comments\"]\n", + " .value_counts()\n", + " .to_frame(\"count\")\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The remaining 1 million comments are, at least on their surface, unique: No two are exactly the same." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1030082" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "mb_deduped_random = (\n", + " mb_deduped\n", + " .loc[lambda df: ~df[\"comments\"].isin(non_randomized)]\n", + ")\n", + "\n", + "len(mb_deduped_random)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "# If two or more comments were the same, this cell would throw an error\n", + "assert mb_deduped_random[\"comments\"].value_counts().max() == 1" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Examples:" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Dear Chairman Pai, I would like to comment on Internet regulation. I strongly recommend Chairman Pai to repeal Obama's scheme to regulate the web. Americans, as opposed to Washington bureaucrats, should purchase the products they prefer. 
Obama's scheme to regulate the web is a betrayal of the open Internet. It stopped a free-market system that functioned supremely well for decades with broad bipartisan backing.\n", + "\n", + "To the Federal Communications Commission: I'm concerned about network neutrality regulations. I'd like to request the government to undo The previous administration's order to control the web. Individual citizens, not the FCC, should enjoy whatever products they desire. The previous administration's order to control the web is a exploitation of net neutrality. It broke a market-based framework that functioned remarkably smoothly for many years with nearly universal backing.\n", + "\n", + "Chairman Pai: My comments re: regulations on the Internet. I'd like to suggest Ajit Pai to rescind Obama's scheme to take over the Internet. Internet users, rather than the FCC, should be free to purchase the products they choose. Obama's scheme to take over the Internet is a corruption of the open Internet. It stopped a free-market system that functioned very, very smoothly for decades with both parties' approval.\n" + ] + } + ], + "source": [ + "print(\"\\n\\n\".join(\n", + " mb_deduped_random\n", + " [\"comments\"]\n", + " .sample(3, random_state = 0)\n", + "))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Reverse-engineer the structure of the randomized comments\n", + "\n", + "The following code represents BuzzFeed News' best estimate of how the randomized comments were generated.\n", + "\n", + "Each sub-list contains the possible variations, which appear to be selected (with equal weighting) at random. Sub-lists with only one item are \"fixed\"; they don't change from comment to comment.\n", + "\n", + "One exception is a repeated phrase at the beginning of the fourth sentence of each comment; it repeats whatever happens to have been randomly selected in a particular part of the second sentence. More details on that below." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "segments = [\n", + " [\n", + " \"To whom it may concern: \",\n", + " \"To the Federal Communications Commission: \",\n", + " \"FCC: \",\n", + " \"To the FCC: \",\n", + " \"Dear Commissioners: \",\n", + " \"Dear Mr. Pai, \",\n", + " \"Dear Chairman Pai, \",\n", + " \"Dear FCC, \",\n", + " \"Mr Pai: \",\n", + " \"FCC commissioners, \",\n", + " \"Chairman Pai: \",\n", + " \"\",\n", + " ],\n", + "\n", + " [\n", + " \"I'm concerned about\",\n", + " \"I am concerned about\",\n", + " \"I have concerns about\",\n", + " \"I'm very concerned about\",\n", + " \"I'd like to share my thoughts on\",\n", + " \"Hi, I'd like to comment on\",\n", + " \"I would like to comment on\",\n", + " \"I want to give my opinion on\",\n", + " \"I have thoughts on\",\n", + " \"I'm contacting you about\",\n", + " \"I'm very worried about\",\n", + " \"My comments re:\",\n", + " \"In reference to\",\n", + " \"I am a voter worried about\",\n", + " \"I'm a voter worried about\",\n", + " \"Regarding\",\n", + " \"With respect to\",\n", + " \"In the matter of\",\n", + " ],\n", + "\n", + " [ \" \" ],\n", + " \n", + " [\n", + " \"the FCC's so-called Open Internet order\",\n", + " \"Internet regulation and net neutrality\",\n", + " \"the Obama takeover of the Internet\", \n", + " \"the FCC regulations on the Internet\",\n", + " \"network neutrality regulations\",\n", + " \"the FCC's Open Internet order\",\n", + " \"the FCC rules on the Internet\",\n", + " \"net neutrality and Title II\",\n", + " \"Net Neutrality and Title II\",\n", + " \"regulations on the Internet\",\n", + " \"restoring Internet freedom\",\n", + " \"net neutrality regulations\",\n", + " \"Title 2 and net neutrality\",\n", + " \"the future of the Internet\",\n", + " \"the Open Internet order\",\n", + " \"internet regulations\",\n", + " \"net neutrality rules\",\n", + " \"Internet regulation\",\n", + " \"Network Neutrality\",\n", 
+ " \"an open Internet\",\n", + " \"Internet freedom\",\n", + " \"Internet Freedom\",\n", + " \"Net neutrality\",\n", + " \"net neutrality\",\n", + " \"NET NEUTRALITY\",\n", + " \"Title II rules\",\n", + " ],\n", + " \n", + " [ \". I\" ],\n", + "\n", + " [\n", + " \"'d like to\",\n", + " \" would like to\",\n", + " \" want to\",\n", + " \" strongly\",\n", + " \"\",\n", + " ],\n", + " \n", + " [\n", + " \" \"\n", + " ],\n", + " \n", + " [\n", + " \"implore\",\n", + " \"ask\",\n", + " \"request\",\n", + " \"urge\",\n", + " \"encourage\",\n", + " \"recommend\",\n", + " \"suggest\",\n", + " \"demand\",\n", + " \"advocate\",\n", + " ],\n", + " \n", + " [ \" \" ],\n", + "\n", + " [\n", + " \"you\",\n", + " \"the FCC\",\n", + " \"the Federal Communications Commission\",\n", + " \"the commissioners\",\n", + " \"the commission\",\n", + " \"Chairman Pai\",\n", + " \"Ajit Pai\",\n", + " \"the government\"\n", + " ],\n", + " \n", + " [ \" to \" ],\n", + " \n", + " [\n", + " \"undo\",\n", + " \"reverse\",\n", + " \"repeal\",\n", + " \"overturn\",\n", + " \"rescind\",\n", + " ],\n", + "\n", + " [ \" \" ],\n", + " \n", + " [\n", + " \"The previous administration's\",\n", + " \"The Obama/Wheeler\",\n", + " \"President Obama's\",\n", + " \"Barack Obama's\",\n", + " \"Tom Wheeler's\",\n", + " \"Obama's\",\n", + " ],\n", + "\n", + " [ \" \" ],\n", + " \n", + " [\n", + " \"decision\",\n", + " \"scheme\",\n", + " \"policy\",\n", + " \"order\",\n", + " \"power grab\",\n", + " \"plan\",\n", + " ],\n", + " \n", + " [ \" to \" ],\n", + " \n", + " [\n", + " \"regulate\",\n", + " \"control\",\n", + " \"take over\",\n", + " ],\n", + "\n", + " [ \" \" ],\n", + "\n", + " \n", + " [\n", + " \"broadband\",\n", + " \"the web\",\n", + " \"Internet access\",\n", + " \"the Internet\",\n", + " ],\n", + " \n", + " [ \". 
\" ],\n", + " \n", + " [\n", + " \"Internet users\",\n", + " \"Individual citizens\",\n", + " \"People like me\",\n", + " \"Citizens\",\n", + " \"Individual Americans\",\n", + " \"Americans\",\n", + " \"Individuals\",\n", + " ],\n", + " \n", + " [ \", \" ],\n", + " \n", + " [\n", + " \"rather than\",\n", + " \"as opposed to\",\n", + " \"not\",\n", + " ],\n", + " \n", + " [ \" \" ],\n", + " \n", + " [\n", + " \"Washington bureaucrats\",\n", + " \"Washington\",\n", + " \"big government\",\n", + " \"so-called experts\",\n", + " \"unelected bureaucrats\",\n", + " \"the FCC Enforcement Bureau\",\n", + " \"the FCC\",\n", + " ],\n", + " \n", + " [ \", \" ],\n", + " \n", + " [\n", + " \"should be able to\",\n", + " \"should be empowered to\",\n", + " \"should be free to\",\n", + " \"ought to\",\n", + " \"deserve to\",\n", + " \"should\",\n", + " ],\n", + " \n", + " [\n", + " \" \",\n", + " ],\n", + " \n", + " [\n", + " \"use\",\n", + " \"enjoy\",\n", + " \"purchase\",\n", + " \"buy\",\n", + " \"select\",\n", + " ],\n", + " \n", + " [ \" \" ],\n", + " \n", + " [\n", + " \"the\",\n", + " \"whichever\",\n", + " \"whatever\",\n", + " \"which\",\n", + " ],\n", + " \n", + " [ \" \" ],\n", + " \n", + " [\n", + " \"products\",\n", + " \"applications\",\n", + " \"services\",\n", + " ],\n", + " \n", + " [ \" \" ],\n", + "\n", + " [\n", + " \"they\",\n", + " \"we\",\n", + " ],\n", + " \n", + " [ \" \" ],\n", + " \n", + " [\n", + " \"want\",\n", + " \"desire\",\n", + " \"prefer\",\n", + " \"choose\",\n", + " ],\n", + " \n", + " [ \". 
\" ],\n", + " \n", + " [\n", + " \"The previous administration's\",\n", + " \"The Obama/Wheeler\",\n", + " \"President Obama's\",\n", + " \"Barack Obama's\",\n", + " \"Tom Wheeler's\",\n", + " \"Obama's\",\n", + " ],\n", + "\n", + " [ \" \" ],\n", + " \n", + " [\n", + " \"decision\",\n", + " \"scheme\",\n", + " \"policy\",\n", + " \"order\",\n", + " \"power grab\",\n", + " \"plan\",\n", + " ],\n", + " \n", + " [ \" to \" ],\n", + " \n", + " [\n", + " \"regulate\",\n", + " \"control\",\n", + " \"take over\",\n", + " ],\n", + " \n", + " [ \" \" ],\n", + " \n", + " [\n", + " \"broadband\",\n", + " \"the web\",\n", + " \"Internet access\",\n", + " \"the Internet\",\n", + " ],\n", + " \n", + " [ \" is a \" ],\n", + " \n", + " [\n", + " \"exploitation \",\n", + " \"distortion\",\n", + " \"perversion\",\n", + " \"corruption\",\n", + " \"betrayal\",\n", + " ],\n", + " \n", + " [ \" of \" ],\n", + " \n", + " [\n", + " \"net neutrality\",\n", + " \"the open Internet\",\n", + " ],\n", + " \n", + " [ \". 
It \" ],\n", + " \n", + " [\n", + " \"disrupted\",\n", + " \"undid\",\n", + " \"reversed\",\n", + " \"ended\",\n", + " \"broke\",\n", + " \"stopped\",\n", + " ],\n", + " \n", + " [ \" a \" ],\n", + " \n", + " [\n", + " \"light-touch\",\n", + " \"pro-consumer\",\n", + " \"hands-off\",\n", + " \"free-market\",\n", + " \"market-based\",\n", + " ],\n", + " \n", + " [ \" \" ],\n", + " \n", + " [\n", + " \"policy\",\n", + " \"system\",\n", + " \"approach\",\n", + " \"framework\",\n", + " ],\n", + " \n", + " [ \" that \" ],\n", + " \n", + " [\n", + " \"functioned\",\n", + " \"performed\",\n", + " \"worked\",\n", + " ],\n", + " \n", + " [ \" \" ],\n", + " \n", + " [\n", + " \"supremely\",\n", + " \"very, very\",\n", + " \"very\",\n", + " \"remarkably\",\n", + " \"fabulously\",\n", + " \"exceptionally\",\n", + " ],\n", + " \n", + " [ \" \" ],\n", + " \n", + " [\n", + " \"well\",\n", + " \"successfully\",\n", + " \"smoothly\",\n", + " ],\n", + " \n", + " [ \" for \" ],\n", + " \n", + " [\n", + " \"many years\",\n", + " \"decades\",\n", + " \"a long time\",\n", + " \"two decades\",\n", + " ],\n", + " \n", + " [ \" with \" ],\n", + " \n", + " [\n", + " \"nearly universal\",\n", + " \"broad bipartisan\",\n", + " \"bipartisan\",\n", + " \"both parties'\",\n", + " \"Republican and Democrat\",\n", + " ],\n", + "\n", + " [ \" \" ],\n", + " \n", + " [\n", + " \"support\",\n", + " \"consensus\",\n", + " \"approval\",\n", + " \"backing\",\n", + " ],\n", + " \n", + " [ \".\" ]\n", + " \n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Check that pattern fully matches comments\n", + "\n", + "Here, we compile the comment segments into a single regular expression, which we use to check whether comments match the reverse-engineered model." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "def segments_to_pattern(segments):\n", + " return re.compile(r\"^\" + r\"\".join(\n", + " r\"(\" + r\"|\".join(re.escape(option) for option in seg) + r\")\"\n", + " for seg in segments) + r\"$\")" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "pattern = segments_to_pattern(segments)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "All comments match (otherwise, the result would be greater than zero):" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sum(re.match(pattern, x) is None for x in mb_deduped_random[\"comments\"].values)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Check that there are no superfluous permutations\n", + "\n", + "Although the model above succeeds in matching all comments, so would a model that contained, for example, the entire English language. So here we check whether any individual part of the pattern is superfluous, by incrementally removing each one, and seeing whether the comments still match the pattern. 
(Here we use a random sample of comments, to speed up the process.)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "sample_comments = (\n", + " mb_deduped_random\n", + " [\"comments\"]\n", + " .sample(1000, random_state = 0)\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "# A lack of output for this cell is a good thing;\n", + "# it means no part of the model is superfluous\n", + "\n", + "for i, segment in enumerate(segments):\n", + " # For each sub-part of each segment ...\n", + " for j, option in enumerate(segment):\n", + " \n", + " # Replace the sub-part with \"###\", and then test\n", + " # whether the pattern-matching fails. It should fail;\n", + " # if it does not, then the sub-part is superfluous.\n", + " segments_copy = list([ list(o) for o in segments ])\n", + " segments_copy[i][j] = \"###\"\n", + " new_pattern = segments_to_pattern(segments_copy)\n", + " \n", + " num_nonmatching_comments = sum((re.match(new_pattern, x) is None)\n", + " for x in sample_comments.values)\n", + " \n", + " # If all of the comments still match after the \"###\" \n", + " # substitution, then the replaced sub-part isn't necessary\n", + " # to the model.\n", + " if num_nonmatching_comments == 0:\n", + " print(i, j, option)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Check that segments are randomized independently\n", + "\n", + "In some text-generation models, the value of one segment may influence the possible values (or weights for those values) of subsequent segments. Here, we check whether that appears to be true for the actual model that generated these comments.\n", + "\n", + "First, we extract the bits of text that each comment has used for each section, skipping the \"fixed\" segments. 
(Here again we use a random sample of comments, to speed things up.)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[2, 4, 6]" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "FIXED_SEGMENT_INDEX = [ i for i, x in enumerate(segments) if len(x) == 1 ]\n", + "FIXED_SEGMENT_INDEX[:3]" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "def extract_permutations(comment):\n", + " permutations = [ (i, g) for i, g in enumerate(re.match(pattern, comment).groups())\n", + " if i not in FIXED_SEGMENT_INDEX ]\n", + " \n", + " return pd.DataFrame(\n", + " permutations,\n", + " columns = [ \"seg_i\", \"option\" ], \n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Example, for the first comment in the sample:" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
seg_ioption
00Dear Chairman Pai,
11I would like to comment on
23Internet regulation
35strongly
47recommend
59Chairman Pai
611repeal
713Obama's
815scheme
917regulate
1019the web
1121Americans
1223as opposed to
1325Washington bureaucrats
1427should
1529purchase
1631the
1733products
1835they
1937prefer
2039Obama's
2141scheme
2243regulate
2345the web
2447betrayal
2549the open Internet
2651stopped
2753free-market
2855system
2957functioned
3059supremely
3161well
3263decades
3365broad bipartisan
3467backing
\n", + "
" + ], + "text/plain": [ + " seg_i option\n", + "0 0 Dear Chairman Pai, \n", + "1 1 I would like to comment on\n", + "2 3 Internet regulation\n", + "3 5 strongly\n", + "4 7 recommend\n", + "5 9 Chairman Pai\n", + "6 11 repeal\n", + "7 13 Obama's\n", + "8 15 scheme\n", + "9 17 regulate\n", + "10 19 the web\n", + "11 21 Americans\n", + "12 23 as opposed to\n", + "13 25 Washington bureaucrats\n", + "14 27 should\n", + "15 29 purchase\n", + "16 31 the\n", + "17 33 products\n", + "18 35 they\n", + "19 37 prefer\n", + "20 39 Obama's\n", + "21 41 scheme\n", + "22 43 regulate\n", + "23 45 the web\n", + "24 47 betrayal\n", + "25 49 the open Internet\n", + "26 51 stopped\n", + "27 53 free-market\n", + "28 55 system\n", + "29 57 functioned\n", + "30 59 supremely\n", + "31 61 well\n", + "32 63 decades\n", + "33 65 broad bipartisan\n", + "34 67 backing" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "extract_permutations(sample_comments.iloc[0])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here, we create a DataFrame of all extracted segments:" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
seg_i01357911131517...49515355575961636567
comment_i
0Dear Chairman Pai,I would like to comment onInternet regulationstronglyrecommendChairman PairepealObama'sschemeregulate...the open Internetstoppedfree-marketsystemfunctionedsupremelywelldecadesbroad bipartisanbacking
1To the Federal Communications Commission:I'm concerned aboutnetwork neutrality regulations'd like torequestthe governmentundoThe previous administration'sordercontrol...net neutralitybrokemarket-basedframeworkfunctionedremarkablysmoothlymany yearsnearly universalbacking
2Chairman Pai:My comments re:regulations on the Internet'd like tosuggestAjit PairescindObama'sschemetake over...the open Internetstoppedfree-marketsystemfunctionedvery, verysmoothlydecadesboth parties'approval
3Dear Mr. Pai,Hi, I'd like to comment onthe FCC rules on the InternetaskAjit PaireverseThe Obama/Wheelerschemeregulate...the open Internetreversedhands-offpolicyfunctionedremarkablysmoothlymany yearsRepublican and Democratconsensus
4Mr Pai:I'm contacting you aboutthe FCC's Open Internet orderrequestthe FCCrepealThe Obama/Wheelerplantake over...the open Internetreversedlight-touchsystemperformedvery, verysmoothlymany yearsRepublican and Democratbacking
\n", + "

5 rows × 35 columns

\n", + "
" + ], + "text/plain": [ + "seg_i 0 \\\n", + "comment_i \n", + "0 Dear Chairman Pai, \n", + "1 To the Federal Communications Commission: \n", + "2 Chairman Pai: \n", + "3 Dear Mr. Pai, \n", + "4 Mr Pai: \n", + "\n", + "seg_i 1 3 \\\n", + "comment_i \n", + "0 I would like to comment on Internet regulation \n", + "1 I'm concerned about network neutrality regulations \n", + "2 My comments re: regulations on the Internet \n", + "3 Hi, I'd like to comment on the FCC rules on the Internet \n", + "4 I'm contacting you about the FCC's Open Internet order \n", + "\n", + "seg_i 5 7 9 11 \\\n", + "comment_i \n", + "0 strongly recommend Chairman Pai repeal \n", + "1 'd like to request the government undo \n", + "2 'd like to suggest Ajit Pai rescind \n", + "3 ask Ajit Pai reverse \n", + "4 request the FCC repeal \n", + "\n", + "seg_i 13 15 17 ... \\\n", + "comment_i ... \n", + "0 Obama's scheme regulate ... \n", + "1 The previous administration's order control ... \n", + "2 Obama's scheme take over ... \n", + "3 The Obama/Wheeler scheme regulate ... \n", + "4 The Obama/Wheeler plan take over ... 
\n", + "\n", + "seg_i 49 51 53 55 57 \\\n", + "comment_i \n", + "0 the open Internet stopped free-market system functioned \n", + "1 net neutrality broke market-based framework functioned \n", + "2 the open Internet stopped free-market system functioned \n", + "3 the open Internet reversed hands-off policy functioned \n", + "4 the open Internet reversed light-touch system performed \n", + "\n", + "seg_i 59 61 63 65 \\\n", + "comment_i \n", + "0 supremely well decades broad bipartisan \n", + "1 remarkably smoothly many years nearly universal \n", + "2 very, very smoothly decades both parties' \n", + "3 remarkably smoothly many years Republican and Democrat \n", + "4 very, very smoothly many years Republican and Democrat \n", + "\n", + "seg_i 67 \n", + "comment_i \n", + "0 backing \n", + "1 backing \n", + "2 approval \n", + "3 consensus \n", + "4 backing \n", + "\n", + "[5 rows x 35 columns]" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "extracted = (\n", + " pd.concat([ extract_permutations(x).assign(comment_i = i)\n", + " for i, x in enumerate(sample_comments) ])\n", + " .set_index([\n", + " \"comment_i\",\n", + " \"seg_i\",\n", + " ])\n", + " [\"option\"]\n", + " .unstack()\n", + ")\n", + "\n", + "extracted.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To test for the independence of randomization, we calculate the correlation between any two segments in a comment:" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
seg_aseg_bcorrseg_int_aseg_int_b
25080_1_Hi, I'd like to comment on0.03564001
25090_Chairman Pai:1_Hi, I'd like to comment on-0.04352301
25100_Dear Chairman Pai,1_Hi, I'd like to comment on-0.03041901
25110_Dear Commissioners:1_Hi, I'd like to comment on-0.04062401
25120_Dear FCC,1_Hi, I'd like to comment on-0.00550801
\n", + "
" + ], + "text/plain": [ + " seg_a seg_b corr \\\n", + "2508 0_ 1_Hi, I'd like to comment on 0.035640 \n", + "2509 0_Chairman Pai: 1_Hi, I'd like to comment on -0.043523 \n", + "2510 0_Dear Chairman Pai, 1_Hi, I'd like to comment on -0.030419 \n", + "2511 0_Dear Commissioners: 1_Hi, I'd like to comment on -0.040624 \n", + "2512 0_Dear FCC, 1_Hi, I'd like to comment on -0.005508 \n", + "\n", + " seg_int_a seg_int_b \n", + "2508 0 1 \n", + "2509 0 1 \n", + "2510 0 1 \n", + "2511 0 1 \n", + "2512 0 1 " + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "segment_correlations = (\n", + " # Turn each permutation into a dummy variable\n", + " extracted\n", + " .pipe(pd.get_dummies)\n", + " \n", + " # Calculate the correlations between them\n", + " .corr()\n", + " .reset_index()\n", + " .rename(columns = { \"index\": \"seg_a\" })\n", + " \n", + " # Melt the correlation matrix into a long/tidy DataFrame\n", + " .melt(\n", + " id_vars = [ \"seg_a\" ],\n", + " var_name = \"seg_b\",\n", + " value_name = \"corr\",\n", + " )\n", + " .assign(\n", + " seg_int_a = lambda df: df[\"seg_a\"].str.extract(r\"^(\\d+)\", expand = False).astype(int),\n", + " seg_int_b = lambda df: df[\"seg_b\"].str.extract(r\"^(\\d+)\", expand = False).astype(int),\n", + " )\n", + " \n", + " # Take only the first correlation (A•B instead of both A•B and B•A)\n", + " # and ignore self-correlations\n", + " .loc[lambda df: df[\"seg_a\"] < df[\"seg_b\"]]\n", + " \n", + " # Ignore correlations within the same segment, since they are mutually exclusive\n", + " .loc[lambda df: df[\"seg_int_a\"] != df[\"seg_int_b\"]]\n", + ")\n", + "\n", + "segment_correlations.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The output below demonstrates that there are only a handful of pairs with a correlation above 0.15; they are all perfect correlations, meaning that the first segment choice guarantees the second. 
In this case, whatever is chosen for segments `13-19` is repeated for segments `39-45`. (Segments 14, 16, etc. are all fixed segments, and don't vary at all.)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
seg_aseg_bcorrseg_int_aseg_int_b
2997013_Barack Obama's39_Barack Obama's1.01339
3018013_Obama's39_Obama's1.01339
3039013_President Obama's39_President Obama's1.01339
3060013_The Obama/Wheeler39_The Obama/Wheeler1.01339
3081013_The previous administration's39_The previous administration's1.01339
3102013_Tom Wheeler's39_Tom Wheeler's1.01339
3123015_decision41_decision1.01541
3144015_order41_order1.01541
3165015_plan41_plan1.01541
3186015_policy41_policy1.01541
3207015_power grab41_power grab1.01541
3228015_scheme41_scheme1.01541
3249017_control43_control1.01743
3270017_regulate43_regulate1.01743
3291017_take over43_take over1.01743
3312019_Internet access45_Internet access1.01945
3333019_broadband45_broadband1.01945
3354019_the Internet45_the Internet1.01945
3375019_the web45_the web1.01945
\n", + "
" + ], + "text/plain": [ + " seg_a seg_b \\\n", + "29970 13_Barack Obama's 39_Barack Obama's \n", + "30180 13_Obama's 39_Obama's \n", + "30390 13_President Obama's 39_President Obama's \n", + "30600 13_The Obama/Wheeler 39_The Obama/Wheeler \n", + "30810 13_The previous administration's 39_The previous administration's \n", + "31020 13_Tom Wheeler's 39_Tom Wheeler's \n", + "31230 15_decision 41_decision \n", + "31440 15_order 41_order \n", + "31650 15_plan 41_plan \n", + "31860 15_policy 41_policy \n", + "32070 15_power grab 41_power grab \n", + "32280 15_scheme 41_scheme \n", + "32490 17_control 43_control \n", + "32700 17_regulate 43_regulate \n", + "32910 17_take over 43_take over \n", + "33120 19_Internet access 45_Internet access \n", + "33330 19_broadband 45_broadband \n", + "33540 19_the Internet 45_the Internet \n", + "33750 19_the web 45_the web \n", + "\n", + " corr seg_int_a seg_int_b \n", + "29970 1.0 13 39 \n", + "30180 1.0 13 39 \n", + "30390 1.0 13 39 \n", + "30600 1.0 13 39 \n", + "30810 1.0 13 39 \n", + "31020 1.0 13 39 \n", + "31230 1.0 15 41 \n", + "31440 1.0 15 41 \n", + "31650 1.0 15 41 \n", + "31860 1.0 15 41 \n", + "32070 1.0 15 41 \n", + "32280 1.0 15 41 \n", + "32490 1.0 17 43 \n", + "32700 1.0 17 43 \n", + "32910 1.0 17 43 \n", + "33120 1.0 19 45 \n", + "33330 1.0 19 45 \n", + "33540 1.0 19 45 \n", + "33750 1.0 19 45 " + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "(\n", + " segment_correlations\n", + " .loc[lambda df: df[\"corr\"] > 0.15]\n", + " .sort_values(\"seg_a\")\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The output below demonstrates that no segment pairs with a correlation below -0.15, other than the possibilities inherently excluded by the perfect correlations above." + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
seg_aseg_bcorrseg_int_aseg_int_b
3101513_Barack Obama's39_Tom Wheeler's-0.2033661339
3080613_Barack Obama's39_The previous administration's-0.1917201339
3017913_Barack Obama's39_Obama's-0.2040861339
3059713_Barack Obama's39_The Obama/Wheeler-0.2040861339
3038813_Barack Obama's39_President Obama's-0.2004771339
..................
3333119_the Internet45_broadband-0.3277811945
3374919_the Internet45_the web-0.3372281945
3312319_the web45_Internet access-0.3381601945
3333219_the web45_broadband-0.3558641945
3354119_the web45_the Internet-0.3372281945
\n", + "

78 rows × 5 columns

\n", + "
" + ], + "text/plain": [ + " seg_a seg_b corr \\\n", + "31015 13_Barack Obama's 39_Tom Wheeler's -0.203366 \n", + "30806 13_Barack Obama's 39_The previous administration's -0.191720 \n", + "30179 13_Barack Obama's 39_Obama's -0.204086 \n", + "30597 13_Barack Obama's 39_The Obama/Wheeler -0.204086 \n", + "30388 13_Barack Obama's 39_President Obama's -0.200477 \n", + "... ... ... ... \n", + "33331 19_the Internet 45_broadband -0.327781 \n", + "33749 19_the Internet 45_the web -0.337228 \n", + "33123 19_the web 45_Internet access -0.338160 \n", + "33332 19_the web 45_broadband -0.355864 \n", + "33541 19_the web 45_the Internet -0.337228 \n", + "\n", + " seg_int_a seg_int_b \n", + "31015 13 39 \n", + "30806 13 39 \n", + "30179 13 39 \n", + "30597 13 39 \n", + "30388 13 39 \n", + "... ... ... \n", + "33331 19 45 \n", + "33749 19 45 \n", + "33123 19 45 \n", + "33332 19 45 \n", + "33541 19 45 \n", + "\n", + "[78 rows x 5 columns]" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "(\n", + " segment_correlations\n", + " .loc[lambda df: df[\"corr\"] < -0.15]\n", + " .sort_values(\"seg_a\")\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
seg_aseg_bcorrseg_int_aseg_int_b
\n", + "
" + ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: [seg_a, seg_b, corr, seg_int_a, seg_int_b]\n", + "Index: []" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "(\n", + " segment_correlations\n", + " .loc[lambda df: df[\"corr\"] < -0.15]\n", + " .loc[lambda df: ~df[\"seg_int_a\"].isin([ 13, 15, 17, 19 ])]\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Show the repeated segments" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Segments `13-19`:" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[\n", + " [\n", + " \"The previous administration's\",\n", + " \"The Obama/Wheeler\",\n", + " \"President Obama's\",\n", + " \"Barack Obama's\",\n", + " \"Tom Wheeler's\",\n", + " \"Obama's\"\n", + " ],\n", + " [\n", + " \" \"\n", + " ],\n", + " [\n", + " \"decision\",\n", + " \"scheme\",\n", + " \"policy\",\n", + " \"order\",\n", + " \"power grab\",\n", + " \"plan\"\n", + " ],\n", + " [\n", + " \" to \"\n", + " ],\n", + " [\n", + " \"regulate\",\n", + " \"control\",\n", + " \"take over\"\n", + " ],\n", + " [\n", + " \" \"\n", + " ],\n", + " [\n", + " \"broadband\",\n", + " \"the web\",\n", + " \"Internet access\",\n", + " \"the Internet\"\n", + " ]\n", + "]\n" + ] + } + ], + "source": [ + "print(json.dumps(segments[13:20], indent = 2))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Segments `39-45`:" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[\n", + " [\n", + " \"The previous administration's\",\n", + " \"The Obama/Wheeler\",\n", + " \"President Obama's\",\n", + " \"Barack Obama's\",\n", + " \"Tom Wheeler's\",\n", + " \"Obama's\"\n", + " ],\n", + " [\n", + " \" \"\n", + " ],\n", + " 
[\n", + " \"decision\",\n", + " \"scheme\",\n", + " \"policy\",\n", + " \"order\",\n", + " \"power grab\",\n", + " \"plan\"\n", + " ],\n", + " [\n", + " \" to \"\n", + " ],\n", + " [\n", + " \"regulate\",\n", + " \"control\",\n", + " \"take over\"\n", + " ],\n", + " [\n", + " \" \"\n", + " ],\n", + " [\n", + " \"broadband\",\n", + " \"the web\",\n", + " \"Internet access\",\n", + " \"the Internet\"\n", + " ]\n", + "]\n" + ] + } + ], + "source": [ + "print(json.dumps(segments[39:46], indent = 2))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Calculate possible permutations\n", + "\n", + "Below, we calculate the total possible permutations, with care to exclude the perfectly correlated segments (which we do by simply removing them from the calculation)." + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [], + "source": [ + "def calculate_permutations(segments):\n", + " count = reduce(lambda x, y: x * y, map(len, segments))\n", + " print(f\"Total permutations: {count:,d}\")\n", + " \n", + " log = math.log10(count)\n", + " print(f\"Log10: {log:.2f}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [], + "source": [ + "def remove_segments(segments, indices):\n", + " return [ s for i, s in enumerate(segments) if i not in indices ]" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Total permutations: 9,584,250,725,597,184,000,000\n", + "Log10: 21.98\n" + ] + } + ], + "source": [ + "calculate_permutations(remove_segments(segments, [ 39, 41, 43, 45 ]))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "\n", + "---\n", + "\n", + "---" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": 
"ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}