diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..d6e556f
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,63 @@
+# Custom list:
+.ipynb_checkpoints
+.DS_Store
+
+#### joe made this: http://goel.io/joe
+
+#####=== Python ===#####
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+env/
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.cache
+nosetests.xml
+coverage.xml
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
diff --git a/Pipfile b/Pipfile
new file mode 100644
index 0000000..8778c61
--- /dev/null
+++ b/Pipfile
@@ -0,0 +1,17 @@
+[[source]]
+name = "pypi"
+url = "https://pypi.org/simple"
+verify_ssl = true
+
+[dev-packages]
+
+[packages]
+requests = "*"
+pandas = "*"
+jupyter = "*"
+requests-cache = "*"
+tqdm = "*"
+nbexec = "*"
+
+[requires]
+python_version = "3.6"
diff --git a/Pipfile.lock b/Pipfile.lock
new file mode 100644
index 0000000..3b9bf3c
--- /dev/null
+++ b/Pipfile.lock
@@ -0,0 +1,495 @@
+{
+ "_meta": {
+ "hash": {
+ "sha256": "682395d97dfd62d238e9bd70cf5d6cab49754a43ce3d5acac41efd94b6c1ac6e"
+ },
+ "pipfile-spec": 6,
+ "requires": {
+ "python_version": "3.6"
+ },
+ "sources": [
+ {
+ "name": "pypi",
+ "url": "https://pypi.org/simple",
+ "verify_ssl": true
+ }
+ ]
+ },
+ "default": {
+ "appnope": {
+ "hashes": [
+ "sha256:5b26757dc6f79a3b7dc9fab95359328d5747fcb2409d331ea66d0272b90ab2a0",
+ "sha256:8b995ffe925347a2138d7ac0fe77155e4311a0ea6d6da4f5128fe4b3cbe5ed71"
+ ],
+ "markers": "sys_platform == 'darwin'",
+ "version": "==0.1.0"
+ },
+ "attrs": {
+ "hashes": [
+ "sha256:69c0dbf2ed392de1cb5ec704444b08a5ef81680a61cb899dc08127123af36a79",
+ "sha256:f0b870f674851ecbfbbbd364d6b5cbdff9dcedbc7f3f5e18a6891057f21fe399"
+ ],
+ "version": "==19.1.0"
+ },
+ "backcall": {
+ "hashes": [
+ "sha256:38ecd85be2c1e78f77fd91700c76e14667dc21e2713b63876c0eb901196e01e4",
+ "sha256:bbbf4b1e5cd2bdb08f915895b51081c041bac22394fdfcfdfbe9f14b77c08bf2"
+ ],
+ "version": "==0.1.0"
+ },
+ "bleach": {
+ "hashes": [
+ "sha256:213336e49e102af26d9cde77dd2d0397afabc5a6bf2fed985dc35b5d1e285a16",
+ "sha256:3fdf7f77adcf649c9911387df51254b813185e32b2c6619f690b593a617e19fa"
+ ],
+ "version": "==3.1.0"
+ },
+ "certifi": {
+ "hashes": [
+ "sha256:e4f3620cfea4f83eedc95b24abd9cd56f3c4b146dd0177e83a21b4eb49e21e50",
+ "sha256:fd7c7c74727ddcf00e9acd26bba8da604ffec95bf1c2144e67aff7a8b50e6cef"
+ ],
+ "version": "==2019.9.11"
+ },
+ "chardet": {
+ "hashes": [
+ "sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae",
+ "sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691"
+ ],
+ "version": "==3.0.4"
+ },
+ "decorator": {
+ "hashes": [
+ "sha256:86156361c50488b84a3f148056ea716ca587df2f0de1d34750d35c21312725de",
+ "sha256:f069f3a01830ca754ba5258fde2278454a0b5b79e0d7f5c13b3b97e57d4acff6"
+ ],
+ "version": "==4.4.0"
+ },
+ "defusedxml": {
+ "hashes": [
+ "sha256:6687150770438374ab581bb7a1b327a847dd9c5749e396102de3fad4e8a3ef93",
+ "sha256:f684034d135af4c6cbb949b8a4d2ed61634515257a67299e5f940fbaa34377f5"
+ ],
+ "version": "==0.6.0"
+ },
+ "entrypoints": {
+ "hashes": [
+ "sha256:589f874b313739ad35be6e0cd7efde2a4e9b6fea91edcc34e58ecbb8dbe56d19",
+ "sha256:c70dd71abe5a8c85e55e12c19bd91ccfeec11a6e99044204511f9ed547d48451"
+ ],
+ "version": "==0.3"
+ },
+ "idna": {
+ "hashes": [
+ "sha256:c357b3f628cf53ae2c4c05627ecc484553142ca23264e593d327bcde5e9c3407",
+ "sha256:ea8b7f6188e6fa117537c3df7da9fc686d485087abf6ac197f9c46432f7e4a3c"
+ ],
+ "version": "==2.8"
+ },
+ "ipykernel": {
+ "hashes": [
+ "sha256:167c3ef08450f5e060b76c749905acb0e0fbef9365899377a4a1eae728864383",
+ "sha256:b503913e0b4cce7ed2de965457dfb2edd633e8234161a60e23f2fe2161345d12"
+ ],
+ "version": "==5.1.2"
+ },
+ "ipython": {
+ "hashes": [
+ "sha256:c4ab005921641e40a68e405e286e7a1fcc464497e14d81b6914b4fd95e5dee9b",
+ "sha256:dd76831f065f17bddd7eaa5c781f5ea32de5ef217592cf019e34043b56895aa1"
+ ],
+ "markers": "python_version >= '3.3'",
+ "version": "==7.8.0"
+ },
+ "ipython-genutils": {
+ "hashes": [
+ "sha256:72dd37233799e619666c9f639a9da83c34013a73e8bbc79a7a6348d93c61fab8",
+ "sha256:eb2e116e75ecef9d4d228fdc66af54269afa26ab4463042e33785b887c628ba8"
+ ],
+ "version": "==0.2.0"
+ },
+ "ipywidgets": {
+ "hashes": [
+ "sha256:13ffeca438e0c0f91ae583dc22f50379b9d6b28390ac7be8b757140e9a771516",
+ "sha256:e945f6e02854a74994c596d9db83444a1850c01648f1574adf144fbbabe05c97"
+ ],
+ "version": "==7.5.1"
+ },
+ "jedi": {
+ "hashes": [
+ "sha256:786b6c3d80e2f06fd77162a07fed81b8baa22dde5d62896a790a331d6ac21a27",
+ "sha256:ba859c74fa3c966a22f2aeebe1b74ee27e2a462f56d3f5f7ca4a59af61bfe42e"
+ ],
+ "version": "==0.15.1"
+ },
+ "jinja2": {
+ "hashes": [
+ "sha256:065c4f02ebe7f7cf559e49ee5a95fb800a9e4528727aec6f24402a5374c65013",
+ "sha256:14dd6caf1527abb21f08f86c784eac40853ba93edb79552aa1e4b8aef1b61c7b"
+ ],
+ "version": "==2.10.1"
+ },
+ "jsonschema": {
+ "hashes": [
+ "sha256:5f9c0a719ca2ce14c5de2fd350a64fd2d13e8539db29836a86adc990bb1a068f",
+ "sha256:8d4a2b7b6c2237e0199c8ea1a6d3e05bf118e289ae2b9d7ba444182a2959560d"
+ ],
+ "version": "==3.0.2"
+ },
+ "jupyter": {
+ "hashes": [
+ "sha256:3e1f86076bbb7c8c207829390305a2b1fe836d471ed54be66a3b8c41e7f46cc7",
+ "sha256:5b290f93b98ffbc21c0c7e749f054b3267782166d72fa5e3ed1ed4eaf34a2b78",
+ "sha256:d9dc4b3318f310e34c82951ea5d6683f67bed7def4b259fafbfe4f1beb1d8e5f"
+ ],
+ "index": "pypi",
+ "version": "==1.0.0"
+ },
+ "jupyter-client": {
+ "hashes": [
+ "sha256:73a809a2964afa07adcc1521537fddb58c2ffbb7e84d53dc5901cf80480465b3",
+ "sha256:98e8af5edff5d24e4d31e73bc21043130ae9d955a91aa93fc0bc3b1d0f7b5880"
+ ],
+ "version": "==5.3.1"
+ },
+ "jupyter-console": {
+ "hashes": [
+ "sha256:308ce876354924fb6c540b41d5d6d08acfc946984bf0c97777c1ddcb42e0b2f5",
+ "sha256:cc80a97a5c389cbd30252ffb5ce7cefd4b66bde98219edd16bf5cb6f84bb3568"
+ ],
+ "version": "==6.0.0"
+ },
+ "jupyter-core": {
+ "hashes": [
+ "sha256:2c6e7c1e9f2ac45b5c2ceea5730bc9008d92fe59d0725eac57b04c0edfba24f7",
+ "sha256:f4fa22d6cf25f34807c995f22d2923693575c70f02557bcbfbe59bd5ec8d8b84"
+ ],
+ "version": "==4.5.0"
+ },
+ "markupsafe": {
+ "hashes": [
+ "sha256:00bc623926325b26bb9605ae9eae8a215691f33cae5df11ca5424f06f2d1f473",
+ "sha256:09027a7803a62ca78792ad89403b1b7a73a01c8cb65909cd876f7fcebd79b161",
+ "sha256:09c4b7f37d6c648cb13f9230d847adf22f8171b1ccc4d5682398e77f40309235",
+ "sha256:1027c282dad077d0bae18be6794e6b6b8c91d58ed8a8d89a89d59693b9131db5",
+ "sha256:24982cc2533820871eba85ba648cd53d8623687ff11cbb805be4ff7b4c971aff",
+ "sha256:29872e92839765e546828bb7754a68c418d927cd064fd4708fab9fe9c8bb116b",
+ "sha256:43a55c2930bbc139570ac2452adf3d70cdbb3cfe5912c71cdce1c2c6bbd9c5d1",
+ "sha256:46c99d2de99945ec5cb54f23c8cd5689f6d7177305ebff350a58ce5f8de1669e",
+ "sha256:500d4957e52ddc3351cabf489e79c91c17f6e0899158447047588650b5e69183",
+ "sha256:535f6fc4d397c1563d08b88e485c3496cf5784e927af890fb3c3aac7f933ec66",
+ "sha256:62fe6c95e3ec8a7fad637b7f3d372c15ec1caa01ab47926cfdf7a75b40e0eac1",
+ "sha256:6dd73240d2af64df90aa7c4e7481e23825ea70af4b4922f8ede5b9e35f78a3b1",
+ "sha256:717ba8fe3ae9cc0006d7c451f0bb265ee07739daf76355d06366154ee68d221e",
+ "sha256:79855e1c5b8da654cf486b830bd42c06e8780cea587384cf6545b7d9ac013a0b",
+ "sha256:7c1699dfe0cf8ff607dbdcc1e9b9af1755371f92a68f706051cc8c37d447c905",
+ "sha256:88e5fcfb52ee7b911e8bb6d6aa2fd21fbecc674eadd44118a9cc3863f938e735",
+ "sha256:8defac2f2ccd6805ebf65f5eeb132adcf2ab57aa11fdf4c0dd5169a004710e7d",
+ "sha256:98c7086708b163d425c67c7a91bad6e466bb99d797aa64f965e9d25c12111a5e",
+ "sha256:9add70b36c5666a2ed02b43b335fe19002ee5235efd4b8a89bfcf9005bebac0d",
+ "sha256:9bf40443012702a1d2070043cb6291650a0841ece432556f784f004937f0f32c",
+ "sha256:ade5e387d2ad0d7ebf59146cc00c8044acbd863725f887353a10df825fc8ae21",
+ "sha256:b00c1de48212e4cc9603895652c5c410df699856a2853135b3967591e4beebc2",
+ "sha256:b1282f8c00509d99fef04d8ba936b156d419be841854fe901d8ae224c59f0be5",
+ "sha256:b2051432115498d3562c084a49bba65d97cf251f5a331c64a12ee7e04dacc51b",
+ "sha256:ba59edeaa2fc6114428f1637ffff42da1e311e29382d81b339c1817d37ec93c6",
+ "sha256:c8716a48d94b06bb3b2524c2b77e055fb313aeb4ea620c8dd03a105574ba704f",
+ "sha256:cd5df75523866410809ca100dc9681e301e3c27567cf498077e8551b6d20e42f",
+ "sha256:e249096428b3ae81b08327a63a485ad0878de3fb939049038579ac0ef61e17e7"
+ ],
+ "version": "==1.1.1"
+ },
+ "mistune": {
+ "hashes": [
+ "sha256:59a3429db53c50b5c6bcc8a07f8848cb00d7dc8bdb431a4ab41920d201d4756e",
+ "sha256:88a1051873018da288eee8538d476dffe1262495144b33ecb586c4ab266bb8d4"
+ ],
+ "version": "==0.8.4"
+ },
+ "nbconvert": {
+ "hashes": [
+ "sha256:427a468ec26e7d68a529b95f578d5cbf018cb4c1f889e897681c2b6d11897695",
+ "sha256:48d3c342057a2cf21e8df820d49ff27ab9f25fc72b8f15606bd47967333b2709"
+ ],
+ "version": "==5.6.0"
+ },
+ "nbexec": {
+ "hashes": [
+ "sha256:e367bac4a5c7cbd12e5b73b5e1011a4227b11d9e8e5a3811d8b8b9987eb235d0"
+ ],
+ "index": "pypi",
+ "version": "==0.0.2"
+ },
+ "nbformat": {
+ "hashes": [
+ "sha256:b9a0dbdbd45bb034f4f8893cafd6f652ea08c8c1674ba83f2dc55d3955743b0b",
+ "sha256:f7494ef0df60766b7cabe0a3651556345a963b74dbc16bc7c18479041170d402"
+ ],
+ "version": "==4.4.0"
+ },
+ "notebook": {
+ "hashes": [
+ "sha256:660976fe4fe45c7aa55e04bf4bccb9f9566749ff637e9020af3422f9921f9a5d",
+ "sha256:b0a290f5cc7792d50a21bec62b3c221dd820bf00efa916ce9aeec4b5354bde20"
+ ],
+ "version": "==6.0.1"
+ },
+ "numpy": {
+ "hashes": [
+ "sha256:05dbfe72684cc14b92568de1bc1f41e5f62b00f714afc9adee42f6311738091f",
+ "sha256:0d82cb7271a577529d07bbb05cb58675f2deb09772175fab96dc8de025d8ac05",
+ "sha256:10132aa1fef99adc85a905d82e8497a580f83739837d7cbd234649f2e9b9dc58",
+ "sha256:12322df2e21f033a60c80319c25011194cd2a21294cc66fee0908aeae2c27832",
+ "sha256:16f19b3aa775dddc9814e02a46b8e6ae6a54ed8cf143962b4e53f0471dbd7b16",
+ "sha256:3d0b0989dd2d066db006158de7220802899a1e5c8cf622abe2d0bd158fd01c2c",
+ "sha256:438a3f0e7b681642898fd7993d38e2bf140a2d1eafaf3e89bb626db7f50db355",
+ "sha256:5fd214f482ab53f2cea57414c5fb3e58895b17df6e6f5bca5be6a0bb6aea23bb",
+ "sha256:73615d3edc84dd7c4aeb212fa3748fb83217e00d201875a47327f55363cef2df",
+ "sha256:7bd355ad7496f4ce1d235e9814ec81ee3d28308d591c067ce92e49f745ba2c2f",
+ "sha256:7d077f2976b8f3de08a0dcf5d72083f4af5411e8fddacd662aae27baa2601196",
+ "sha256:a4092682778dc48093e8bda8d26ee8360153e2047826f95a3f5eae09f0ae3abf",
+ "sha256:b458de8624c9f6034af492372eb2fee41a8e605f03f4732f43fc099e227858b2",
+ "sha256:e70fc8ff03a961f13363c2c95ef8285e0cf6a720f8271836f852cc0fa64e97c8",
+ "sha256:ee8e9d7cad5fe6dde50ede0d2e978d81eafeaa6233fb0b8719f60214cf226578",
+ "sha256:f4a4f6aba148858a5a5d546a99280f71f5ee6ec8182a7d195af1a914195b21a2"
+ ],
+ "version": "==1.17.2"
+ },
+ "pandas": {
+ "hashes": [
+ "sha256:18d91a9199d1dfaa01ad645f7540370ba630bdcef09daaf9edf45b4b1bca0232",
+ "sha256:3f26e5da310a0c0b83ea50da1fd397de2640b02b424aa69be7e0784228f656c9",
+ "sha256:4182e32f4456d2c64619e97c58571fa5ca0993d1e8c2d9ca44916185e1726e15",
+ "sha256:426e590e2eb0e60f765271d668a30cf38b582eaae5ec9b31229c8c3c10c5bc21",
+ "sha256:5eb934a8f0dc358f0e0cdf314072286bbac74e4c124b64371395e94644d5d919",
+ "sha256:717928808043d3ea55b9bcde636d4a52d2236c246f6df464163a66ff59980ad8",
+ "sha256:8145f97c5ed71827a6ec98ceaef35afed1377e2d19c4078f324d209ff253ecb5",
+ "sha256:8744c84c914dcc59cbbb2943b32b7664df1039d99e834e1034a3372acb89ea4d",
+ "sha256:c1ac1d9590d0c9314ebf01591bd40d4c03d710bfc84a3889e5263c97d7891dee",
+ "sha256:cb2e197b7b0687becb026b84d3c242482f20cbb29a9981e43604eb67576da9f6",
+ "sha256:d4001b71ad2c9b84ff18b182cea22b7b6cbf624216da3ea06fb7af28d1f93165",
+ "sha256:d8930772adccb2882989ab1493fa74bd87d47c8ac7417f5dd3dd834ba8c24dc9",
+ "sha256:dfbb0173ee2399bc4ed3caf2d236e5c0092f948aafd0a15fbe4a0e77ee61a958",
+ "sha256:eebfbba048f4fa8ac711b22c78516e16ff8117d05a580e7eeef6b0c2be554c18",
+ "sha256:f1b21bc5cf3dbea53d33615d1ead892dfdae9d7052fa8898083bec88be20dcd2"
+ ],
+ "index": "pypi",
+ "version": "==0.25.1"
+ },
+ "pandocfilters": {
+ "hashes": [
+ "sha256:b3dd70e169bb5449e6bc6ff96aea89c5eea8c5f6ab5e207fc2f521a2cf4a0da9"
+ ],
+ "version": "==1.4.2"
+ },
+ "parso": {
+ "hashes": [
+ "sha256:63854233e1fadb5da97f2744b6b24346d2750b85965e7e399bec1620232797dc",
+ "sha256:666b0ee4a7a1220f65d367617f2cd3ffddff3e205f3f16a0284df30e774c2a9c"
+ ],
+ "version": "==0.5.1"
+ },
+ "pexpect": {
+ "hashes": [
+ "sha256:2094eefdfcf37a1fdbfb9aa090862c1a4878e5c7e0e7e7088bdb511c558e5cd1",
+ "sha256:9e2c1fd0e6ee3a49b28f95d4b33bc389c89b20af6a1255906e90ff1262ce62eb"
+ ],
+ "markers": "sys_platform != 'win32'",
+ "version": "==4.7.0"
+ },
+ "pickleshare": {
+ "hashes": [
+ "sha256:87683d47965c1da65cdacaf31c8441d12b8044cdec9aca500cd78fc2c683afca",
+ "sha256:9649af414d74d4df115d5d718f82acb59c9d418196b7b4290ed47a12ce62df56"
+ ],
+ "version": "==0.7.5"
+ },
+ "prometheus-client": {
+ "hashes": [
+ "sha256:71cd24a2b3eb335cb800c7159f423df1bd4dcd5171b234be15e3f31ec9f622da"
+ ],
+ "version": "==0.7.1"
+ },
+ "prompt-toolkit": {
+ "hashes": [
+ "sha256:11adf3389a996a6d45cc277580d0d53e8a5afd281d0c9ec71b28e6f121463780",
+ "sha256:2519ad1d8038fd5fc8e770362237ad0364d16a7650fb5724af6997ed5515e3c1",
+ "sha256:977c6583ae813a37dc1c2e1b715892461fcbdaa57f6fc62f33a528c4886c8f55"
+ ],
+ "version": "==2.0.9"
+ },
+ "ptyprocess": {
+ "hashes": [
+ "sha256:923f299cc5ad920c68f2bc0bc98b75b9f838b93b599941a6b63ddbc2476394c0",
+ "sha256:d7cc528d76e76342423ca640335bd3633420dc1366f258cb31d05e865ef5ca1f"
+ ],
+ "markers": "os_name != 'nt'",
+ "version": "==0.6.0"
+ },
+ "pygments": {
+ "hashes": [
+ "sha256:71e430bc85c88a430f000ac1d9b331d2407f681d6f6aec95e8bcfbc3df5b0127",
+ "sha256:881c4c157e45f30af185c1ffe8d549d48ac9127433f2c380c24b84572ad66297"
+ ],
+ "version": "==2.4.2"
+ },
+ "pyrsistent": {
+ "hashes": [
+ "sha256:34b47fa169d6006b32e99d4b3c4031f155e6e68ebcc107d6454852e8e0ee6533"
+ ],
+ "version": "==0.15.4"
+ },
+ "python-dateutil": {
+ "hashes": [
+ "sha256:7e6584c74aeed623791615e26efd690f29817a27c73085b78e4bad02493df2fb",
+ "sha256:c89805f6f4d64db21ed966fda138f8a5ed7a4fdbc1a8ee329ce1b74e3c74da9e"
+ ],
+ "version": "==2.8.0"
+ },
+ "pytz": {
+ "hashes": [
+ "sha256:26c0b32e437e54a18161324a2fca3c4b9846b74a8dccddd843113109e1116b32",
+ "sha256:c894d57500a4cd2d5c71114aaab77dbab5eabd9022308ce5ac9bb93a60a6f0c7"
+ ],
+ "version": "==2019.2"
+ },
+ "pyzmq": {
+ "hashes": [
+ "sha256:01636e95a88d60118479041c6aaaaf5419c6485b7b1d37c9c4dd424b7b9f1121",
+ "sha256:021dba0d1436516092c624359e5da51472b11ba8edffa334218912f7e8b65467",
+ "sha256:0463bd941b6aead494d4035f7eebd70035293dd6caf8425993e85ad41de13fa3",
+ "sha256:05fd51edd81eed798fccafdd49c936b6c166ffae7b32482e4d6d6a2e196af4e6",
+ "sha256:1fadc8fbdf3d22753c36d4172169d184ee6654f8d6539e7af25029643363c490",
+ "sha256:22efa0596cf245a78a99060fe5682c4cd00c58bb7614271129215c889062db80",
+ "sha256:260c70b7c018905ec3659d0f04db735ac830fe27236e43b9dc0532cf7c9873ef",
+ "sha256:2762c45e289732d4450406cedca35a9d4d71e449131ba2f491e0bf473e3d2ff2",
+ "sha256:2fc6cada8dc53521c1189596f1898d45c5f68603194d3a6453d6db4b27f4e12e",
+ "sha256:343b9710a61f2b167673bea1974e70b5dccfe64b5ed10626798f08c1f7227e72",
+ "sha256:41bf96d5f554598a0632c3ec28e3026f1d6591a50f580df38eff0b8067efb9e7",
+ "sha256:856b2cdf7a1e2cbb84928e1e8db0ea4018709b39804103d3a409e5584f553f57",
+ "sha256:85b869abc894672de9aecdf032158ea8ad01e2f0c3b09ef60e3687fb79418096",
+ "sha256:93f44739db69234c013a16990e43db1aa0af3cf5a4b8b377d028ff24515fbeb3",
+ "sha256:98fa3e75ccb22c0dc99654e3dd9ff693b956861459e8c8e8734dd6247b89eb29",
+ "sha256:9a22c94d2e93af8bebd4fcf5fa38830f5e3b1ff0d4424e2912b07651eb1bafb4",
+ "sha256:a7d3f4b4bbb5d7866ae727763268b5c15797cbd7b63ea17f3b0ec1067da8994b",
+ "sha256:b645a49376547b3816433a7e2d2a99135c8e651e50497e7ecac3bd126e4bea16",
+ "sha256:cf0765822e78cf9e45451647a346d443f66792aba906bc340f4e0ac7870c169c",
+ "sha256:dc398e1e047efb18bfab7a8989346c6921a847feae2cad69fedf6ca12fb99e2c",
+ "sha256:dd5995ae2e80044e33b5077fb4bc2b0c1788ac6feaf15a6b87a00c14b4bdd682",
+ "sha256:e03fe5e07e70f245dc9013a9d48ae8cc4b10c33a1968039c5a3b64b5d01d083d",
+ "sha256:ea09a306144dff2795e48439883349819bef2c53c0ee62a3c2fae429451843bb",
+ "sha256:f4e37f33da282c3c319849877e34f97f0a3acec09622ec61b7333205bdd13b52",
+ "sha256:fa4bad0d1d173dee3e8ef3c3eb6b2bb6c723fc7a661eeecc1ecb2fa99860dd45"
+ ],
+ "version": "==18.1.0"
+ },
+ "qtconsole": {
+ "hashes": [
+ "sha256:40d5d8e00d070ea266dbf6f0da74c4b9597b8b8d67cd8233c3ffd8debf923703",
+ "sha256:b91e7412587e6cfe1644696538f73baf5611e837be5406633218443b2827c6d9"
+ ],
+ "version": "==4.5.5"
+ },
+ "requests": {
+ "hashes": [
+ "sha256:11e007a8a2aa0323f5a921e9e6a2d7e4e67d9877e85773fba9ba6419025cbeb4",
+ "sha256:9cf5292fcd0f598c671cfc1e0d7d1a7f13bb8085e9a590f48c010551dc6c4b31"
+ ],
+ "index": "pypi",
+ "version": "==2.22.0"
+ },
+ "requests-cache": {
+ "hashes": [
+ "sha256:813023269686045f8e01e2289cc1e7e9ae5ab22ddd1e2849a9093ab3ab7270eb",
+ "sha256:81e13559baee64677a7d73b85498a5a8f0639e204517b5d05ff378e44a57831a"
+ ],
+ "index": "pypi",
+ "version": "==0.5.2"
+ },
+ "send2trash": {
+ "hashes": [
+ "sha256:60001cc07d707fe247c94f74ca6ac0d3255aabcb930529690897ca2a39db28b2",
+ "sha256:f1691922577b6fa12821234aeb57599d887c4900b9ca537948d2dac34aea888b"
+ ],
+ "version": "==1.5.0"
+ },
+ "six": {
+ "hashes": [
+ "sha256:3350809f0555b11f552448330d0b52d5f24c91a322ea4a15ef22629740f3761c",
+ "sha256:d16a0141ec1a18405cd4ce8b4613101da75da0e9a7aec5bdd4fa804d0e0eba73"
+ ],
+ "version": "==1.12.0"
+ },
+ "terminado": {
+ "hashes": [
+ "sha256:d9d012de63acb8223ac969c17c3043337c2fcfd28f3aea1ee429b345d01ef460",
+ "sha256:de08e141f83c3a0798b050ecb097ab6259c3f0331b2f7b7750c9075ced2c20c2"
+ ],
+ "version": "==0.8.2"
+ },
+ "testpath": {
+ "hashes": [
+ "sha256:46c89ebb683f473ffe2aab0ed9f12581d4d078308a3cb3765d79c6b2317b0109",
+ "sha256:b694b3d9288dbd81685c5d2e7140b81365d46c29f5db4bc659de5aa6b98780f8"
+ ],
+ "version": "==0.4.2"
+ },
+ "tornado": {
+ "hashes": [
+ "sha256:349884248c36801afa19e342a77cc4458caca694b0eda633f5878e458a44cb2c",
+ "sha256:398e0d35e086ba38a0427c3b37f4337327231942e731edaa6e9fd1865bbd6f60",
+ "sha256:4e73ef678b1a859f0cb29e1d895526a20ea64b5ffd510a2307b5998c7df24281",
+ "sha256:559bce3d31484b665259f50cd94c5c28b961b09315ccd838f284687245f416e5",
+ "sha256:abbe53a39734ef4aba061fca54e30c6b4639d3e1f59653f0da37a0003de148c7",
+ "sha256:c845db36ba616912074c5b1ee897f8e0124df269468f25e4fe21fe72f6edd7a9",
+ "sha256:c9399267c926a4e7c418baa5cbe91c7d1cf362d505a1ef898fde44a07c9dd8a5"
+ ],
+ "version": "==6.0.3"
+ },
+ "tqdm": {
+ "hashes": [
+ "sha256:1be3e4e3198f2d0e47b928e9d9a8ec1b63525db29095cec1467f4c5a4ea8ebf9",
+ "sha256:7e39a30e3d34a7a6539378e39d7490326253b7ee354878a92255656dc4284457"
+ ],
+ "index": "pypi",
+ "version": "==4.35.0"
+ },
+ "traitlets": {
+ "hashes": [
+ "sha256:9c4bd2d267b7153df9152698efb1050a5d84982d3384a37b2c1f7723ba3e7835",
+ "sha256:c6cb5e6f57c5a9bdaa40fa71ce7b4af30298fbab9ece9815b5d995ab6217c7d9"
+ ],
+ "version": "==4.3.2"
+ },
+ "urllib3": {
+ "hashes": [
+ "sha256:b246607a25ac80bedac05c6f282e3cdaf3afb65420fd024ac94435cabe6e18d1",
+ "sha256:dbe59173209418ae49d485b87d1681aefa36252ee85884c31346debd19463232"
+ ],
+ "version": "==1.25.3"
+ },
+ "wcwidth": {
+ "hashes": [
+ "sha256:3df37372226d6e63e1b1e1eda15c594bca98a22d33a23832a90998faa96bc65e",
+ "sha256:f4ebe71925af7b40a864553f761ed559b43544f8f71746c2d756c7fe788ade7c"
+ ],
+ "version": "==0.1.7"
+ },
+ "webencodings": {
+ "hashes": [
+ "sha256:a0af1213f3c2226497a97e2b3aa01a7e4bee4f403f95be16fc9acd2947514a78",
+ "sha256:b36a1c245f2d304965eb4e0a82848379241dc04b865afcc4aab16748587e1923"
+ ],
+ "version": "==0.5.1"
+ },
+ "widgetsnbextension": {
+ "hashes": [
+ "sha256:079f87d87270bce047512400efd70238820751a11d2d8cb137a5a5bdbaf255c7",
+ "sha256:bd314f8ceb488571a5ffea6cc5b9fc6cba0adaf88a9d2386b93a489751938bcd"
+ ],
+ "version": "==3.5.1"
+ }
+ },
+ "develop": {}
+}
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..d59aabe
--- /dev/null
+++ b/README.md
@@ -0,0 +1,117 @@
+# Analysis of comments submitted to three FCC public dockets
+
+This repository contains data, code, and methodology supporting [BuzzFeed News' analysis of comments submitted to three Federal Communications Commission (FCC) dockets](https://www.buzzfeednews.com/article/jsvine/net-neutrality-fcc-fake-comments-impersonation), published October 3, 2019:
+
+- 17-108 ("Restoring Internet Freedom")
+- 16-42 ("Expanding Consumers' Video Navigation Choices")
+- 14-28 ("Protecting and Promoting the Open Internet")
+
+Please see below for further details.
+
+## Data Sources
+
+The data in this repository comes from several sources:
+
+### The FCC's Electronic Comment Filing System (ECFS)
+
+The ECFS is the FCC's public portal for searching and accessing comments submitted to the commission's dockets. BuzzFeed News used the website to download each individually-listed comment for two of the dockets: [14-28](https://www.fcc.gov/ecfs/search/filings?date_disseminated=%5Bgte%5D2014-02-21%5Blte%5D2016-01-01&proceedings_name=14-28&sort=date_disseminated,ASC&submissiontype_description=COMMENT) and [16-42](https://www.fcc.gov/ecfs/search/filings?date_disseminated=%5Bgte%5D2016-02-23%5Blte%5D2018-10-01&proceedings_name=16-42&sort=date_disseminated,ASC&submissiontype_description=COMMENT). __Note__: Not all comments submitted to the FCC are individually listed; in some cases, an organization will submit a consolidated set of comments as a PDF, with signatures and/or commenters' information listed in that PDF. Because of the extraordinary variety and inconsistency of those files, BuzzFeed News did not disaggregate those comments.
+
+### The FCC's bulk download of Docket 17-108 comments
+
+On November 7, 2017, [the FCC released](https://ecfsapi.fcc.gov/file/11073095518421/DA-17-1089A1_Rcd.pdf) a "complete set of [Docket 17-108] filings submitted as of November 3, 2017"; BuzzFeed News used this download to examine docket-wide trends.
+
+### Bulk uploads to Docket 17-108, via FOIA
+
+In response to two FOIA requests, the FCC provided to BuzzFeed News the files submitted to the agency's [bulk-upload system for Docket 17-108](https://www.fcc.gov/restoring-internet-freedom-comments-wc-docket-no-17-108), plus associated metadata indicating the uploader's Box.com account and the time of the upload. According to the FCC, it provided all such files submitted. Although the agency provided a template for the uploads, some of the files — typically the smallest ones, containing just one comment each — do not conform to it and could not be incorporated easily. Those comments, which represent an exceedingly small percentage of all bulk-uploaded comments, have not been included in this repository's data; in many cases, the corresponding comments appear also not to have been added to the FCC's public comment portal. In certain other cases, the upload files use non-standard column names. In cases where the intention appeared to be clear, BuzzFeed News fixed the column names and included the data.
+
+### haveibeenpwned.com
+
+[Have I Been Pwned](https://haveibeenpwned.com/) is a website and service that identifies whether any given email address has been exposed in any of hundreds of major data breaches. BuzzFeed News used [HIBP's application programming interface](https://haveibeenpwned.com/API/v3) to determine the most common breaches associated with various groups of email addresses.
+
+## Personal Information Minimization
+
+Because it appears that many of the comments in the data above were submitted without the consent of the named commenters, we have taken the following steps:
+
+- Removing all raw personal-information columns (name, physical address, etc.).
+
+- Replacing each distinct email address with a randomly-assigned unique identifier. (Specifically, a [version 4 UUID](https://www.cryptosys.net/pki/uuid-rfc4122.html).)
+
+- Replacing each distinct email domain with a similar randomly-assigned unique identifier, except for very common domains. (Specifically the 36 domains that are associated with 10,000 or more unique email addresses in the Docket 17-108 comments.)
+
+- Replacing each distinct combination of name + location (first line of street address, city, state, ZIP code) with another UUID. Before converting to UUIDs, ZIP codes are converted to zero-padded five-digit representations, and all strings are lowercased. For instance: `John Doe, 123 Smith Street, New York, NY 01111` will receive the same UUID as `john doe, 123 SMITH STREET, New York, ny 1111`, but neither will match submissions that put him at `123 Smith St.` (with the abbreviation).
+
+## Data Files
+
+The process above produces the files listed below. Several are too large to host on GitHub, so BuzzFeed News has [uploaded them here](https://archive.org/details/fcc-comments-and-bulk-uploads).
+
+### Comment data
+
+These files contain selected fields from the comment data listed above:
+
+- `bulk-uploads-17-108-with-uuids.csv`: Docket 17-108 bulk uploads, via FOIA
+- `comments-17-108-with-uuids.csv`: Docket 17-108, via FCC official download
+- `comments-14-28-with-uuids.csv`: Docket 14-28, via FCC online portal
+- `comments-16-42-with-uuids.csv`: Docket 16-42, via FCC online portal
+
+They contain the following columns:
+
+- `date`: The date of submission.
+- `id_submission`: The ID the FCC has assigned to the comment. __Note__: Not available in `bulk-uploads-17-108-with-uuids.csv`, because the FCC assigns the IDs *after* they are uploaded.
+- `comments`: The text of the comment. __Note__: This is sometimes modified by the FCC, for example by adding a filename or, as appears to be the case for some Docket 14-28 comments, removing boilerplate language. __Note__: Not included in `comments-17-108-with-uuids.csv` for file-size considerations, because this file is mainly used for domain-counts.
+- `name_and_location`: The UUID (see above) corresponding to the name and address information provided with the comment. __Note__: Not included in `comments-17-108-with-uuids.csv`.
+- `email_address`: The UUID (see above) corresponding to the email address provided with the comment. __Note__: In the FCC's commenting system, you don't have to control an email address to list it as the author of a comment.
+- `email_address_nonstandard`: If the email address contains nonstandard characters (such as `%`) or formatting (such as lacking an `@` symbol), this value will be `1`; otherwise, it will be `0`. This is used to filter out likely-invalid addresses before checking them on Have I Been Pwned.
+- `email_domain`: The domain of the email address, as a UUID unless it is one of the 36 domains described above.
+
+Additionally, `bulk-uploads-17-108-with-uuids.csv` contains the following columns:
+
+- `file`: The name of the file in which the comment was uploaded.
+- `uploader`: The email address associated with the Box.com account that uploaded the file.
+
+### Breach data
+
+These files list the breaches, per Have I Been Pwned, for email addresses in randomized samples of the comments bulk-uploaded to Docket 17-108:
+
+- `breaches-17-108-bulk-uploads-sample.csv`: 1,000-address sample of each of the eight bulk-uploaders whose Docket 17-108 uploads contained at least 10,000 unique email addresses.
+- `breaches-17-108-mb-sample.csv`: 10,000-address sample of Media Bridge's Docket 17-108 bulk-uploads.
+
+
+They contain the following columns:
+
+- `email_address`: The UUID (see above) corresponding to the email address examined.
+- `breach`: The name of the breach, [as returned by Have I Been Pwned](https://haveibeenpwned.com/API/v3).
+
+## Analysis
+
+The [`analyze-fcc-comments` notebook](notebooks/analyze-fcc-comments.ipynb) examines comments submitted to the three FCC dockets described above, the language used in them, and the timing of their submission. For Docket 17-108, the notebook also examines the email domains associated with the comments, as well as rates at which the email addresses in the bulk uploads overlap with those exposed in major data breaches. The notebook also examines the overlap between the contact information in Docket 16-42 and Docket 17-108.
+
+The [`analyze-mb-comment-structure` notebook](notebooks/analyze-mb-comment-structure.ipynb) examines the phrasing of the comments that Media Bridge submitted to Docket 17-108, and attempts to reverse-engineer the comments that use randomly-generated text.
+
+## Reproducibility
+
+The code running the analysis is written in Python 3, and requires the following Python libraries:
+
+- [jupyter](https://jupyter.org/) to run the notebook infrastructure
+- [pandas](https://pandas.pydata.org/) for data loading and analysis
+
+If you would like to reuse the code for fetching data from Have I Been Pwned's API, you will also need these Python libraries:
+
+- [requests](https://2.python-requests.org/en/master/) for HTTP requests
+- [requests-cache](https://requests-cache.readthedocs.io/en/latest/) for caching HTTP requests
+- [tqdm](https://tqdm.github.io) for progress bars
+
+If you use Pipenv, you can install all required libraries with `pipenv install`.
+
+As noted above, you will need to download the source data separately. Save the folder as this repository's `/data` directory.
+
+Execute the notebooks in the `notebooks/` directory to reproduce the findings.
+
+## Licensing
+
+All code in this repository is available under the [MIT License](https://opensource.org/licenses/MIT).
+
+## Questions / Feedback
+
+Contact Jeremy Singer-Vine at [jeremy.singer-vine@buzzfeed.com](mailto:jeremy.singer-vine@buzzfeed.com).
+
+Looking for more from BuzzFeed News? [Click here for a list of our open-sourced projects, data, and code.](https://github.com/BuzzFeedNews/everything)
diff --git a/data/.keep b/data/.keep
new file mode 100644
index 0000000..e69de29
diff --git a/notebooks/analyze-fcc-comments.ipynb b/notebooks/analyze-fcc-comments.ipynb
new file mode 100644
index 0000000..b51f557
--- /dev/null
+++ b/notebooks/analyze-fcc-comments.ipynb
@@ -0,0 +1,5491 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Analysis of comments in three FCC dockets\n",
+ "\n",
+ "This notebook contains Python code that runs the following steps:\n",
+ "\n",
+ "- Loading the four comment datasets under analysis (published comments for FCC dockets 14-28, 16-42, 17-108, plus bulk-uploaded comments for docket 17-108).\n",
+ "\n",
+ "\n",
+ "- Classifying the comments for dockets 14-28 and 16-42, based on the language used in them.\n",
+ "\n",
+ "\n",
+ "- Examining:\n",
+ "\n",
+ " - How often email addresses in the 17-108 bulk uploads appear in data breaches identified by [Have I Been Pwned](https://haveibeenpwned.com/)\n",
+ "\n",
+ " - The overlap between comments in docket 16-42 and bulk-uploaded comments in docket 17-108\n",
+ "\n",
+ " - The comments attributed to Annie Reeves vis-a-vis the timing and language used in American Commitment's docket 14-28 and docket 16-42 mass-comment campaigns.\n",
+ " \n",
+ "__Please see this repository's landing page and associated BuzzFeed News article (linked on the landing page) for context before continuing.__"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "---"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Import Python libraries and set key variables"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Standard libraries\n",
+ "import os\n",
+ "import sys\n",
+ "import time\n",
+ "import re\n",
+ "\n",
+ "# External libraries\n",
+ "import pandas as pd"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Change this to True if you plan to reuse this notebook\n",
+ "# and want to make HTTP requests to Have I Been Pwned's API\n",
+ "\n",
+ "MAKE_HTTP_REQUESTS = False\n",
+ "\n",
+ "if MAKE_HTTP_REQUESTS:\n",
+ " from tqdm.auto import tqdm\n",
+ " import requests\n",
+ " import requests_cache\n",
+ " \n",
+ " # This is the API key for Have I Been Pwned\n",
+ " HIBP_KEY = open(\"../hibp-key.txt\").read().strip()\n",
+ " \n",
+ " # Enables graphical progress bars when fetching HIBP data\n",
+ " tqdm.pandas()\n",
+ " \n",
+ " # For caching HTTP requests\n",
+ " requests_cache.install_cache(\n",
+ " \"../hibp-requests-cache\",\n",
+ " allowable_codes = (200, 404),\n",
+ " )"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "BASE_PATH = \"../data/\"\n",
+ "\n",
+ "# In the sampling procedures below, we use this \"random state\"\n",
+ "# to make the samples reproducible. \n",
+ "RANDOM_STATE = 0"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Load comments"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def load_comments(path, **kwargs):\n",
+ " return (\n",
+ " pd.read_csv(\n",
+ " path,\n",
+ " dtype = str,\n",
+ " **kwargs\n",
+ " )\n",
+ " .astype({\n",
+ " \"email_address_nonstandard\": int\n",
+ " })\n",
+ " )"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Docket 17-108, bulk uploads"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " date | \n",
+ " comments | \n",
+ " file | \n",
+ " uploader | \n",
+ " email_address_nonstandard | \n",
+ " email_address | \n",
+ " email_domain | \n",
+ " name_and_location | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 5/8/2017 | \n",
+ " Dear FCC, I am am writing today to SUPPORT net... | \n",
+ " ecfs-input-template-17-108 (209).csv | \n",
+ " kathleenkintz@gmail.com | \n",
+ " 0 | \n",
+ " 9f664e24-96aa-4d96-b453-24d926658b47 | \n",
+ " gmail.com | \n",
+ " 5100f64f-b025-467f-9aa6-0100fa615ae6 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 12/31/2017 | \n",
+ " Dear FCC, I am writing you today because I spe... | \n",
+ " ecfs-input-template-17-108 (120).csv | \n",
+ " vgboy522@gmail.com | \n",
+ " 0 | \n",
+ " 818761bf-4c51-4970-95e6-11b01bac631f | \n",
+ " gmail.com | \n",
+ " dda3bd6b-9ad2-42d0-af15-f12c0b8a9354 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 5/16/17 | \n",
+ " Obama's Federal Communications Commission (FCC... | \n",
+ " TPA_3911_2017526.csv | \n",
+ " esmisc@mac.com | \n",
+ " 0 | \n",
+ " f2cf802f-0c01-4d1f-b28f-0efef2a053ba | \n",
+ " hotmail.com | \n",
+ " d9b96c36-796e-45d1-97d8-00647ae09d89 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 5/16/17 | \n",
+ " Obama's Federal Communications Commission (FCC... | \n",
+ " TPA_3911_2017526.csv | \n",
+ " esmisc@mac.com | \n",
+ " 0 | \n",
+ " 6966ae39-6da6-4a47-a1ec-7dc854030634 | \n",
+ " gmail.com | \n",
+ " f6d75f39-e952-41ff-b7a9-3d86da811496 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 5/16/17 | \n",
+ " Obama's Federal Communications Commission (FCC... | \n",
+ " TPA_3911_2017526.csv | \n",
+ " esmisc@mac.com | \n",
+ " 0 | \n",
+ " 610afa24-f0df-44ff-b621-f545d371efab | \n",
+ " gmail.com | \n",
+ " 1b3050d5-6f3a-495e-a67f-b3b61040fe02 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " date comments \\\n",
+ "0 5/8/2017 Dear FCC, I am am writing today to SUPPORT net... \n",
+ "1 12/31/2017 Dear FCC, I am writing you today because I spe... \n",
+ "2 5/16/17 Obama's Federal Communications Commission (FCC... \n",
+ "3 5/16/17 Obama's Federal Communications Commission (FCC... \n",
+ "4 5/16/17 Obama's Federal Communications Commission (FCC... \n",
+ "\n",
+ " file uploader \\\n",
+ "0 ecfs-input-template-17-108 (209).csv kathleenkintz@gmail.com \n",
+ "1 ecfs-input-template-17-108 (120).csv vgboy522@gmail.com \n",
+ "2 TPA_3911_2017526.csv esmisc@mac.com \n",
+ "3 TPA_3911_2017526.csv esmisc@mac.com \n",
+ "4 TPA_3911_2017526.csv esmisc@mac.com \n",
+ "\n",
+ " email_address_nonstandard email_address \\\n",
+ "0 0 9f664e24-96aa-4d96-b453-24d926658b47 \n",
+ "1 0 818761bf-4c51-4970-95e6-11b01bac631f \n",
+ "2 0 f2cf802f-0c01-4d1f-b28f-0efef2a053ba \n",
+ "3 0 6966ae39-6da6-4a47-a1ec-7dc854030634 \n",
+ "4 0 610afa24-f0df-44ff-b621-f545d371efab \n",
+ "\n",
+ " email_domain name_and_location \n",
+ "0 gmail.com 5100f64f-b025-467f-9aa6-0100fa615ae6 \n",
+ "1 gmail.com dda3bd6b-9ad2-42d0-af15-f12c0b8a9354 \n",
+ "2 hotmail.com d9b96c36-796e-45d1-97d8-00647ae09d89 \n",
+ "3 gmail.com f6d75f39-e952-41ff-b7a9-3d86da811496 \n",
+ "4 gmail.com 1b3050d5-6f3a-495e-a67f-b3b61040fe02 "
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "bulk_uploads_17_108 = load_comments(BASE_PATH + \"bulk-uploads-17-108-with-uuids.csv\")\n",
+ "\n",
+ "bulk_uploads_17_108.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Examine bulk-uploader metrics for 17-108"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " submissions | \n",
+ " unique_emails | \n",
+ " prop_with_email | \n",
+ "
\n",
+ " \n",
+ " uploader | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " esmisc@mac.com | \n",
+ " 4347979 | \n",
+ " 3966016 | \n",
+ " 1.0000 | \n",
+ "
\n",
+ " \n",
+ " shane@mediabridgellc.com | \n",
+ " 1856553 | \n",
+ " 1501145 | \n",
+ " 1.0000 | \n",
+ "
\n",
+ " \n",
+ " mike@fightforthefuture.org | \n",
+ " 1464423 | \n",
+ " 129682 | \n",
+ " 0.2464 | \n",
+ "
\n",
+ " \n",
+ " karen@momsrising.org | \n",
+ " 1069368 | \n",
+ " 17870 | \n",
+ " 0.0362 | \n",
+ "
\n",
+ " \n",
+ " dutch@freepress.net | \n",
+ " 528607 | \n",
+ " 3 | \n",
+ " 0.0000 | \n",
+ "
\n",
+ " \n",
+ " kurt@demandprogress.org | \n",
+ " 412792 | \n",
+ " 290372 | \n",
+ " 1.0000 | \n",
+ "
\n",
+ " \n",
+ " fccfreedom@hmamail.com | \n",
+ " 207007 | \n",
+ " 122252 | \n",
+ " 1.0000 | \n",
+ "
\n",
+ " \n",
+ " advocacy@mozilla.com | \n",
+ " 82926 | \n",
+ " 0 | \n",
+ " 0.0000 | \n",
+ "
\n",
+ " \n",
+ " action@aclu.org | \n",
+ " 48733 | \n",
+ " 0 | \n",
+ " 0.0000 | \n",
+ "
\n",
+ " \n",
+ " meaghan@mandatemedia.com | \n",
+ " 17317 | \n",
+ " 16267 | \n",
+ " 1.0000 | \n",
+ "
\n",
+ " \n",
+ " ncatalano@ofa.us | \n",
+ " 12230 | \n",
+ " 12230 | \n",
+ " 1.0000 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " submissions unique_emails prop_with_email\n",
+ "uploader \n",
+ "esmisc@mac.com 4347979 3966016 1.0000\n",
+ "shane@mediabridgellc.com 1856553 1501145 1.0000\n",
+ "mike@fightforthefuture.org 1464423 129682 0.2464\n",
+ "karen@momsrising.org 1069368 17870 0.0362\n",
+ "dutch@freepress.net 528607 3 0.0000\n",
+ "kurt@demandprogress.org 412792 290372 1.0000\n",
+ "fccfreedom@hmamail.com 207007 122252 1.0000\n",
+ "advocacy@mozilla.com 82926 0 0.0000\n",
+ "action@aclu.org 48733 0 0.0000\n",
+ "meaghan@mandatemedia.com 17317 16267 1.0000\n",
+ "ncatalano@ofa.us 12230 12230 1.0000"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "uploader_metrics = (\n",
+ " bulk_uploads_17_108\n",
+ " .assign(\n",
+ " prop_with_email = lambda df: df[\"email_address\"].notnull()\n",
+ " )\n",
+ " .groupby(\"uploader\")\n",
+ " .pipe(lambda grp: pd.DataFrame({\n",
+ " \"submissions\": grp.size(),\n",
+ " \"unique_emails\": grp[\"email_address\"].nunique(), \n",
+ " \"prop_with_email\": grp[\"prop_with_email\"].mean().round(4),\n",
+ " }))\n",
+ ")\n",
+ " \n",
+ "(\n",
+ " uploader_metrics\n",
+ " .sort_values(\"submissions\", ascending = False)\n",
+ " .loc[lambda df: df[\"submissions\"] >= 10000]\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Docket 17-108, all comments"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id_submission | \n",
+ " date | \n",
+ " email_address_nonstandard | \n",
+ " email_address | \n",
+ " email_domain | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 04272972619149 | \n",
+ " 2017-04-27 | \n",
+ " 0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 0427547924954 | \n",
+ " 2017-04-27 | \n",
+ " 0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 10427918117987 | \n",
+ " 2017-04-27 | \n",
+ " 0 | \n",
+ " 1f4830aa-726c-4206-9bef-cb3f2a57bb20 | \n",
+ " gmail.com | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 10427080530667 | \n",
+ " 2017-04-27 | \n",
+ " 0 | \n",
+ " f10d9c2b-2c98-44c2-9c7a-fe57b96930d8 | \n",
+ " gmail.com | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 1042709110034 | \n",
+ " 2017-04-27 | \n",
+ " 0 | \n",
+ " a6609a29-4b4c-4857-9e42-a886f61b8aaa | \n",
+ " d6b158e4-d116-4944-ab88-73091f1fc465 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id_submission date email_address_nonstandard \\\n",
+ "0 04272972619149 2017-04-27 0 \n",
+ "1 0427547924954 2017-04-27 0 \n",
+ "2 10427918117987 2017-04-27 0 \n",
+ "3 10427080530667 2017-04-27 0 \n",
+ "4 1042709110034 2017-04-27 0 \n",
+ "\n",
+ " email_address email_domain \n",
+ "0 NaN NaN \n",
+ "1 NaN NaN \n",
+ "2 1f4830aa-726c-4206-9bef-cb3f2a57bb20 gmail.com \n",
+ "3 f10d9c2b-2c98-44c2-9c7a-fe57b96930d8 gmail.com \n",
+ "4 a6609a29-4b4c-4857-9e42-a886f61b8aaa d6b158e4-d116-4944-ab88-73091f1fc465 "
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "comments_17_108 = (\n",
+ " load_comments(BASE_PATH + \"comments-17-108-with-uuids.csv\")\n",
+ " .assign(date = lambda df: df[\"date\"].str.slice(0, 10))\n",
+ ")\n",
+ "\n",
+ "comments_17_108.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Examine email domains attributed to these comments"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " count | \n",
+ " unique_addresses | \n",
+ "
\n",
+ " \n",
+ " email_domain | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " gmail.com | \n",
+ " 5000687 | \n",
+ " 4160788 | \n",
+ "
\n",
+ " \n",
+ " yahoo.com | \n",
+ " 2536892 | \n",
+ " 2126544 | \n",
+ "
\n",
+ " \n",
+ " hotmail.com | \n",
+ " 673018 | \n",
+ " 571156 | \n",
+ "
\n",
+ " \n",
+ " aol.com | \n",
+ " 632971 | \n",
+ " 508087 | \n",
+ "
\n",
+ " \n",
+ " pornhub.com | \n",
+ " 1030003 | \n",
+ " 233516 | \n",
+ "
\n",
+ " \n",
+ " comcast.net | \n",
+ " 208512 | \n",
+ " 158939 | \n",
+ "
\n",
+ " \n",
+ " icloud.com | \n",
+ " 106442 | \n",
+ " 91091 | \n",
+ "
\n",
+ " \n",
+ " msn.com | \n",
+ " 110056 | \n",
+ " 89398 | \n",
+ "
\n",
+ " \n",
+ " hurra.de | \n",
+ " 363357 | \n",
+ " 88571 | \n",
+ "
\n",
+ " \n",
+ " outlook.com | \n",
+ " 79411 | \n",
+ " 67890 | \n",
+ "
\n",
+ " \n",
+ " att.net | \n",
+ " 79823 | \n",
+ " 60640 | \n",
+ "
\n",
+ " \n",
+ " live.com | \n",
+ " 70139 | \n",
+ " 59210 | \n",
+ "
\n",
+ " \n",
+ " sbcglobal.net | \n",
+ " 70126 | \n",
+ " 51206 | \n",
+ "
\n",
+ " \n",
+ " yahoo.fr | \n",
+ " 93389 | \n",
+ " 48034 | \n",
+ "
\n",
+ " \n",
+ " ymail.com | \n",
+ " 45036 | \n",
+ " 37515 | \n",
+ "
\n",
+ " \n",
+ " bellsouth.net | \n",
+ " 40432 | \n",
+ " 32155 | \n",
+ "
\n",
+ " \n",
+ " cox.net | \n",
+ " 40137 | \n",
+ " 31260 | \n",
+ "
\n",
+ " \n",
+ " verizon.net | \n",
+ " 41933 | \n",
+ " 29236 | \n",
+ "
\n",
+ " \n",
+ " yahoo.de | \n",
+ " 97977 | \n",
+ " 28310 | \n",
+ "
\n",
+ " \n",
+ " mail.ru | \n",
+ " 60608 | \n",
+ " 24570 | \n",
+ "
\n",
+ " \n",
+ " me.com | \n",
+ " 26000 | \n",
+ " 19559 | \n",
+ "
\n",
+ " \n",
+ " charter.net | \n",
+ " 24425 | \n",
+ " 18487 | \n",
+ "
\n",
+ " \n",
+ " einrot.com | \n",
+ " 793148 | \n",
+ " 17091 | \n",
+ "
\n",
+ " \n",
+ " gustr.com | \n",
+ " 769010 | \n",
+ " 16813 | \n",
+ "
\n",
+ " \n",
+ " rhyta.com | \n",
+ " 773757 | \n",
+ " 16756 | \n",
+ "
\n",
+ " \n",
+ " jourrapide.com | \n",
+ " 782650 | \n",
+ " 16746 | \n",
+ "
\n",
+ " \n",
+ " armyspy.com | \n",
+ " 780664 | \n",
+ " 16741 | \n",
+ "
\n",
+ " \n",
+ " dayrep.com | \n",
+ " 770023 | \n",
+ " 16733 | \n",
+ "
\n",
+ " \n",
+ " superrito.com | \n",
+ " 767495 | \n",
+ " 16684 | \n",
+ "
\n",
+ " \n",
+ " teleworm.us | \n",
+ " 765488 | \n",
+ " 16673 | \n",
+ "
\n",
+ " \n",
+ " cuvox.de | \n",
+ " 775904 | \n",
+ " 16623 | \n",
+ "
\n",
+ " \n",
+ " fleckens.hu | \n",
+ " 776092 | \n",
+ " 16600 | \n",
+ "
\n",
+ " \n",
+ " mail.com | \n",
+ " 16392 | \n",
+ " 14657 | \n",
+ "
\n",
+ " \n",
+ " rocketmail.com | \n",
+ " 17112 | \n",
+ " 14266 | \n",
+ "
\n",
+ " \n",
+ " windstream.net | \n",
+ " 13496 | \n",
+ " 11107 | \n",
+ "
\n",
+ " \n",
+ " earthlink.net | \n",
+ " 18068 | \n",
+ " 11088 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " count unique_addresses\n",
+ "email_domain \n",
+ "gmail.com 5000687 4160788\n",
+ "yahoo.com 2536892 2126544\n",
+ "hotmail.com 673018 571156\n",
+ "aol.com 632971 508087\n",
+ "pornhub.com 1030003 233516\n",
+ "comcast.net 208512 158939\n",
+ "icloud.com 106442 91091\n",
+ "msn.com 110056 89398\n",
+ "hurra.de 363357 88571\n",
+ "outlook.com 79411 67890\n",
+ "att.net 79823 60640\n",
+ "live.com 70139 59210\n",
+ "sbcglobal.net 70126 51206\n",
+ "yahoo.fr 93389 48034\n",
+ "ymail.com 45036 37515\n",
+ "bellsouth.net 40432 32155\n",
+ "cox.net 40137 31260\n",
+ "verizon.net 41933 29236\n",
+ "yahoo.de 97977 28310\n",
+ "mail.ru 60608 24570\n",
+ "me.com 26000 19559\n",
+ "charter.net 24425 18487\n",
+ "einrot.com 793148 17091\n",
+ "gustr.com 769010 16813\n",
+ "rhyta.com 773757 16756\n",
+ "jourrapide.com 782650 16746\n",
+ "armyspy.com 780664 16741\n",
+ "dayrep.com 770023 16733\n",
+ "superrito.com 767495 16684\n",
+ "teleworm.us 765488 16673\n",
+ "cuvox.de 775904 16623\n",
+ "fleckens.hu 776092 16600\n",
+ "mail.com 16392 14657\n",
+ "rocketmail.com 17112 14266\n",
+ "windstream.net 13496 11107\n",
+ "earthlink.net 18068 11088"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "email_domains_17_108 = (\n",
+ " comments_17_108\n",
+ " .groupby([ \"email_domain\" ])\n",
+ " .pipe(lambda grp: pd.DataFrame({\n",
+ " \"count\": grp.size(),\n",
+ " \"unique_addresses\": grp[\"email_address\"].nunique()\n",
+ " }))\n",
+ " .sort_values([ \"count\", \"unique_addresses\" ], ascending = False)\n",
+ ")\n",
+ "\n",
+ "(\n",
+ " email_domains_17_108\n",
+ " .loc[lambda df: df[\"unique_addresses\"] >= 10000]\n",
+ " .sort_values(\"unique_addresses\", ascending = False)\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Here we count the comments and unique email addresses associated with FakeMailGenerator.com:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "FAKEMAIL_DOMAINS = [\n",
+ " \"einrot.com\",\n",
+ " \"jourrapide.com\",\n",
+ " \"armyspy.com\",\n",
+ " \"fleckens.hu\",\n",
+ " \"cuvox.de\",\n",
+ " \"rhyta.com\",\n",
+ " \"dayrep.com\",\n",
+ " \"gustr.com\",\n",
+ " \"superrito.com\",\n",
+ " \"teleworm.us\",\n",
+ "]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " count | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " count | \n",
+ " 7754231 | \n",
+ "
\n",
+ " \n",
+ " unique_addresses | \n",
+ " 167460 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " count\n",
+ "count 7754231\n",
+ "unique_addresses 167460"
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "(\n",
+ " email_domains_17_108\n",
+ " .loc[FAKEMAIL_DOMAINS]\n",
+ " [[\"count\", \"unique_addresses\"]]\n",
+ " .sum()\n",
+ " .to_frame(\"count\")\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Docket 16-42"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " date | \n",
+ " id_submission | \n",
+ " comments | \n",
+ " email_address_nonstandard | \n",
+ " email_address | \n",
+ " email_domain | \n",
+ " name_and_location | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 2016-02-19 | \n",
+ " 60001483702 | \n",
+ " 60001515146.txtThank you! Very pleased to see... | \n",
+ " 0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 8ad10c4e-1354-42ba-83f1-be6b3c89f331 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 2016-02-22 | \n",
+ " 60001484317 | \n",
+ " 60001843102.txt[5/23/2016 7:55:30 PM]The excha... | \n",
+ " 0 | \n",
+ " 310d8308-43a0-4b84-93dc-6662acdef829 | \n",
+ " gmail.com | \n",
+ " d984ccab-11bb-4994-bcfe-f0d407fd03b5 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 2016-02-25 | \n",
+ " 60001486876 | \n",
+ " 60001518518.txtPlease eliminate the cable TV b... | \n",
+ " 0 | \n",
+ " 7e6087df-a7a2-414f-8ebb-3be229805bec | \n",
+ " yahoo.com | \n",
+ " 298bb4d9-8130-4ced-86e9-6a5d0c740c66 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 2016-02-27 | \n",
+ " 60001489444 | \n",
+ " I?support?the?FCC?allowing?homeowners?to?be?fr... | \n",
+ " 0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 12614861-1a8f-4313-aeff-2366bcf18ca8 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 2016-02-29 | \n",
+ " 60001492083 | \n",
+ " 60001523826.txtAs a consumer, I agree with the... | \n",
+ " 0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 10268573-9386-42c7-ab31-4d76641e76ed | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " date id_submission \\\n",
+ "0 2016-02-19 60001483702 \n",
+ "1 2016-02-22 60001484317 \n",
+ "2 2016-02-25 60001486876 \n",
+ "3 2016-02-27 60001489444 \n",
+ "4 2016-02-29 60001492083 \n",
+ "\n",
+ " comments \\\n",
+ "0 60001515146.txtThank you! Very pleased to see... \n",
+ "1 60001843102.txt[5/23/2016 7:55:30 PM]The excha... \n",
+ "2 60001518518.txtPlease eliminate the cable TV b... \n",
+ "3 I?support?the?FCC?allowing?homeowners?to?be?fr... \n",
+ "4 60001523826.txtAs a consumer, I agree with the... \n",
+ "\n",
+ " email_address_nonstandard email_address \\\n",
+ "0 0 NaN \n",
+ "1 0 310d8308-43a0-4b84-93dc-6662acdef829 \n",
+ "2 0 7e6087df-a7a2-414f-8ebb-3be229805bec \n",
+ "3 0 NaN \n",
+ "4 0 NaN \n",
+ "\n",
+ " email_domain name_and_location \n",
+ "0 NaN 8ad10c4e-1354-42ba-83f1-be6b3c89f331 \n",
+ "1 gmail.com d984ccab-11bb-4994-bcfe-f0d407fd03b5 \n",
+ "2 yahoo.com 298bb4d9-8130-4ced-86e9-6a5d0c740c66 \n",
+ "3 NaN 12614861-1a8f-4313-aeff-2366bcf18ca8 \n",
+ "4 NaN 10268573-9386-42c7-ab31-4d76641e76ed "
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "comments_16_42 = (\n",
+ " load_comments(BASE_PATH + \"comments-16-42-with-uuids.csv\")\n",
+ " .assign(date = lambda df: df[\"date\"].str.slice(0, 10))\n",
+ ") \n",
+ "\n",
+ "comments_16_42.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Docket 14-28"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " date | \n",
+ " id_submission | \n",
+ " comments | \n",
+ " email_address_nonstandard | \n",
+ " email_address | \n",
+ " email_domain | \n",
+ " name_and_location | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 2014-02-21 | \n",
+ " 6017589853 | \n",
+ " 7521074305.txt Reclassify The Internet As A Co... | \n",
+ " 0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " a0fad65b-1482-427d-b300-da8e63d14272 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 2014-02-21 | \n",
+ " 6017589866 | \n",
+ " 7521074318.txt Reclassify The Internet As A Co... | \n",
+ " 0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 45954c7c-d52d-48f0-a252-343b4f82e509 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 2014-02-21 | \n",
+ " 6017589903 | \n",
+ " 7521074355.txt Reclassify The Internet As A Co... | \n",
+ " 0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 0a1dea8a-3ae6-434f-be03-4b6447c3190c | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 2014-02-21 | \n",
+ " 6017589904 | \n",
+ " 7521074356.txt Reclassify The Internet As A Co... | \n",
+ " 0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 5cb3b14f-71a4-4763-8bd1-40ad440c5eb8 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 2014-02-21 | \n",
+ " 6017589924 | \n",
+ " 7521074376.txt Reclassify The Internet As A Co... | \n",
+ " 0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 0de47b78-a256-4e36-93ad-1f4830b07c48 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " date id_submission \\\n",
+ "0 2014-02-21 6017589853 \n",
+ "1 2014-02-21 6017589866 \n",
+ "2 2014-02-21 6017589903 \n",
+ "3 2014-02-21 6017589904 \n",
+ "4 2014-02-21 6017589924 \n",
+ "\n",
+ " comments \\\n",
+ "0 7521074305.txt Reclassify The Internet As A Co... \n",
+ "1 7521074318.txt Reclassify The Internet As A Co... \n",
+ "2 7521074355.txt Reclassify The Internet As A Co... \n",
+ "3 7521074356.txt Reclassify The Internet As A Co... \n",
+ "4 7521074376.txt Reclassify The Internet As A Co... \n",
+ "\n",
+ " email_address_nonstandard email_address email_domain \\\n",
+ "0 0 NaN NaN \n",
+ "1 0 NaN NaN \n",
+ "2 0 NaN NaN \n",
+ "3 0 NaN NaN \n",
+ "4 0 NaN NaN \n",
+ "\n",
+ " name_and_location \n",
+ "0 a0fad65b-1482-427d-b300-da8e63d14272 \n",
+ "1 45954c7c-d52d-48f0-a252-343b4f82e509 \n",
+ "2 0a1dea8a-3ae6-434f-be03-4b6447c3190c \n",
+ "3 5cb3b14f-71a4-4763-8bd1-40ad440c5eb8 \n",
+ "4 0de47b78-a256-4e36-93ad-1f4830b07c48 "
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "comments_14_28 = (\n",
+ " load_comments(BASE_PATH + \"comments-14-28-with-uuids.csv\")\n",
+ " .assign(date = lambda df: df[\"date\"].str.slice(0, 10))\n",
+ ") \n",
+ "\n",
+ "comments_14_28.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Classify comments\n",
+ "\n",
+ "In this step, we create derivative dataframes that classify each comment based on the language used in it. (Note: Because the formatting of comments can be inconsistent, the classification approach ignores whitespace.)\n",
+ "\n",
+ "The classifier takes a series of texts and a series of patterns to look for. Each text is labeled based on the __first__ pattern it matches, based on the sequential order of the patterns; if the text matches no pattern, it is labeled `[other]`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def classify(texts, patterns):\n",
+ " # Create a copy of the texts and remove whitespace\n",
+ " s = texts.copy().str.replace(r\"\\s+\", \"\")\n",
+ " \n",
+ " # Remove whitespace from classification patterns\n",
+ " without_whitespace = [ (re.sub(r\"\\s+\", \"\", pat), val)\n",
+ " for pat, val in patterns ]\n",
+ " \n",
+ " # An empty series, indexed identically to the original texts.\n",
+ " ix = pd.Series(None, index = texts.index)\n",
+ "\n",
+ " # As we progress through the matching, we will gradually\n",
+ " # fill `ix` in with the matches we've found.\n",
+ " \n",
+ " # Iterate through the classification patterns\n",
+ " for pat, val in without_whitespace:\n",
+ " # Determine which texts match\n",
+ " search_result = s.str.contains(pat, na = False)\n",
+ " matches = search_result.loc[lambda x: x == True]\n",
+ " \n",
+ " # For matches, update `ix` to indicate the pattern ID/description\n",
+ " ix.loc[matches.index] = val\n",
+ " \n",
+ " # Subset `s` so that it only contains unmatched texts\n",
+ " s = s.loc[s.index.difference(matches.index)]\n",
+ "\n",
+ " return ix.fillna(\"[other]\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def add_classification(df, patterns):\n",
+ " return (\n",
+ " df\n",
+ " .assign(group = lambda df: (\n",
+ " df[\"comments\"]\n",
+ " .pipe(classify, patterns)\n",
+ " ))\n",
+ " )"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def print_example_comments(df, n = 3, max_chars = 500):\n",
+ " for grp, subdf in df.groupby(\"group\"):\n",
+ " print(f\"=== {grp} ===\\n\")\n",
+ " \n",
+ " examples = (\n",
+ " subdf[\"comments\"]\n",
+ " .sample(n, random_state = RANDOM_STATE)\n",
+ " .pipe(lambda x: pd.np.where(\n",
+ " x.apply(len) > max_chars,\n",
+ " x.str.slice(0, max_chars) + \"[...]\",\n",
+ " x\n",
+ " ))\n",
+ " )\n",
+ " \n",
+ " print(\"\\n\\n\".join(examples) + \"\\n\\n\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Docket 14-28\n",
+ "\n",
+ "BuzzFeed News identified the phrases below based on extensive examination of the 14-28 docket, and by cross-referencing them with [this December 2014 Sunlight Foundation analysis](http://web.archive.org/web/20150301070951/http://sunlightfoundation.com/blog/2014/12/16/one-group-dominates-the-second-round-of-net-neutrality-comments/).\n",
+ "\n",
+ "The `AC-` comments use language from American Commitment's comment campaign. It is possible that entities other than American Commitment submitted comments that used the same language. \n",
+ "\n",
+ "Note: The final phrase in the list below also appears alongside some of the other permutations; but because it is the final phrase in the list, only comments that don't match the other phrasings receive this classification.\n",
+ "\n",
+ "Please see this repository's landing page, and the associated BuzzFeed News article, for additional context. (E.g., not all comments are individually retrievable from the FCC's public portal.)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "ac_patterns_main = [\n",
+ " # Earlier set of comments\n",
+ " \"The federal government can use their power over the internet to direct content\", \n",
+ " \"because of high barriers to entry and a disastrous lack of competition\", \n",
+ " \"federal bureaucrats will slow down the process and protect prevailing interests first\", \n",
+ " \"blossoming in America today, largely due to the internet\", \n",
+ " \"Government will make it impossible for internet providers to upgrade service\", \n",
+ " \"government will naturally favor entrenched special interests, rather than upstart companies\", \n",
+ " \"will begin to be mismanaged, like many other government-run industries\", \n",
+ " \"put directly in the hands of bureaucrats, instead of the free market\", \n",
+ " \"will result in worse service, even as costs continue to skyrocket\", \n",
+ " \"Our options for cheap, high-speed, high-performing internet providers\", \n",
+ "\n",
+ " # Later set of comments\n",
+ " \"Left-wing extremists have been crying wolf\", \n",
+ " \"The federal government needs to keep its hands off the Internet\", \n",
+ " \"Before the FCC places regulatory handcuffs on Internet providers\", \n",
+ " \"The notion that the internet is broken and needs repair is simply not true\", \n",
+ " \"will send the crown jewel of the US economy into an economic tailspin\", \n",
+ " \"no longer acting in the interests of the American people\", \n",
+ " \"just another slow-moving government-controlled mess\", \n",
+ " \"defend ourselves against power-hungry bureaucrats\", \n",
+ " \"simply another attempt by the federal government to take control of another sector of the economy\", \n",
+ " \"Millions of liberal fools demanding you reduce the Internet\", \n",
+ " \"FCC is clearly ignoring the will of the American people\", \n",
+ " \"devastate private investment with the force of an atomic bomb\", \n",
+ " \"without being slowed by bureaucratic inertia\", \n",
+ " \"A small fringe of the extremist left has been demanding\", \n",
+ " \"a tiny minority of far-left political activists\", \n",
+ " \"ultimate goal is to get rid of the media capitalists\", \n",
+ " \"created economic and human wreckage in their wake\", \n",
+ " \"increase its own power at the expense of the free people\", \n",
+ " \"it will have proven itself to be an unaccountable agency\", \n",
+ " \"subjecting it to 1930s-style regulations meant for telephone monopolies\", \n",
+ " \"Government regulation of Internet services would chase investment\", \n",
+ " \"it will seriously degrade the Internet we have\", \n",
+ " \"it can and should suffer the consequences\", \n",
+ " \"taking such reckless actions to gain control over the Internet\", \n",
+ " \"simply is no evidence to back up the dire claims of disaster\", \n",
+ "]\n",
+ "\n",
+ "ac_patterns_other = [\n",
+ " \"Like many Americans, I believe that the internet should remain free of government\",\n",
+ " \"As an American citizen, I wanted to voice my opposition to the FCC\",\n",
+ "]\n",
+ "\n",
+ "ac_pattern_desciptions = (\n",
+ " [ (p, f\"AC-{i:02d}\") for i, p in enumerate(ac_patterns_main) ] +\n",
+ " [ (p, f\"AC-other\") for p in ac_patterns_other ]\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Notes:\n",
+ "\n",
+ "- The `AC-XX` classification names below are based simply on the order in which they appear above. The numbers have no independent meaning.\n",
+ "\n",
+ "- The `AC-other` classification indicates that key language (the two phrases in `ac_patterns_other` above) from American Commitment appears in the comment, but not any of the other phrases."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "AC-00 1261\n",
+ "AC-01 1233\n",
+ "AC-02 1246\n",
+ "AC-03 1232\n",
+ "AC-04 1269\n",
+ "AC-05 1208\n",
+ "AC-06 1210\n",
+ "AC-07 1207\n",
+ "AC-08 1202\n",
+ "AC-09 1186\n",
+ "AC-10 25801\n",
+ "AC-11 25781\n",
+ "AC-12 25951\n",
+ "AC-13 26012\n",
+ "AC-14 25667\n",
+ "AC-15 25879\n",
+ "AC-16 25727\n",
+ "AC-17 26009\n",
+ "AC-18 25658\n",
+ "AC-19 25788\n",
+ "AC-20 25924\n",
+ "AC-21 25914\n",
+ "AC-22 25950\n",
+ "AC-23 25865\n",
+ "AC-24 25864\n",
+ "AC-25 25620\n",
+ "AC-26 26044\n",
+ "AC-27 25932\n",
+ "AC-28 25745\n",
+ "AC-29 26024\n",
+ "AC-30 25624\n",
+ "AC-31 25880\n",
+ "AC-32 25691\n",
+ "AC-33 25615\n",
+ "AC-34 25836\n",
+ "AC-other 6\n",
+ "[other] 1396620\n",
+ "Name: group, dtype: int64"
+ ]
+ },
+ "execution_count": 17,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "comments_14_28_classified = (\n",
+ " comments_14_28\n",
+ " .pipe(\n",
+ " add_classification,\n",
+ " ac_pattern_desciptions\n",
+ " )\n",
+ ")\n",
+ "\n",
+ "(\n",
+ " comments_14_28_classified\n",
+ " [\"group\"]\n",
+ " .value_counts()\n",
+ " .sort_index()\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The total number of comments and unique email addresses for all `AC-`-classified comments above:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " count | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " comments | \n",
+ " 658061 | \n",
+ "
\n",
+ " \n",
+ " unique_email_addresses | \n",
+ " 551855 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " count\n",
+ "comments 658061\n",
+ "unique_email_addresses 551855"
+ ]
+ },
+ "execution_count": 18,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "(\n",
+ " comments_14_28_classified\n",
+ " .loc[lambda df: df[\"group\"] != \"[other]\"]\n",
+ " .pipe(lambda df: pd.Series({\n",
+ " \"comments\": len(df),\n",
+ " \"unique_email_addresses\": df[\"email_address\"].nunique()\n",
+ " }))\n",
+ " .to_frame(\"count\")\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Dates submitted\n",
+ "\n",
+ "The analysis below demonstrates the following:\n",
+ "\n",
+ "- Comments `AC-00`-`AC-09` share a similar distribution of dates submitted\n",
+ "- Comments `AC-10`-`AC-34` also share a similar distribution of dates submitted, but distinct from `AC-00`-`AC-09`\n",
+ "\n",
+ "Additional notes:\n",
+ "\n",
+ "- Dates below are `MM-DD`, for 2014\n",
+ "\n",
+ "- Dates include only those with at least 200 total `AC-` classified comments (overall), to reduce noise of stray dates that contain relatively few matching comments"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " date | \n",
+ " 07-14 | \n",
+ " 07-16 | \n",
+ " 07-17 | \n",
+ " 09-11 | \n",
+ " 09-12 | \n",
+ " 09-13 | \n",
+ " 09-14 | \n",
+ " 09-15 | \n",
+ " 09-16 | \n",
+ " 09-17 | \n",
+ " 09-18 | \n",
+ " 09-19 | \n",
+ " 09-22 | \n",
+ " 09-23 | \n",
+ " 09-24 | \n",
+ "
\n",
+ " \n",
+ " group | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " AC-00 | \n",
+ " 45 | \n",
+ " 909 | \n",
+ " 305 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " AC-01 | \n",
+ " 39 | \n",
+ " 905 | \n",
+ " 287 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " AC-02 | \n",
+ " 52 | \n",
+ " 888 | \n",
+ " 304 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " AC-03 | \n",
+ " 45 | \n",
+ " 893 | \n",
+ " 293 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " AC-04 | \n",
+ " 80 | \n",
+ " 902 | \n",
+ " 284 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " AC-05 | \n",
+ " 36 | \n",
+ " 872 | \n",
+ " 300 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " AC-06 | \n",
+ " 47 | \n",
+ " 906 | \n",
+ " 257 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " AC-07 | \n",
+ " 48 | \n",
+ " 859 | \n",
+ " 299 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " AC-08 | \n",
+ " 45 | \n",
+ " 854 | \n",
+ " 303 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " AC-09 | \n",
+ " 44 | \n",
+ " 878 | \n",
+ " 263 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " AC-10 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 2478 | \n",
+ " 3896 | \n",
+ " 4591 | \n",
+ " 3510 | \n",
+ " 6458 | \n",
+ " 2167 | \n",
+ " 2143 | \n",
+ " 225 | \n",
+ " 63 | \n",
+ " 112 | \n",
+ " 80 | \n",
+ " 11 | \n",
+ "
\n",
+ " \n",
+ " AC-11 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 2483 | \n",
+ " 3911 | \n",
+ " 4613 | \n",
+ " 3574 | \n",
+ " 6365 | \n",
+ " 2204 | \n",
+ " 2097 | \n",
+ " 231 | \n",
+ " 74 | \n",
+ " 118 | \n",
+ " 75 | \n",
+ " 12 | \n",
+ "
\n",
+ " \n",
+ " AC-12 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 2481 | \n",
+ " 3897 | \n",
+ " 4577 | \n",
+ " 3550 | \n",
+ " 6527 | \n",
+ " 2207 | \n",
+ " 2197 | \n",
+ " 224 | \n",
+ " 76 | \n",
+ " 112 | \n",
+ " 79 | \n",
+ " 12 | \n",
+ "
\n",
+ " \n",
+ " AC-13 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 2488 | \n",
+ " 3909 | \n",
+ " 4604 | \n",
+ " 3605 | \n",
+ " 6438 | \n",
+ " 2252 | \n",
+ " 2180 | \n",
+ " 241 | \n",
+ " 72 | \n",
+ " 123 | \n",
+ " 82 | \n",
+ " 12 | \n",
+ "
\n",
+ " \n",
+ " AC-14 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 2470 | \n",
+ " 3911 | \n",
+ " 4528 | \n",
+ " 3541 | \n",
+ " 6278 | \n",
+ " 2234 | \n",
+ " 2168 | \n",
+ " 256 | \n",
+ " 71 | \n",
+ " 119 | \n",
+ " 75 | \n",
+ " 11 | \n",
+ "
\n",
+ " \n",
+ " AC-15 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 2470 | \n",
+ " 3893 | \n",
+ " 4572 | \n",
+ " 3540 | \n",
+ " 6451 | \n",
+ " 2269 | \n",
+ " 2144 | \n",
+ " 254 | \n",
+ " 75 | \n",
+ " 117 | \n",
+ " 77 | \n",
+ " 11 | \n",
+ "
\n",
+ " \n",
+ " AC-16 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 2477 | \n",
+ " 3917 | \n",
+ " 4562 | \n",
+ " 3541 | \n",
+ " 6385 | \n",
+ " 2196 | \n",
+ " 2144 | \n",
+ " 231 | \n",
+ " 69 | \n",
+ " 108 | \n",
+ " 83 | \n",
+ " 12 | \n",
+ "
\n",
+ " \n",
+ " AC-17 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 2496 | \n",
+ " 3907 | \n",
+ " 4509 | \n",
+ " 3558 | \n",
+ " 6511 | \n",
+ " 2249 | \n",
+ " 2290 | \n",
+ " 231 | \n",
+ " 64 | \n",
+ " 105 | \n",
+ " 74 | \n",
+ " 11 | \n",
+ "
\n",
+ " \n",
+ " AC-18 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 2498 | \n",
+ " 3903 | \n",
+ " 4569 | \n",
+ " 3584 | \n",
+ " 6352 | \n",
+ " 2190 | \n",
+ " 2067 | \n",
+ " 229 | \n",
+ " 55 | \n",
+ " 117 | \n",
+ " 80 | \n",
+ " 12 | \n",
+ "
\n",
+ " \n",
+ " AC-19 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 2475 | \n",
+ " 3924 | \n",
+ " 4525 | \n",
+ " 3491 | \n",
+ " 6523 | \n",
+ " 2183 | \n",
+ " 2165 | \n",
+ " 258 | \n",
+ " 58 | \n",
+ " 94 | \n",
+ " 77 | \n",
+ " 14 | \n",
+ "
\n",
+ " \n",
+ " AC-20 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 2481 | \n",
+ " 3910 | \n",
+ " 4568 | \n",
+ " 3572 | \n",
+ " 6562 | \n",
+ " 2153 | \n",
+ " 2150 | \n",
+ " 236 | \n",
+ " 86 | \n",
+ " 113 | \n",
+ " 79 | \n",
+ " 12 | \n",
+ "
\n",
+ " \n",
+ " AC-21 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 2476 | \n",
+ " 3900 | \n",
+ " 4573 | \n",
+ " 3463 | \n",
+ " 6493 | \n",
+ " 2297 | \n",
+ " 2188 | \n",
+ " 239 | \n",
+ " 75 | \n",
+ " 113 | \n",
+ " 83 | \n",
+ " 12 | \n",
+ "
\n",
+ " \n",
+ " AC-22 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 2486 | \n",
+ " 3886 | \n",
+ " 4557 | \n",
+ " 3522 | \n",
+ " 6507 | \n",
+ " 2225 | \n",
+ " 2240 | \n",
+ " 241 | \n",
+ " 82 | \n",
+ " 110 | \n",
+ " 81 | \n",
+ " 13 | \n",
+ "
\n",
+ " \n",
+ " AC-23 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 2484 | \n",
+ " 3917 | \n",
+ " 4612 | \n",
+ " 3492 | \n",
+ " 6441 | \n",
+ " 2171 | \n",
+ " 2246 | \n",
+ " 235 | \n",
+ " 68 | \n",
+ " 108 | \n",
+ " 77 | \n",
+ " 12 | \n",
+ "
\n",
+ " \n",
+ " AC-24 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 2469 | \n",
+ " 3908 | \n",
+ " 4637 | \n",
+ " 3511 | \n",
+ " 6507 | \n",
+ " 2175 | \n",
+ " 2169 | \n",
+ " 222 | \n",
+ " 75 | \n",
+ " 100 | \n",
+ " 78 | \n",
+ " 12 | \n",
+ "
\n",
+ " \n",
+ " AC-25 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 2472 | \n",
+ " 3900 | \n",
+ " 4562 | \n",
+ " 3482 | \n",
+ " 6426 | \n",
+ " 2099 | \n",
+ " 2191 | \n",
+ " 240 | \n",
+ " 64 | \n",
+ " 95 | \n",
+ " 78 | \n",
+ " 11 | \n",
+ "
\n",
+ " \n",
+ " AC-26 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 2491 | \n",
+ " 3916 | \n",
+ " 4581 | \n",
+ " 3575 | \n",
+ " 6462 | \n",
+ " 2257 | \n",
+ " 2210 | \n",
+ " 248 | \n",
+ " 82 | \n",
+ " 131 | \n",
+ " 78 | \n",
+ " 13 | \n",
+ "
\n",
+ " \n",
+ " AC-27 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 2483 | \n",
+ " 3896 | \n",
+ " 4599 | \n",
+ " 3630 | \n",
+ " 6440 | \n",
+ " 2195 | \n",
+ " 2192 | \n",
+ " 241 | \n",
+ " 70 | \n",
+ " 95 | \n",
+ " 76 | \n",
+ " 13 | \n",
+ "
\n",
+ " \n",
+ " AC-28 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 2475 | \n",
+ " 3914 | \n",
+ " 4566 | \n",
+ " 3542 | \n",
+ " 6366 | \n",
+ " 2233 | \n",
+ " 2140 | \n",
+ " 233 | \n",
+ " 62 | \n",
+ " 123 | \n",
+ " 75 | \n",
+ " 11 | \n",
+ "
\n",
+ " \n",
+ " AC-29 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 2477 | \n",
+ " 3896 | \n",
+ " 4651 | \n",
+ " 3489 | \n",
+ " 6560 | \n",
+ " 2213 | \n",
+ " 2228 | \n",
+ " 244 | \n",
+ " 64 | \n",
+ " 112 | \n",
+ " 77 | \n",
+ " 12 | \n",
+ "
\n",
+ " \n",
+ " AC-30 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 2501 | \n",
+ " 3896 | \n",
+ " 4585 | \n",
+ " 3554 | \n",
+ " 6332 | \n",
+ " 2184 | \n",
+ " 2100 | \n",
+ " 220 | \n",
+ " 56 | \n",
+ " 101 | \n",
+ " 78 | \n",
+ " 13 | \n",
+ "
\n",
+ " \n",
+ " AC-31 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 2490 | \n",
+ " 3907 | \n",
+ " 4617 | \n",
+ " 3526 | \n",
+ " 6434 | \n",
+ " 2207 | \n",
+ " 2179 | \n",
+ " 245 | \n",
+ " 71 | \n",
+ " 107 | \n",
+ " 83 | \n",
+ " 13 | \n",
+ "
\n",
+ " \n",
+ " AC-32 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 2477 | \n",
+ " 3896 | \n",
+ " 4655 | \n",
+ " 3462 | \n",
+ " 6367 | \n",
+ " 2175 | \n",
+ " 2161 | \n",
+ " 233 | \n",
+ " 56 | \n",
+ " 119 | \n",
+ " 78 | \n",
+ " 11 | \n",
+ "
\n",
+ " \n",
+ " AC-33 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 2483 | \n",
+ " 3900 | \n",
+ " 4544 | \n",
+ " 3494 | \n",
+ " 6351 | \n",
+ " 2205 | \n",
+ " 2161 | \n",
+ " 218 | \n",
+ " 68 | \n",
+ " 98 | \n",
+ " 79 | \n",
+ " 12 | \n",
+ "
\n",
+ " \n",
+ " AC-34 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 2477 | \n",
+ " 3886 | \n",
+ " 4588 | \n",
+ " 3572 | \n",
+ " 6441 | \n",
+ " 2182 | \n",
+ " 2173 | \n",
+ " 243 | \n",
+ " 74 | \n",
+ " 107 | \n",
+ " 81 | \n",
+ " 12 | \n",
+ "
\n",
+ " \n",
+ " AC-other | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ "date 07-14 07-16 07-17 09-11 09-12 09-13 09-14 09-15 09-16 \\\n",
+ "group \n",
+ "AC-00 45 909 305 0 0 0 0 0 0 \n",
+ "AC-01 39 905 287 0 0 0 0 0 0 \n",
+ "AC-02 52 888 304 0 0 0 0 0 0 \n",
+ "AC-03 45 893 293 0 0 0 0 0 0 \n",
+ "AC-04 80 902 284 0 0 0 0 0 0 \n",
+ "AC-05 36 872 300 0 0 0 0 0 0 \n",
+ "AC-06 47 906 257 0 0 0 0 0 0 \n",
+ "AC-07 48 859 299 0 0 0 0 0 0 \n",
+ "AC-08 45 854 303 0 0 0 0 0 0 \n",
+ "AC-09 44 878 263 0 0 0 0 0 0 \n",
+ "AC-10 0 0 0 2478 3896 4591 3510 6458 2167 \n",
+ "AC-11 0 0 0 2483 3911 4613 3574 6365 2204 \n",
+ "AC-12 0 0 0 2481 3897 4577 3550 6527 2207 \n",
+ "AC-13 0 0 0 2488 3909 4604 3605 6438 2252 \n",
+ "AC-14 0 0 0 2470 3911 4528 3541 6278 2234 \n",
+ "AC-15 0 0 0 2470 3893 4572 3540 6451 2269 \n",
+ "AC-16 0 0 0 2477 3917 4562 3541 6385 2196 \n",
+ "AC-17 0 0 0 2496 3907 4509 3558 6511 2249 \n",
+ "AC-18 0 0 0 2498 3903 4569 3584 6352 2190 \n",
+ "AC-19 0 0 0 2475 3924 4525 3491 6523 2183 \n",
+ "AC-20 0 0 0 2481 3910 4568 3572 6562 2153 \n",
+ "AC-21 0 0 0 2476 3900 4573 3463 6493 2297 \n",
+ "AC-22 0 0 0 2486 3886 4557 3522 6507 2225 \n",
+ "AC-23 0 0 0 2484 3917 4612 3492 6441 2171 \n",
+ "AC-24 0 0 0 2469 3908 4637 3511 6507 2175 \n",
+ "AC-25 0 0 0 2472 3900 4562 3482 6426 2099 \n",
+ "AC-26 0 0 0 2491 3916 4581 3575 6462 2257 \n",
+ "AC-27 0 0 0 2483 3896 4599 3630 6440 2195 \n",
+ "AC-28 0 0 0 2475 3914 4566 3542 6366 2233 \n",
+ "AC-29 0 0 0 2477 3896 4651 3489 6560 2213 \n",
+ "AC-30 0 0 0 2501 3896 4585 3554 6332 2184 \n",
+ "AC-31 0 0 0 2490 3907 4617 3526 6434 2207 \n",
+ "AC-32 0 0 0 2477 3896 4655 3462 6367 2175 \n",
+ "AC-33 0 0 0 2483 3900 4544 3494 6351 2205 \n",
+ "AC-34 0 0 0 2477 3886 4588 3572 6441 2182 \n",
+ "AC-other 0 0 0 0 0 0 0 0 0 \n",
+ "\n",
+ "date 09-17 09-18 09-19 09-22 09-23 09-24 \n",
+ "group \n",
+ "AC-00 0 0 0 0 0 0 \n",
+ "AC-01 0 0 0 0 0 0 \n",
+ "AC-02 0 0 0 0 0 0 \n",
+ "AC-03 0 0 0 0 0 0 \n",
+ "AC-04 0 0 0 0 0 0 \n",
+ "AC-05 0 0 0 0 0 0 \n",
+ "AC-06 0 0 0 0 0 0 \n",
+ "AC-07 0 0 0 0 0 0 \n",
+ "AC-08 0 0 0 0 0 0 \n",
+ "AC-09 0 0 0 0 0 0 \n",
+ "AC-10 2143 225 63 112 80 11 \n",
+ "AC-11 2097 231 74 118 75 12 \n",
+ "AC-12 2197 224 76 112 79 12 \n",
+ "AC-13 2180 241 72 123 82 12 \n",
+ "AC-14 2168 256 71 119 75 11 \n",
+ "AC-15 2144 254 75 117 77 11 \n",
+ "AC-16 2144 231 69 108 83 12 \n",
+ "AC-17 2290 231 64 105 74 11 \n",
+ "AC-18 2067 229 55 117 80 12 \n",
+ "AC-19 2165 258 58 94 77 14 \n",
+ "AC-20 2150 236 86 113 79 12 \n",
+ "AC-21 2188 239 75 113 83 12 \n",
+ "AC-22 2240 241 82 110 81 13 \n",
+ "AC-23 2246 235 68 108 77 12 \n",
+ "AC-24 2169 222 75 100 78 12 \n",
+ "AC-25 2191 240 64 95 78 11 \n",
+ "AC-26 2210 248 82 131 78 13 \n",
+ "AC-27 2192 241 70 95 76 13 \n",
+ "AC-28 2140 233 62 123 75 11 \n",
+ "AC-29 2228 244 64 112 77 12 \n",
+ "AC-30 2100 220 56 101 78 13 \n",
+ "AC-31 2179 245 71 107 83 13 \n",
+ "AC-32 2161 233 56 119 78 11 \n",
+ "AC-33 2161 218 68 98 79 12 \n",
+ "AC-34 2173 243 74 107 81 12 \n",
+ "AC-other 0 0 0 0 0 0 "
+ ]
+ },
+ "execution_count": 19,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "(\n",
+ " comments_14_28_classified\n",
+ " .loc[lambda df: df[\"group\"] != \"[other]\"]\n",
+ " .assign(\n",
+ " date = lambda df: df[\"date\"].str.slice(5, 10)\n",
+ " )\n",
+ " .groupby([\"group\", \"date\"])\n",
+ " .size()\n",
+ " .unstack()\n",
+ " .fillna(0)\n",
+ " .astype(int)\n",
+ " .loc[:, lambda df: df.sum() >= 200]\n",
+ " \n",
+ " # Order columns by date\n",
+ " .pipe(lambda df: df[[c for c in sorted(df.columns)]])\n",
+ " \n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Example comments"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "metadata": {
+ "scrolled": false
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "=== AC-10 ===\n",
+ "\n",
+ "The Internet is not broken, and does not need to be fixed. Left-wing extremists have been crying wolf for the past decade about the harm to the Internet if the Federal government didn?t regulate it. Not only were they wrong, but the Internet has exploded with innovation. Do not regulate the Internet. The best way to keep it open and free is what has kept it open and free all along ? no government intervention.\n",
+ "\n",
+ "The Internet is not broken, and does not need to be fixed. Left-wing extremists have been crying wolf for the past decade about the harm to the Internet if the Federal government didn?t regulate it. Not only were they wrong, but the Internet has exploded with innovation. Do not regulate the Internet. The best way to keep it open and free is what has kept it open and free all along ? no government intervention.\n",
+ "\n",
+ "The Internet is not broken, and does not need to be fixed. Left-wing extremists have been crying wolf for the past decade about the harm to the Internet if the Federal government didn?t regulate it. Not only were they wrong, but the Internet has exploded with innovation. Do not regulate the Internet. The best way to keep it open and free is what has kept it open and free all along ? no government intervention.\n",
+ "\n",
+ "\n",
+ "=== AC-27 ===\n",
+ "\n",
+ "The American people are watching a Federal Communications Commission (FCC) that is not seeking to make the Internet better, but instead seeking to regulate it. The FCC could improve broadband delivery by auctioning off much-needed spectrum. Or it could get rid of some of its own burdensome rules that prevent companies from attracting investors and innovating. But instead of doing these things that would improve the Internet, the FCC is wasting its time in an obsessive drive to regulate. This [...]\n",
+ "\n",
+ "The American people are watching a Federal Communications Commission (FCC) that is not seeking to make the Internet better, but instead seeking to regulate it. The FCC could improve broadband delivery by auctioning off much-needed spectrum. Or it could get rid of some of its own burdensome rules that prevent companies from attracting investors and innovating. But instead of doing these things that would improve the Internet, the FCC is wasting its time in an obsessive drive to regulate. This[...]\n",
+ "\n",
+ "7522706506.txtThe American people are watching a Federal Communications Commission (FCC) that is not seeking to make the Internet better, but instead seeking to regulate it. The FCC could improve broadband delivery by auctioning off much-needed spectrum. Or it could get rid of some of its own burdensome rules that prevent companies from attracting investors and innovating. But instead of doing these things that would improve the Internet, the FCC is wasting its time in an obsessive drive to r[...]\n",
+ "\n",
+ "\n",
+ "=== [other] ===\n",
+ "\n",
+ "Dear Chairman Wheeler:We are writing to urge you to implement strong and unambiguous net neutrality rules that protect the Internet from discrimination and other practices that will impede its ability to serve our democracy, empower consumers, and fuel economic growth. Erecting toll booths or designating fast lanes on the information superhighway wouldstifle free speech, limit consumer choice, and thwart innovation. The FCC must act in a clear and decisive way to ensure the Internet does not be[...]\n",
+ "\n",
+ "Dear Chairman Wheeler:We are writing to urge you to implement strong and unambiguous net neutrality rules that protect the Internet from discrimination and other practices that will impede its ability to serve our democracy, empower consumers, and fuel economic growth. Erecting toll booths or designating fast lanes on the information superhighway wouldstifle free speech, limit consumer choice, and thwart innovation. The FCC must act in a clear and decisive way to ensure the Internet does not be[...]\n",
+ "\n",
+ "7522187451.txtDear Chairman Wheeler:We are writing to urge you to implement strong and unambiguous net neutrality rules that protect the Internet from discrimination and other practices that will impede its ability to serve our democracy, empower consumers, and fuel economic growth. Erecting toll booths or designating fast lanes on the information superhighway wouldstifle free speech, limit consumer choice, and thwart innovation. The FCC must act in a clear and decisive way to ensure the Intern[...]\n",
+ "\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "print_example_comments(\n",
+ " comments_14_28_classified\n",
+ " .loc[lambda df: df[\"group\"].isin([\"AC-10\", \"AC-27\", \"[other]\"])],\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Compare the above timing and language for group `AC-27` to the [comment attributed to Annie Reeves](https://www.fcc.gov/ecfs/filing/6019076835), received by the FCC on September 15, 2014:\n",
+ "\n",
+ "> The American people are watching a Federal Communications Commission (FCC) that is not seeking to make the Internet better, but instead seeking to regulate it. The FCC could improve broadband delivery by auctioning off much-needed spectrum. Or it could get rid of some of its own burdensome rules that prevent companies from attracting investors and innovating. But instead of doing these things that would improve the Internet, the FCC is wasting its time in an obsessive drive to regulate. This tells the American people that once again, a Washington agency is working in aself-interested way to increase its own power at the expense of the free people it is meant to serve."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Docket 16-42\n",
+ "\n",
+ "Here, we identify two very large sets of comments in this docket, by searching for the short phrases below. Please see the associated BuzzFeed News article for context.\n",
+ "\n",
+ "The \"American Commitment\" set of comments is labeled as such because it uses language from [that organization's comment campaign](http://web.archive.org/web/20160403182941/https://www.americancommitment.org/cablebox-petition). (The text of the comments appears to be generated algorithmically, selecting randomly from sets of pre-selected words and phrases, but the phrase used here for classification is static — it does not change across the comments.) To be sure, it is possible the comments were submitted by entities other than American Commitment, using the same language; the FCC's public portal does not specify who submitted these comments. "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'over $200' 104816\n",
+ "American Commitment 101783\n",
+ "[other] 75175\n",
+ "Name: group, dtype: int64"
+ ]
+ },
+ "execution_count": 21,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "comments_16_42_classified = (\n",
+ " comments_16_42\n",
+ " .pipe(\n",
+ " add_classification,\n",
+ " [\n",
+ " (\"cloud-based video on demand, and apps providing news\", \"American Commitment\"),\n",
+ " (\"A cable subscriber pays over \\$200\", \"'over $200'\"),\n",
+ " ]\n",
+ " )\n",
+ ")\n",
+ "\n",
+ "(\n",
+ " comments_16_42_classified\n",
+ " [\"group\"]\n",
+ " .value_counts()\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Dates submitted, by two main groups of comments"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "2016-04-19 10499\n",
+ "2016-04-20 59247\n",
+ "2016-04-21 35070\n",
+ "Name: date, dtype: int64"
+ ]
+ },
+ "execution_count": 22,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "(\n",
+ " comments_16_42_classified\n",
+ " .loc[lambda df: df[\"group\"] == \"'over $200'\"]\n",
+ " [\"date\"]\n",
+ " .value_counts()\n",
+ " .sort_index()\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "2016-02-22 1\n",
+ "2016-05-16 12293\n",
+ "2016-05-17 55852\n",
+ "2016-05-18 33637\n",
+ "Name: date, dtype: int64"
+ ]
+ },
+ "execution_count": 23,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "(\n",
+ " comments_16_42_classified\n",
+ " .loc[lambda df: df[\"group\"] == \"American Commitment\"]\n",
+ " [\"date\"]\n",
+ " .value_counts()\n",
+ " .sort_index()\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "*Note: The 2016-02-22 comment above appears to stem from a data-entry mistake on the FCC's website. There, [the comment](https://www.fcc.gov/ecfs/filing/60001484317)'s text seems to suggest that the language actually came from a [comment with ID 60001843102](https://www.fcc.gov/ecfs/filing/60001843102); that comment, in turn, says it was received on May 18, 2016.*"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Example comments"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "=== 'over $200' ===\n",
+ "\n",
+ "60001650840.txtA cable subscriber pays over $200 per year to rent a box from the cable companies that are already protected by government. This kills competition, limits consumer choice, and lifts up cable profits that are already excessive.Allow the free market to work and unlock the box to open competition and end the monopoly that cable companies have over our televisions.Page 1\n",
+ "\n",
+ "60001633497.txtA cable subscriber pays over $200 per year to rent a box from the cable companies that are already protected by government. This kills competition, limits consumer choice, and lifts up cable profits that are already excessive.Allow the free market to work and unlock the box to open competition and end the monopoly that cable companies have over our televisions.Page 1\n",
+ "\n",
+ "60001621406.txtA cable subscriber pays over $200 per year to rent a box from the cable companies that are already protected by government. This kills competition, limits consumer choice, and lifts up cable profits that are already excessive.Allow the free market to work and unlock the box to open competition and end the monopoly that cable companies have over our televisions.Page 1\n",
+ "\n",
+ "\n",
+ "=== American Commitment ===\n",
+ "\n",
+ "60001870988.txtThe marketplace for video content is thriving and extremely competitive, offering a vast array of video streaming services, cloud-based video on demand, and apps providing news, cinema and programming. This market is rapidly innovating beyond thetraditional set-top box to new applications and devices with more choices than ever. Past government attempts to control set-top boxes have been a complete failure. Yet another failed attempt at top-down government regulation will only pu[...]\n",
+ "\n",
+ "60001888486.txtThe exchange for video content is booming and incredibly competitive, offering a wide array of video streaming services, cloud-based video on demand, and apps providing news, cinema and programming. This market is swiftly innovating beyond thetraditional set-top box to new applications and devices with more options than ever. Past Commission attempts to control set-top boxes have been a complete failure. Yet another failed attempt at one-size-fits-all government regulation will o[...]\n",
+ "\n",
+ "60001883996.txtThe exchange for video content is roaring and incredibly competitive, offering a vast array of video streaming services, cloud-based video on demand, and apps providing news, cinema and programming. This market is quickly innovating beyond thetraditional set-top box to new applications and devices with more options than ever. Past FCC attempts to regulate set-top boxes have been a complete failure. Yet another failed attempt at heavy-handed government regulation will only put the[...]\n",
+ "\n",
+ "\n",
+ "=== [other] ===\n",
+ "\n",
+ "60001976192.txtI oppose unnecessary set-top box regulations that will mean higher bills, fewer choices, and less privacy on TV. The television and video market today is full of great choices, why put such a healthy market at risk with complex and unnecessary new mandates?Page 1\n",
+ "\n",
+ "60001962194.txtI oppose unnecessary set-top box regulations that will mean higher bills, fewer choices, and less privacy on TV. The television and video market today is full of great choices, why put such a healthy market at risk with complex and unnecessary new mandates?Page 1\n",
+ "\n",
+ "60001991447.txtI oppose unnecessary set-top box regulations that will mean higher bills, fewer choices, and less privacy on TV. The television and video market today is full of great choices, why put such a healthy market at risk with complex and unnecessary new mandates?Page 1\n",
+ "\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "print_example_comments(comments_16_42_classified)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Compare the above timing and language to the [comment attributed to Annie Reeves](https://www.fcc.gov/ecfs/filing/60001803771), received by the FCC on May 17, 2016:\n",
+ "\n",
+ "> The market for video content is booming and extremely competitive, offering a vast array of video streaming services, cloud-based video on demand, and apps providing news, cinema and programming. This market is swiftly innovating beyond the traditional set-top box to new applications and devices with more options than ever. Past Commission attempts to regulate set-top boxes have been a complete failure. Yet another failed attempt at heavy-handed government regulation will only stifle innovation and benefit companies with political influence rather than companies thatprovide what consumers want. We don't need the federal government to fix what isn'tbroken -- I urge you to reject the proposed rule."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Examine email address crossover between 16-42 and bulk-uploaded 17-108 comments\n",
+ "\n",
+ "Here, we calculate the proportion of commenters from docket 16-42 that later appeared in comments bulk-uploaded to docket 17-108, and observe a very high rate of overlap between the email addresses associated with comments that used American Commitment's language in docket 16-42 and the email addresses listed in comments bulk-uploaded by Media Bridge. We find the same for commenters' full names plus physical addresses."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " num_emails | \n",
+ " email_isin_17_108_nonmb | \n",
+ " email_isin_17_108_mb | \n",
+ " name_and_location_isin_17_108_mb | \n",
+ "
\n",
+ " \n",
+ " group | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " American Commitment | \n",
+ " 100252 | \n",
+ " 0.0231 | \n",
+ " 0.9987 | \n",
+ " 0.9987 | \n",
+ "
\n",
+ " \n",
+ " 'over $200' | \n",
+ " 100482 | \n",
+ " 0.0243 | \n",
+ " 0.0601 | \n",
+ " 0.0566 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " num_emails email_isin_17_108_nonmb \\\n",
+ "group \n",
+ "American Commitment 100252 0.0231 \n",
+ "'over $200' 100482 0.0243 \n",
+ "\n",
+ " email_isin_17_108_mb name_and_location_isin_17_108_mb \n",
+ "group \n",
+ "American Commitment 0.9987 0.9987 \n",
+ "'over $200' 0.0601 0.0566 "
+ ]
+ },
+ "execution_count": 25,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "(\n",
+ " comments_16_42_classified\n",
+ " [[\n",
+ " \"email_address\",\n",
+ " \"name_and_location\",\n",
+ " \"group\",\n",
+ " ]]\n",
+ " .drop_duplicates()\n",
+ " .dropna()\n",
+ " .assign(\n",
+ " email_isin_17_108_nonmb = lambda df: (\n",
+ " df[\"email_address\"].notnull() & df[\"email_address\"].isin(\n",
+ " bulk_uploads_17_108\n",
+ " .loc[lambda df: df[\"uploader\"] != \"shane@mediabridgellc.com\"]\n",
+ " [\"email_address\"]\n",
+ " )\n",
+ " ),\n",
+ " email_isin_17_108_mb = lambda df: (\n",
+ " df[\"email_address\"].notnull() & df[\"email_address\"].isin(\n",
+ " bulk_uploads_17_108\n",
+ " .loc[lambda df: df[\"uploader\"] == \"shane@mediabridgellc.com\"]\n",
+ " [\"email_address\"]\n",
+ " )\n",
+ " ),\n",
+ " name_and_location_isin_17_108_mb = lambda df: (\n",
+ " df[\"name_and_location\"].isin(\n",
+ " bulk_uploads_17_108\n",
+ " .loc[lambda df: df[\"uploader\"] == \"shane@mediabridgellc.com\"]\n",
+ " [\"name_and_location\"]\n",
+ " )\n",
+ " ),\n",
+ " )\n",
+ " .groupby(\"group\")\n",
+ " .pipe(lambda grp: pd.DataFrame({\n",
+ " \"num_emails\": grp.size(),\n",
+ " \"email_isin_17_108_nonmb\": grp[\"email_isin_17_108_nonmb\"].mean().round(4),\n",
+ " \"email_isin_17_108_mb\": grp[\"email_isin_17_108_mb\"].mean().round(4),\n",
+ " \"name_and_location_isin_17_108_mb\": grp[\"name_and_location_isin_17_108_mb\"].mean().round(4),\n",
+ " }))\n",
+ " .loc[lambda df: df[\"num_emails\"] >= 1000]\n",
+ " .sort_values(\"email_isin_17_108_mb\", ascending = False)\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Analyze 17-108 bulk-uploads vis-a-vis Have I Been Pwned\n",
+ "\n",
+ "In this section, we take random samples of email addresses from the comments bulk-uploaded to Docket 17-108, and calculate the rates at which they have appeared in the data breaches tracked by Have I Been Pwned. We focus on the accounts that uploaded comments containing 10,000+ distinct email addresses.\n",
+ "\n",
+ "*Note: The HIBP data has already been fetched and saved, but the code used to fetch the data is included here for reference, and for reuse by other researchers.*"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "try:\n",
+ " uuid_lookup = pd.read_csv(\n",
+ " BASE_PATH + \"bulk-uploads-17-108-uuid-lookup.csv\",\n",
+ " dtype = str,\n",
+ " )\n",
+ " \n",
+ " assert uuid_lookup[\"email_address_uuid\"].value_counts().max() == 1\n",
+ " print(f\"{len(uuid_lookup):,d}\")\n",
+ "except:\n",
+ " uuid_lookup = pd.DataFrame(None, columns = [ \"email_address\", \"email_address_uuid\" ])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "BASE_HIBP_URL = \"https://haveibeenpwned.com/api/v3/breachedaccount/\"\n",
+ "from json import JSONDecodeError\n",
+ "\n",
+ "def fetch_hipb_results(email_address):\n",
+ " while True:\n",
+ " try:\n",
+ " res = requests.get(\n",
+ " f\"{BASE_HIBP_URL}{email_address.strip()}\",\n",
+ " headers = {\n",
+ " 'hibp-api-key': HIBP_KEY,\n",
+ " },\n",
+ " )\n",
+ " if res.from_cache == False:\n",
+ " time.sleep(1.5)\n",
+ "\n",
+ " # Check that JSON is parseable\n",
+ " if res.content != b\"\":\n",
+ " res.json()\n",
+ " if \"message\" in res.json():\n",
+ " raise Exception(\"HIPB error: {res.json()['message']}\")\n",
+ "\n",
+ " except requests.RequestException:\n",
+ " sys.stderr.write(f\"\\nException; sleeping for 10 seconds\\n\")\n",
+ " time.sleep(10) \n",
+ " continue\n",
+ " \n",
+ " except JSONDecodeError as e:\n",
+ " sys.stderr.write(f\"\\nERROR: <{email_address}>\\n\")\n",
+ " sys.stderr.write(f\"{e}\\n\")\n",
+ " sys.stderr.write(f\"{res.content}\\n\")\n",
+ " return [ { \"email_address\": email_address, \"breach\": \"[error]\" } ]\n",
+ "\n",
+ " if res.status_code == 429:\n",
+ " sleep_int = int(res.headers[\"Retry-After\"])\n",
+ " sys.stderr.write(f\"\\nSleeping for {sleep_int + 1} seconds\")\n",
+ " time.sleep(sleep_int)\n",
+ " continue\n",
+ " \n",
+ " if res.content == b\"\" or res.status_code == 404:\n",
+ " return [ { \"email_address\": email_address, \"breach\": \"[none]\" } ]\n",
+ "\n",
+ " else:\n",
+ " return [ { \"email_address\": email_address, \"breach\": x[\"Name\"] } for x in res.json() ]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The following function creates a sample — or a grouped set of samples — from a given set of comments. Before sampling, the code removes blank email addresses and those with non-standard characters."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 28,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def create_sample(df, grouping = [], n = 1000, random_state = RANDOM_STATE):\n",
+ " clean = (\n",
+ " df\n",
+ " .loc[lambda df: df[\"email_address\"].notnull()]\n",
+ " .loc[lambda df: df[\"email_address_nonstandard\"] == 0]\n",
+ " .drop_duplicates(subset = [ \"email_address\" ] + grouping)\n",
+ " )\n",
+ " \n",
+ " sampler = lambda df: df.sample(n, random_state = random_state)\n",
+ " \n",
+ " if len(grouping):\n",
+ " return (\n",
+ " clean\n",
+ " .groupby(grouping)\n",
+ " .apply(sampler)\n",
+ " .reset_index(drop = True)\n",
+ " )\n",
+ " else:\n",
+ " return clean.pipe(sampler)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def get_breaches(df, save_path, save = True, use_saved = True):\n",
+ " if use_saved and os.path.exists(save_path):\n",
+ " return pd.read_csv(save_path, dtype = str)\n",
+ " else:\n",
+ " breaches_raw = pd.concat(map(pd.DataFrame, (\n",
+ " df\n",
+ " .rename(columns = {\n",
+ " \"email_address\": \"email_address_uuid\"\n",
+ " })\n",
+ " .merge(\n",
+ " uuid_lookup,\n",
+ " how = \"left\",\n",
+ " on = [ \"email_address_uuid\" ]\n",
+ " )\n",
+ " [\"email_address\"]\n",
+ " .progress_apply(fetch_hipb_results)\n",
+ " ))).drop_duplicates()\n",
+ " \n",
+ " breaches = (\n",
+ " breaches_raw\n",
+ " .merge(\n",
+ " uuid_lookup,\n",
+ " how = \"left\",\n",
+ " on = [ \"email_address\" ]\n",
+ " )\n",
+ " .drop(columns = [ \"email_address\" ])\n",
+ " .rename(columns = {\n",
+ " \"email_address_uuid\": \"email_address\",\n",
+ " })\n",
+ " )\n",
+ " \n",
+ " if save:\n",
+ " breaches.to_csv(save_path, index = False)\n",
+ "\n",
+ " return breaches"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The function below calculates the breach rates for groups of sampled comments, for each breach found."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 30,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def calculate_breach_rates(sample, breaches):\n",
+ " return (\n",
+ " sample\n",
+ " [[\n",
+ " \"email_address\",\n",
+ " \"uploader\",\n",
+ " ]]\n",
+ " \n",
+ " .merge(\n",
+ " breaches,\n",
+ " how = \"left\",\n",
+ " on = [ \"email_address\" ],\n",
+ " )\n",
+ " .assign(breached = 1)\n",
+ " .set_index([\n",
+ " \"uploader\",\n",
+ " \"email_address\",\n",
+ " \"breach\",\n",
+ " ])\n",
+ " [\"breached\"]\n",
+ " .unstack()\n",
+ " .fillna(0)\n",
+ " .astype(int)\n",
+ " # At this point, we have a matrix of uploader+email x breach\n",
+ " # where the values are 1 if breached and 0 if not\n",
+ " \n",
+ " # Now, we group by uploader and calculate the proportion of\n",
+ " # emails breached\n",
+ " .groupby([ \"uploader\" ])\n",
+ " .mean()\n",
+ " \n",
+ " # Then we return the data frame to a \"tidy\" format:\n",
+ " # uploader|breach|rate\n",
+ " .stack()\n",
+ " .sort_values(ascending = False)\n",
+ " .to_frame(\"rate\")\n",
+ " .reset_index()\n",
+ " )"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 17-108 by bulk uploader\n",
+ "\n",
+ "Limited here to the accounts that uploaded comments containing 10,000+ distinct email addresses."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 31,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " date | \n",
+ " comments | \n",
+ " file | \n",
+ " uploader | \n",
+ " email_address_nonstandard | \n",
+ " email_address | \n",
+ " email_domain | \n",
+ " name_and_location | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 5/15/17 | \n",
+ " In 2015, wealthy leftist billionaires and powe... | \n",
+ " FOI-14090-2017527.csv | \n",
+ " esmisc@mac.com | \n",
+ " 0 | \n",
+ " 939bfae2-62d1-47de-b009-c2abc6b681f5 | \n",
+ " yahoo.com | \n",
+ " 8930069a-021b-4263-9c3b-a3923af9a9dc | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 8/5/17 | \n",
+ " Before leaving office, the Obama Administratio... | \n",
+ " CFIF_1_25000_08052017_4 (1).csv | \n",
+ " esmisc@mac.com | \n",
+ " 0 | \n",
+ " f9a12339-56cb-4540-9adc-fc6238428f49 | \n",
+ " gmail.com | \n",
+ " 6c65ec31-5135-4500-99c5-bba309b415fb | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 8/6/17 | \n",
+ " Before leaving office, the Obama Administratio... | \n",
+ " CFIF_1_25000_08062017_2.csv | \n",
+ " esmisc@mac.com | \n",
+ " 0 | \n",
+ " fcf0991a-0ed7-408b-8e52-4735baccd906 | \n",
+ " yahoo.com | \n",
+ " 6dfa9546-ad61-404a-bec7-48464be021b4 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 7/29/17 | \n",
+ " Before leaving office, the Obama Administratio... | \n",
+ " CFIF_1_40000_07292017.csv | \n",
+ " esmisc@mac.com | \n",
+ " 0 | \n",
+ " ee33e2a5-854f-471b-adb1-1ff62d69bf46 | \n",
+ " gmail.com | \n",
+ " 6d99eb3f-9242-440d-be2a-c7f7ae3b4e91 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 5/9/17 | \n",
+ " Obama's Federal Communications Commission (FCC... | \n",
+ " T2017510-2.csv | \n",
+ " esmisc@mac.com | \n",
+ " 0 | \n",
+ " 4d294840-8365-4d34-a5c4-c09f6b8bc01d | \n",
+ " icloud.com | \n",
+ " d132203a-a146-4043-b097-d6606498309f | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " date comments \\\n",
+ "0 5/15/17 In 2015, wealthy leftist billionaires and powe... \n",
+ "1 8/5/17 Before leaving office, the Obama Administratio... \n",
+ "2 8/6/17 Before leaving office, the Obama Administratio... \n",
+ "3 7/29/17 Before leaving office, the Obama Administratio... \n",
+ "4 5/9/17 Obama's Federal Communications Commission (FCC... \n",
+ "\n",
+ " file uploader email_address_nonstandard \\\n",
+ "0 FOI-14090-2017527.csv esmisc@mac.com 0 \n",
+ "1 CFIF_1_25000_08052017_4 (1).csv esmisc@mac.com 0 \n",
+ "2 CFIF_1_25000_08062017_2.csv esmisc@mac.com 0 \n",
+ "3 CFIF_1_40000_07292017.csv esmisc@mac.com 0 \n",
+ "4 T2017510-2.csv esmisc@mac.com 0 \n",
+ "\n",
+ " email_address email_domain \\\n",
+ "0 939bfae2-62d1-47de-b009-c2abc6b681f5 yahoo.com \n",
+ "1 f9a12339-56cb-4540-9adc-fc6238428f49 gmail.com \n",
+ "2 fcf0991a-0ed7-408b-8e52-4735baccd906 yahoo.com \n",
+ "3 ee33e2a5-854f-471b-adb1-1ff62d69bf46 gmail.com \n",
+ "4 4d294840-8365-4d34-a5c4-c09f6b8bc01d icloud.com \n",
+ "\n",
+ " name_and_location \n",
+ "0 8930069a-021b-4263-9c3b-a3923af9a9dc \n",
+ "1 6c65ec31-5135-4500-99c5-bba309b415fb \n",
+ "2 6dfa9546-ad61-404a-bec7-48464be021b4 \n",
+ "3 6d99eb3f-9242-440d-be2a-c7f7ae3b4e91 \n",
+ "4 d132203a-a146-4043-b097-d6606498309f "
+ ]
+ },
+ "execution_count": 31,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "sample_17_108_bulk_uploads = (\n",
+ " bulk_uploads_17_108\n",
+ " .loc[lambda df: df[\"uploader\"].isin(\n",
+ " uploader_metrics\n",
+ " .loc[lambda df: df[\"unique_emails\"] >= 10000]\n",
+ " .index\n",
+ " )]\n",
+ " .pipe(\n",
+ " create_sample,\n",
+ " grouping = [ \"uploader\" ],\n",
+ " n = 1000\n",
+ " )\n",
+ ")\n",
+ "\n",
+ "sample_17_108_bulk_uploads.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 32,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "esmisc@mac.com 1000\n",
+ "fccfreedom@hmamail.com 1000\n",
+ "karen@momsrising.org 1000\n",
+ "kurt@demandprogress.org 1000\n",
+ "meaghan@mandatemedia.com 1000\n",
+ "mike@fightforthefuture.org 1000\n",
+ "ncatalano@ofa.us 1000\n",
+ "shane@mediabridgellc.com 1000\n",
+ "Name: uploader, dtype: int64"
+ ]
+ },
+ "execution_count": 32,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "sample_17_108_bulk_uploads[\"uploader\"].value_counts().sort_index()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 33,
+ "metadata": {
+ "scrolled": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " breach | \n",
+ " email_address | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 8tracks | \n",
+ " 939bfae2-62d1-47de-b009-c2abc6b681f5 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " Animoto | \n",
+ " 939bfae2-62d1-47de-b009-c2abc6b681f5 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " MindJolt | \n",
+ " 939bfae2-62d1-47de-b009-c2abc6b681f5 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " ModernBusinessSolutions | \n",
+ " 939bfae2-62d1-47de-b009-c2abc6b681f5 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " RiverCityMedia | \n",
+ " 939bfae2-62d1-47de-b009-c2abc6b681f5 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " breach email_address\n",
+ "0 8tracks 939bfae2-62d1-47de-b009-c2abc6b681f5\n",
+ "1 Animoto 939bfae2-62d1-47de-b009-c2abc6b681f5\n",
+ "2 MindJolt 939bfae2-62d1-47de-b009-c2abc6b681f5\n",
+ "3 ModernBusinessSolutions 939bfae2-62d1-47de-b009-c2abc6b681f5\n",
+ "4 RiverCityMedia 939bfae2-62d1-47de-b009-c2abc6b681f5"
+ ]
+ },
+ "execution_count": 33,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "breaches_17_108_bulk_uploads = (\n",
+ " sample_17_108_bulk_uploads\n",
+ " .pipe(get_breaches, \"../data/breaches-17-108-bulk-uploads-sample.csv\")\n",
+ ")\n",
+ "\n",
+ "breaches_17_108_bulk_uploads.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Most common breach-uploader combinations:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 34,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " uploader | \n",
+ " breach | \n",
+ " rate | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " shane@mediabridgellc.com | \n",
+ " ModernBusinessSolutions | \n",
+ " 0.942 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " shane@mediabridgellc.com | \n",
+ " RiverCityMedia | \n",
+ " 0.807 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " fccfreedom@hmamail.com | \n",
+ " VerificationsIO | \n",
+ " 0.782 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " shane@mediabridgellc.com | \n",
+ " VerificationsIO | \n",
+ " 0.743 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " fccfreedom@hmamail.com | \n",
+ " RiverCityMedia | \n",
+ " 0.645 | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " esmisc@mac.com | \n",
+ " VerificationsIO | \n",
+ " 0.625 | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " esmisc@mac.com | \n",
+ " RiverCityMedia | \n",
+ " 0.565 | \n",
+ "
\n",
+ " \n",
+ " 7 | \n",
+ " fccfreedom@hmamail.com | \n",
+ " ModernBusinessSolutions | \n",
+ " 0.466 | \n",
+ "
\n",
+ " \n",
+ " 8 | \n",
+ " ncatalano@ofa.us | \n",
+ " VerificationsIO | \n",
+ " 0.463 | \n",
+ "
\n",
+ " \n",
+ " 9 | \n",
+ " karen@momsrising.org | \n",
+ " VerificationsIO | \n",
+ " 0.459 | \n",
+ "
\n",
+ " \n",
+ " 10 | \n",
+ " mike@fightforthefuture.org | \n",
+ " VerificationsIO | \n",
+ " 0.435 | \n",
+ "
\n",
+ " \n",
+ " 11 | \n",
+ " meaghan@mandatemedia.com | \n",
+ " VerificationsIO | \n",
+ " 0.435 | \n",
+ "
\n",
+ " \n",
+ " 12 | \n",
+ " kurt@demandprogress.org | \n",
+ " VerificationsIO | \n",
+ " 0.412 | \n",
+ "
\n",
+ " \n",
+ " 13 | \n",
+ " karen@momsrising.org | \n",
+ " RiverCityMedia | \n",
+ " 0.377 | \n",
+ "
\n",
+ " \n",
+ " 14 | \n",
+ " meaghan@mandatemedia.com | \n",
+ " RiverCityMedia | \n",
+ " 0.364 | \n",
+ "
\n",
+ " \n",
+ " 15 | \n",
+ " ncatalano@ofa.us | \n",
+ " RiverCityMedia | \n",
+ " 0.345 | \n",
+ "
\n",
+ " \n",
+ " 16 | \n",
+ " esmisc@mac.com | \n",
+ " ModernBusinessSolutions | \n",
+ " 0.345 | \n",
+ "
\n",
+ " \n",
+ " 17 | \n",
+ " mike@fightforthefuture.org | \n",
+ " RiverCityMedia | \n",
+ " 0.344 | \n",
+ "
\n",
+ " \n",
+ " 18 | \n",
+ " ncatalano@ofa.us | \n",
+ " LinkedIn | \n",
+ " 0.339 | \n",
+ "
\n",
+ " \n",
+ " 19 | \n",
+ " kurt@demandprogress.org | \n",
+ " RiverCityMedia | \n",
+ " 0.323 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " uploader breach rate\n",
+ "0 shane@mediabridgellc.com ModernBusinessSolutions 0.942\n",
+ "1 shane@mediabridgellc.com RiverCityMedia 0.807\n",
+ "2 fccfreedom@hmamail.com VerificationsIO 0.782\n",
+ "3 shane@mediabridgellc.com VerificationsIO 0.743\n",
+ "4 fccfreedom@hmamail.com RiverCityMedia 0.645\n",
+ "5 esmisc@mac.com VerificationsIO 0.625\n",
+ "6 esmisc@mac.com RiverCityMedia 0.565\n",
+ "7 fccfreedom@hmamail.com ModernBusinessSolutions 0.466\n",
+ "8 ncatalano@ofa.us VerificationsIO 0.463\n",
+ "9 karen@momsrising.org VerificationsIO 0.459\n",
+ "10 mike@fightforthefuture.org VerificationsIO 0.435\n",
+ "11 meaghan@mandatemedia.com VerificationsIO 0.435\n",
+ "12 kurt@demandprogress.org VerificationsIO 0.412\n",
+ "13 karen@momsrising.org RiverCityMedia 0.377\n",
+ "14 meaghan@mandatemedia.com RiverCityMedia 0.364\n",
+ "15 ncatalano@ofa.us RiverCityMedia 0.345\n",
+ "16 esmisc@mac.com ModernBusinessSolutions 0.345\n",
+ "17 mike@fightforthefuture.org RiverCityMedia 0.344\n",
+ "18 ncatalano@ofa.us LinkedIn 0.339\n",
+ "19 kurt@demandprogress.org RiverCityMedia 0.323"
+ ]
+ },
+ "execution_count": 34,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "(\n",
+ " calculate_breach_rates(\n",
+ " sample_17_108_bulk_uploads,\n",
+ " breaches_17_108_bulk_uploads,\n",
+ " )\n",
+ " .head(20)\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Modern Business Solutions breaches only:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 35,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " uploader | \n",
+ " breach | \n",
+ " rate | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " shane@mediabridgellc.com | \n",
+ " ModernBusinessSolutions | \n",
+ " 0.942 | \n",
+ "
\n",
+ " \n",
+ " 7 | \n",
+ " fccfreedom@hmamail.com | \n",
+ " ModernBusinessSolutions | \n",
+ " 0.466 | \n",
+ "
\n",
+ " \n",
+ " 16 | \n",
+ " esmisc@mac.com | \n",
+ " ModernBusinessSolutions | \n",
+ " 0.345 | \n",
+ "
\n",
+ " \n",
+ " 116 | \n",
+ " meaghan@mandatemedia.com | \n",
+ " ModernBusinessSolutions | \n",
+ " 0.114 | \n",
+ "
\n",
+ " \n",
+ " 121 | \n",
+ " ncatalano@ofa.us | \n",
+ " ModernBusinessSolutions | \n",
+ " 0.106 | \n",
+ "
\n",
+ " \n",
+ " 130 | \n",
+ " karen@momsrising.org | \n",
+ " ModernBusinessSolutions | \n",
+ " 0.099 | \n",
+ "
\n",
+ " \n",
+ " 145 | \n",
+ " kurt@demandprogress.org | \n",
+ " ModernBusinessSolutions | \n",
+ " 0.087 | \n",
+ "
\n",
+ " \n",
+ " 150 | \n",
+ " mike@fightforthefuture.org | \n",
+ " ModernBusinessSolutions | \n",
+ " 0.086 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " uploader breach rate\n",
+ "0 shane@mediabridgellc.com ModernBusinessSolutions 0.942\n",
+ "7 fccfreedom@hmamail.com ModernBusinessSolutions 0.466\n",
+ "16 esmisc@mac.com ModernBusinessSolutions 0.345\n",
+ "116 meaghan@mandatemedia.com ModernBusinessSolutions 0.114\n",
+ "121 ncatalano@ofa.us ModernBusinessSolutions 0.106\n",
+ "130 karen@momsrising.org ModernBusinessSolutions 0.099\n",
+ "145 kurt@demandprogress.org ModernBusinessSolutions 0.087\n",
+ "150 mike@fightforthefuture.org ModernBusinessSolutions 0.086"
+ ]
+ },
+ "execution_count": 35,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "(\n",
+ " calculate_breach_rates(\n",
+ " sample_17_108_bulk_uploads,\n",
+ " breaches_17_108_bulk_uploads,\n",
+ " )\n",
+ " .loc[lambda df: df[\"breach\"] == \"ModernBusinessSolutions\"] \n",
+ " .head(20)\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Larger 17-108 Media Bridge sample (10,000 addresses), for more precise rates"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 36,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " date | \n",
+ " comments | \n",
+ " file | \n",
+ " uploader | \n",
+ " email_address_nonstandard | \n",
+ " email_address | \n",
+ " email_domain | \n",
+ " name_and_location | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 3236533 | \n",
+ " 5/14/17 | \n",
+ " Dear Chairman Pai, I am concerned about Inter... | \n",
+ " Batch-A4.csv | \n",
+ " shane@mediabridgellc.com | \n",
+ " 0 | \n",
+ " 44a8867c-3332-403f-9b34-560c054bd728 | \n",
+ " gmail.com | \n",
+ " 058cbe31-9c92-4509-a8d9-50f84a1cf1ae | \n",
+ "
\n",
+ " \n",
+ " 2723370 | \n",
+ " 5/15/17 | \n",
+ " Dear Mr. Pai, Regarding the Obama takeover of... | \n",
+ " Batch-A2.csv | \n",
+ " shane@mediabridgellc.com | \n",
+ " 0 | \n",
+ " f9ad6e74-115c-4ccd-8a4e-f1e408423942 | \n",
+ " icloud.com | \n",
+ " 99c87532-c65e-42fa-97bd-0438d3ff504c | \n",
+ "
\n",
+ " \n",
+ " 56337 | \n",
+ " 5/14/17 | \n",
+ " Chairman Pai: Hi, I'd like to comment on Titl... | \n",
+ " file-i.csv | \n",
+ " shane@mediabridgellc.com | \n",
+ " 0 | \n",
+ " ad374006-bbed-4c8b-932d-e7dfacce1a29 | \n",
+ " aol.com | \n",
+ " 6752ca9d-c848-4f3d-a66c-49073afe2458 | \n",
+ "
\n",
+ " \n",
+ " 592618 | \n",
+ " 05/16/2017 | \n",
+ " The Title II order created a gaping gap in pri... | \n",
+ " batch-d-4.csv | \n",
+ " shane@mediabridgellc.com | \n",
+ " 0 | \n",
+ " c16e61f3-bd1f-4e50-95dd-830f5a219543 | \n",
+ " gmail.com | \n",
+ " ebccbc49-fa14-404f-bf83-b0ef00d48e78 | \n",
+ "
\n",
+ " \n",
+ " 1014169 | \n",
+ " 05/15/2017 | \n",
+ " Dear Chairman Pai, I'm very worried about Net... | \n",
+ " batch-b-5.csv | \n",
+ " shane@mediabridgellc.com | \n",
+ " 0 | \n",
+ " b252ca16-b5a2-4c61-9034-6e365bec0beb | \n",
+ " gmail.com | \n",
+ " bc102bc2-d454-44a4-974d-d0b1a377f392 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " date comments \\\n",
+ "3236533 5/14/17 Dear Chairman Pai, I am concerned about Inter... \n",
+ "2723370 5/15/17 Dear Mr. Pai, Regarding the Obama takeover of... \n",
+ "56337 5/14/17 Chairman Pai: Hi, I'd like to comment on Titl... \n",
+ "592618 05/16/2017 The Title II order created a gaping gap in pri... \n",
+ "1014169 05/15/2017 Dear Chairman Pai, I'm very worried about Net... \n",
+ "\n",
+ " file uploader email_address_nonstandard \\\n",
+ "3236533 Batch-A4.csv shane@mediabridgellc.com 0 \n",
+ "2723370 Batch-A2.csv shane@mediabridgellc.com 0 \n",
+ "56337 file-i.csv shane@mediabridgellc.com 0 \n",
+ "592618 batch-d-4.csv shane@mediabridgellc.com 0 \n",
+ "1014169 batch-b-5.csv shane@mediabridgellc.com 0 \n",
+ "\n",
+ " email_address email_domain \\\n",
+ "3236533 44a8867c-3332-403f-9b34-560c054bd728 gmail.com \n",
+ "2723370 f9ad6e74-115c-4ccd-8a4e-f1e408423942 icloud.com \n",
+ "56337 ad374006-bbed-4c8b-932d-e7dfacce1a29 aol.com \n",
+ "592618 c16e61f3-bd1f-4e50-95dd-830f5a219543 gmail.com \n",
+ "1014169 b252ca16-b5a2-4c61-9034-6e365bec0beb gmail.com \n",
+ "\n",
+ " name_and_location \n",
+ "3236533 058cbe31-9c92-4509-a8d9-50f84a1cf1ae \n",
+ "2723370 99c87532-c65e-42fa-97bd-0438d3ff504c \n",
+ "56337 6752ca9d-c848-4f3d-a66c-49073afe2458 \n",
+ "592618 ebccbc49-fa14-404f-bf83-b0ef00d48e78 \n",
+ "1014169 bc102bc2-d454-44a4-974d-d0b1a377f392 "
+ ]
+ },
+ "execution_count": 36,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "sample_17_108_mb = (\n",
+ " bulk_uploads_17_108\n",
+ " .loc[lambda df: df[\"uploader\"] == \"shane@mediabridgellc.com\"]\n",
+ " .pipe(\n",
+ " create_sample,\n",
+ " n = 10000,\n",
+ " random_state = RANDOM_STATE + 1, # +1 so that we have an independent sample \n",
+ " )\n",
+ ")\n",
+ "\n",
+ "sample_17_108_mb.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 37,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " breach | \n",
+ " email_address | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " Edmodo | \n",
+ " 44a8867c-3332-403f-9b34-560c054bd728 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " ModernBusinessSolutions | \n",
+ " 44a8867c-3332-403f-9b34-560c054bd728 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " RiverCityMedia | \n",
+ " 44a8867c-3332-403f-9b34-560c054bd728 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " SpecialKSpamList | \n",
+ " 44a8867c-3332-403f-9b34-560c054bd728 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " VerificationsIO | \n",
+ " 44a8867c-3332-403f-9b34-560c054bd728 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " breach email_address\n",
+ "0 Edmodo 44a8867c-3332-403f-9b34-560c054bd728\n",
+ "1 ModernBusinessSolutions 44a8867c-3332-403f-9b34-560c054bd728\n",
+ "2 RiverCityMedia 44a8867c-3332-403f-9b34-560c054bd728\n",
+ "3 SpecialKSpamList 44a8867c-3332-403f-9b34-560c054bd728\n",
+ "4 VerificationsIO 44a8867c-3332-403f-9b34-560c054bd728"
+ ]
+ },
+ "execution_count": 37,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "breaches_17_108_mb = (\n",
+ " sample_17_108_mb\n",
+ " .pipe(get_breaches, \"../data/breaches-17-108-mb-sample.csv\")\n",
+ ")\n",
+ "\n",
+ "breaches_17_108_mb.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 38,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " uploader | \n",
+ " breach | \n",
+ " rate | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " shane@mediabridgellc.com | \n",
+ " ModernBusinessSolutions | \n",
+ " 0.9388 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " shane@mediabridgellc.com | \n",
+ " RiverCityMedia | \n",
+ " 0.8277 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " shane@mediabridgellc.com | \n",
+ " VerificationsIO | \n",
+ " 0.7651 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " shane@mediabridgellc.com | \n",
+ " Collection1 | \n",
+ " 0.2574 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " shane@mediabridgellc.com | \n",
+ " Exactis | \n",
+ " 0.2571 | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " shane@mediabridgellc.com | \n",
+ " MySpace | \n",
+ " 0.1968 | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " shane@mediabridgellc.com | \n",
+ " AntiPublic | \n",
+ " 0.1956 | \n",
+ "
\n",
+ " \n",
+ " 7 | \n",
+ " shane@mediabridgellc.com | \n",
+ " SpecialKSpamList | \n",
+ " 0.1946 | \n",
+ "
\n",
+ " \n",
+ " 8 | \n",
+ " shane@mediabridgellc.com | \n",
+ " OnlinerSpambot | \n",
+ " 0.1941 | \n",
+ "
\n",
+ " \n",
+ " 9 | \n",
+ " shane@mediabridgellc.com | \n",
+ " ExploitIn | \n",
+ " 0.1826 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " uploader breach rate\n",
+ "0 shane@mediabridgellc.com ModernBusinessSolutions 0.9388\n",
+ "1 shane@mediabridgellc.com RiverCityMedia 0.8277\n",
+ "2 shane@mediabridgellc.com VerificationsIO 0.7651\n",
+ "3 shane@mediabridgellc.com Collection1 0.2574\n",
+ "4 shane@mediabridgellc.com Exactis 0.2571\n",
+ "5 shane@mediabridgellc.com MySpace 0.1968\n",
+ "6 shane@mediabridgellc.com AntiPublic 0.1956\n",
+ "7 shane@mediabridgellc.com SpecialKSpamList 0.1946\n",
+ "8 shane@mediabridgellc.com OnlinerSpambot 0.1941\n",
+ "9 shane@mediabridgellc.com ExploitIn 0.1826"
+ ]
+ },
+ "execution_count": 38,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "(\n",
+ " calculate_breach_rates(\n",
+ " sample_17_108_mb,\n",
+ " breaches_17_108_mb,\n",
+ " )\n",
+ " .head(10)\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Comparing Modern Business Solutions (MBS) breach status to Docket 16-42 overlap"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 39,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " isin_mbs | \n",
+ " isin_16 | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 3236533 | \n",
+ " True | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 2723370 | \n",
+ " True | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 56337 | \n",
+ " True | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 592618 | \n",
+ " True | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 1014169 | \n",
+ " True | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " isin_mbs isin_16\n",
+ "3236533 True False\n",
+ "2723370 True False\n",
+ "56337 True False\n",
+ "592618 True False\n",
+ "1014169 True False"
+ ]
+ },
+ "execution_count": 39,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "sample_17_108_mb_comparison = (\n",
+ " sample_17_108_mb \n",
+ " .assign(\n",
+ " isin_mbs = lambda df: (\n",
+ " df\n",
+ " [\"email_address\"].isin(\n",
+ " breaches_17_108_mb\n",
+ " .loc[lambda df: df[\"breach\"] == \"ModernBusinessSolutions\"]\n",
+ " [\"email_address\"]\n",
+ " )\n",
+ " ),\n",
+ " isin_16 = lambda df: (\n",
+ " df\n",
+ " [\"name_and_location\"]\n",
+ " .isin(\n",
+ " comments_16_42_classified\n",
+ " .loc[lambda df: df[\"group\"] == \"American Commitment\"]\n",
+ " [\"name_and_location\"]\n",
+ " )\n",
+ " )\n",
+ " )\n",
+ " [[\n",
+ " \"isin_mbs\",\n",
+ " \"isin_16\",\n",
+ " ]]\n",
+ ")\n",
+ "\n",
+ "sample_17_108_mb_comparison.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Matrix of the 10,000-comment sample, by whether the email address exists in the Modern Business Solutions breach and whether the exact contact information shows up in the Docket 16-42 comments that used American Commitment's language:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 40,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " isin_16 | \n",
+ " False | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " isin_mbs | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " False | \n",
+ " 35 | \n",
+ " 577 | \n",
+ "
\n",
+ " \n",
+ " True | \n",
+ " 9287 | \n",
+ " 101 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ "isin_16 False True \n",
+ "isin_mbs \n",
+ "False 35 577\n",
+ "True 9287 101"
+ ]
+ },
+ "execution_count": 40,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "(\n",
+ " sample_17_108_mb_comparison\n",
+ " .groupby([\n",
+ " \"isin_mbs\",\n",
+ " \"isin_16\",\n",
+ " ])\n",
+ " .size()\n",
+ " .unstack()\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Among comments whose email addresses do *not* appear in MBS, this is the proportion that use exactly the same contact information as in the Docket 16-42 comments using American Commitment's language:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 41,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0.9428"
+ ]
+ },
+ "execution_count": 41,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "(\n",
+ " sample_17_108_mb_comparison\n",
+ " .loc[lambda df: df[\"isin_mbs\"] == False]\n",
+ " [\"isin_16\"]\n",
+ " .mean()\n",
+ " .round(4)\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Among comments that use exactly the same contact information as in the Docket 16-42 comments using American Commitment's language, this is the proportion whose email addresses appear in MBS:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 42,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0.149"
+ ]
+ },
+ "execution_count": 42,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "(\n",
+ " sample_17_108_mb_comparison\n",
+ " .loc[lambda df: df[\"isin_16\"] == True]\n",
+ " [\"isin_mbs\"]\n",
+ " .mean()\n",
+ " .round(4)\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "This is the proportion of comments that *either* are attributed to email addresses that appear in the Modern Business Solutions breach *or* use exactly the same contact information as in the Docket 16-42 comments using American Commitment's language:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 43,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0.9965"
+ ]
+ },
+ "execution_count": 43,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "(\n",
+ " sample_17_108_mb_comparison\n",
+ " [[\n",
+ " \"isin_mbs\",\n",
+ " \"isin_16\",\n",
+ " ]]\n",
+ " .sum(axis = 1)\n",
+ " .pipe(lambda x: x > 0)\n",
+ " .mean()\n",
+ " .round(4)\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "---\n",
+ "\n",
+ "---\n",
+ "\n",
+ "---"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.4"
+ },
+ "widgets": {
+ "application/vnd.jupyter.widget-state+json": {
+ "state": {
+ "04f19153d45345bea122f0226fd113c0": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "1.5.0",
+ "model_name": "HBoxModel",
+ "state": {
+ "_dom_classes": [],
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "1.5.0",
+ "_model_name": "HBoxModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/controls",
+ "_view_module_version": "1.5.0",
+ "_view_name": "HBoxView",
+ "box_style": "",
+ "children": [
+ "IPY_MODEL_838c964bbd7f4c8d9f2c6c74101b7910",
+ "IPY_MODEL_559e9ca4781549da93712e2012813f5e"
+ ],
+ "layout": "IPY_MODEL_337a0b4574d94a47ab32d58f8a7a5c61"
+ }
+ },
+ "0b009166dfde4b37950925f2a90cb56f": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "1.5.0",
+ "model_name": "IntProgressModel",
+ "state": {
+ "_dom_classes": [],
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "1.5.0",
+ "_model_name": "IntProgressModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/controls",
+ "_view_module_version": "1.5.0",
+ "_view_name": "ProgressView",
+ "bar_style": "success",
+ "description": "",
+ "description_tooltip": null,
+ "layout": "IPY_MODEL_57f0be729dfb476c82ef0deb5203147f",
+ "max": 10000,
+ "min": 0,
+ "orientation": "horizontal",
+ "style": "IPY_MODEL_0d53cbe8cfc742329b594e489cce0177",
+ "value": 10000
+ }
+ },
+ "0d53cbe8cfc742329b594e489cce0177": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "1.5.0",
+ "model_name": "ProgressStyleModel",
+ "state": {
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "1.5.0",
+ "_model_name": "ProgressStyleModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "1.2.0",
+ "_view_name": "StyleView",
+ "bar_color": null,
+ "description_width": ""
+ }
+ },
+ "0d59382dc9a741a0877f290ffae90364": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "1.5.0",
+ "model_name": "ProgressStyleModel",
+ "state": {
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "1.5.0",
+ "_model_name": "ProgressStyleModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "1.2.0",
+ "_view_name": "StyleView",
+ "bar_color": null,
+ "description_width": ""
+ }
+ },
+ "23b58dd86771493a96c5a267be77f946": {
+ "model_module": "@jupyter-widgets/base",
+ "model_module_version": "1.2.0",
+ "model_name": "LayoutModel",
+ "state": {
+ "_model_module": "@jupyter-widgets/base",
+ "_model_module_version": "1.2.0",
+ "_model_name": "LayoutModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "1.2.0",
+ "_view_name": "LayoutView",
+ "align_content": null,
+ "align_items": null,
+ "align_self": null,
+ "border": null,
+ "bottom": null,
+ "display": null,
+ "flex": null,
+ "flex_flow": null,
+ "grid_area": null,
+ "grid_auto_columns": null,
+ "grid_auto_flow": null,
+ "grid_auto_rows": null,
+ "grid_column": null,
+ "grid_gap": null,
+ "grid_row": null,
+ "grid_template_areas": null,
+ "grid_template_columns": null,
+ "grid_template_rows": null,
+ "height": null,
+ "justify_content": null,
+ "justify_items": null,
+ "left": null,
+ "margin": null,
+ "max_height": null,
+ "max_width": null,
+ "min_height": null,
+ "min_width": null,
+ "object_fit": null,
+ "object_position": null,
+ "order": null,
+ "overflow": null,
+ "overflow_x": null,
+ "overflow_y": null,
+ "padding": null,
+ "right": null,
+ "top": null,
+ "visibility": null,
+ "width": null
+ }
+ },
+ "285330ed3c324f5d912879140bfa7e4e": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "1.5.0",
+ "model_name": "HBoxModel",
+ "state": {
+ "_dom_classes": [],
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "1.5.0",
+ "_model_name": "HBoxModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/controls",
+ "_view_module_version": "1.5.0",
+ "_view_name": "HBoxView",
+ "box_style": "",
+ "children": [
+ "IPY_MODEL_8468b9602d9f45eb8fac9eafaf2cc596",
+ "IPY_MODEL_bdf6f69aeeb145dd8ce850b2624b1ca7"
+ ],
+ "layout": "IPY_MODEL_6af89b4e9c64425593860aecdde16787"
+ }
+ },
+ "2a49898891a14de69657f79c88ec06a0": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "1.5.0",
+ "model_name": "HBoxModel",
+ "state": {
+ "_dom_classes": [],
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "1.5.0",
+ "_model_name": "HBoxModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/controls",
+ "_view_module_version": "1.5.0",
+ "_view_name": "HBoxView",
+ "box_style": "",
+ "children": [
+ "IPY_MODEL_31d73c99639d4336a842a3711be1de7b",
+ "IPY_MODEL_bc97f74081834de09453bb2fbc424e53"
+ ],
+ "layout": "IPY_MODEL_2e816c2f00c241599adbc09f90286f2f"
+ }
+ },
+ "2cf0ec36c28146b7b7911d8115fd9c08": {
+ "model_module": "@jupyter-widgets/base",
+ "model_module_version": "1.2.0",
+ "model_name": "LayoutModel",
+ "state": {
+ "_model_module": "@jupyter-widgets/base",
+ "_model_module_version": "1.2.0",
+ "_model_name": "LayoutModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "1.2.0",
+ "_view_name": "LayoutView",
+ "align_content": null,
+ "align_items": null,
+ "align_self": null,
+ "border": null,
+ "bottom": null,
+ "display": null,
+ "flex": null,
+ "flex_flow": null,
+ "grid_area": null,
+ "grid_auto_columns": null,
+ "grid_auto_flow": null,
+ "grid_auto_rows": null,
+ "grid_column": null,
+ "grid_gap": null,
+ "grid_row": null,
+ "grid_template_areas": null,
+ "grid_template_columns": null,
+ "grid_template_rows": null,
+ "height": null,
+ "justify_content": null,
+ "justify_items": null,
+ "left": null,
+ "margin": null,
+ "max_height": null,
+ "max_width": null,
+ "min_height": null,
+ "min_width": null,
+ "object_fit": null,
+ "object_position": null,
+ "order": null,
+ "overflow": null,
+ "overflow_x": null,
+ "overflow_y": null,
+ "padding": null,
+ "right": null,
+ "top": null,
+ "visibility": null,
+ "width": null
+ }
+ },
+ "2e816c2f00c241599adbc09f90286f2f": {
+ "model_module": "@jupyter-widgets/base",
+ "model_module_version": "1.2.0",
+ "model_name": "LayoutModel",
+ "state": {
+ "_model_module": "@jupyter-widgets/base",
+ "_model_module_version": "1.2.0",
+ "_model_name": "LayoutModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "1.2.0",
+ "_view_name": "LayoutView",
+ "align_content": null,
+ "align_items": null,
+ "align_self": null,
+ "border": null,
+ "bottom": null,
+ "display": null,
+ "flex": null,
+ "flex_flow": null,
+ "grid_area": null,
+ "grid_auto_columns": null,
+ "grid_auto_flow": null,
+ "grid_auto_rows": null,
+ "grid_column": null,
+ "grid_gap": null,
+ "grid_row": null,
+ "grid_template_areas": null,
+ "grid_template_columns": null,
+ "grid_template_rows": null,
+ "height": null,
+ "justify_content": null,
+ "justify_items": null,
+ "left": null,
+ "margin": null,
+ "max_height": null,
+ "max_width": null,
+ "min_height": null,
+ "min_width": null,
+ "object_fit": null,
+ "object_position": null,
+ "order": null,
+ "overflow": null,
+ "overflow_x": null,
+ "overflow_y": null,
+ "padding": null,
+ "right": null,
+ "top": null,
+ "visibility": null,
+ "width": null
+ }
+ },
+ "31d73c99639d4336a842a3711be1de7b": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "1.5.0",
+ "model_name": "IntProgressModel",
+ "state": {
+ "_dom_classes": [],
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "1.5.0",
+ "_model_name": "IntProgressModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/controls",
+ "_view_module_version": "1.5.0",
+ "_view_name": "ProgressView",
+ "bar_style": "success",
+ "description": "",
+ "description_tooltip": null,
+ "layout": "IPY_MODEL_afa3bc7dc3ee45758b0d9edbae514f5a",
+ "max": 8000,
+ "min": 0,
+ "orientation": "horizontal",
+ "style": "IPY_MODEL_96183dd411c941fe95ee9fdf27b53e34",
+ "value": 8000
+ }
+ },
+ "337a0b4574d94a47ab32d58f8a7a5c61": {
+ "model_module": "@jupyter-widgets/base",
+ "model_module_version": "1.2.0",
+ "model_name": "LayoutModel",
+ "state": {
+ "_model_module": "@jupyter-widgets/base",
+ "_model_module_version": "1.2.0",
+ "_model_name": "LayoutModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "1.2.0",
+ "_view_name": "LayoutView",
+ "align_content": null,
+ "align_items": null,
+ "align_self": null,
+ "border": null,
+ "bottom": null,
+ "display": null,
+ "flex": null,
+ "flex_flow": null,
+ "grid_area": null,
+ "grid_auto_columns": null,
+ "grid_auto_flow": null,
+ "grid_auto_rows": null,
+ "grid_column": null,
+ "grid_gap": null,
+ "grid_row": null,
+ "grid_template_areas": null,
+ "grid_template_columns": null,
+ "grid_template_rows": null,
+ "height": null,
+ "justify_content": null,
+ "justify_items": null,
+ "left": null,
+ "margin": null,
+ "max_height": null,
+ "max_width": null,
+ "min_height": null,
+ "min_width": null,
+ "object_fit": null,
+ "object_position": null,
+ "order": null,
+ "overflow": null,
+ "overflow_x": null,
+ "overflow_y": null,
+ "padding": null,
+ "right": null,
+ "top": null,
+ "visibility": null,
+ "width": null
+ }
+ },
+ "3f112798212e438c835992489e901c0c": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "1.5.0",
+ "model_name": "HTMLModel",
+ "state": {
+ "_dom_classes": [],
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "1.5.0",
+ "_model_name": "HTMLModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/controls",
+ "_view_module_version": "1.5.0",
+ "_view_name": "HTMLView",
+ "description": "",
+ "description_tooltip": null,
+ "layout": "IPY_MODEL_23b58dd86771493a96c5a267be77f946",
+ "placeholder": "",
+ "style": "IPY_MODEL_44767ed751bf416e972ed642a3a5244c",
+ "value": "100% 10000/10000 [5:22:12<00:00, 1.93s/it]"
+ }
+ },
+ "44767ed751bf416e972ed642a3a5244c": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "1.5.0",
+ "model_name": "DescriptionStyleModel",
+ "state": {
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "1.5.0",
+ "_model_name": "DescriptionStyleModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "1.2.0",
+ "_view_name": "StyleView",
+ "description_width": ""
+ }
+ },
+ "559e9ca4781549da93712e2012813f5e": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "1.5.0",
+ "model_name": "HTMLModel",
+ "state": {
+ "_dom_classes": [],
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "1.5.0",
+ "_model_name": "HTMLModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/controls",
+ "_view_module_version": "1.5.0",
+ "_view_name": "HTMLView",
+ "description": "",
+ "description_tooltip": null,
+ "layout": "IPY_MODEL_87ae1a7a3c9b48f79f7a1cb1652f613c",
+ "placeholder": "",
+ "style": "IPY_MODEL_c2a3af8a55c14d5fac09fd5d3670b9d6",
+ "value": "100% 2000/2000 [00:11<00:00, 169.13it/s]"
+ }
+ },
+ "57f0be729dfb476c82ef0deb5203147f": {
+ "model_module": "@jupyter-widgets/base",
+ "model_module_version": "1.2.0",
+ "model_name": "LayoutModel",
+ "state": {
+ "_model_module": "@jupyter-widgets/base",
+ "_model_module_version": "1.2.0",
+ "_model_name": "LayoutModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "1.2.0",
+ "_view_name": "LayoutView",
+ "align_content": null,
+ "align_items": null,
+ "align_self": null,
+ "border": null,
+ "bottom": null,
+ "display": null,
+ "flex": null,
+ "flex_flow": null,
+ "grid_area": null,
+ "grid_auto_columns": null,
+ "grid_auto_flow": null,
+ "grid_auto_rows": null,
+ "grid_column": null,
+ "grid_gap": null,
+ "grid_row": null,
+ "grid_template_areas": null,
+ "grid_template_columns": null,
+ "grid_template_rows": null,
+ "height": null,
+ "justify_content": null,
+ "justify_items": null,
+ "left": null,
+ "margin": null,
+ "max_height": null,
+ "max_width": null,
+ "min_height": null,
+ "min_width": null,
+ "object_fit": null,
+ "object_position": null,
+ "order": null,
+ "overflow": null,
+ "overflow_x": null,
+ "overflow_y": null,
+ "padding": null,
+ "right": null,
+ "top": null,
+ "visibility": null,
+ "width": null
+ }
+ },
+ "6af89b4e9c64425593860aecdde16787": {
+ "model_module": "@jupyter-widgets/base",
+ "model_module_version": "1.2.0",
+ "model_name": "LayoutModel",
+ "state": {
+ "_model_module": "@jupyter-widgets/base",
+ "_model_module_version": "1.2.0",
+ "_model_name": "LayoutModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "1.2.0",
+ "_view_name": "LayoutView",
+ "align_content": null,
+ "align_items": null,
+ "align_self": null,
+ "border": null,
+ "bottom": null,
+ "display": null,
+ "flex": null,
+ "flex_flow": null,
+ "grid_area": null,
+ "grid_auto_columns": null,
+ "grid_auto_flow": null,
+ "grid_auto_rows": null,
+ "grid_column": null,
+ "grid_gap": null,
+ "grid_row": null,
+ "grid_template_areas": null,
+ "grid_template_columns": null,
+ "grid_template_rows": null,
+ "height": null,
+ "justify_content": null,
+ "justify_items": null,
+ "left": null,
+ "margin": null,
+ "max_height": null,
+ "max_width": null,
+ "min_height": null,
+ "min_width": null,
+ "object_fit": null,
+ "object_position": null,
+ "order": null,
+ "overflow": null,
+ "overflow_x": null,
+ "overflow_y": null,
+ "padding": null,
+ "right": null,
+ "top": null,
+ "visibility": null,
+ "width": null
+ }
+ },
+ "77429d4856194b3083b80bf4479caea2": {
+ "model_module": "@jupyter-widgets/base",
+ "model_module_version": "1.2.0",
+ "model_name": "LayoutModel",
+ "state": {
+ "_model_module": "@jupyter-widgets/base",
+ "_model_module_version": "1.2.0",
+ "_model_name": "LayoutModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "1.2.0",
+ "_view_name": "LayoutView",
+ "align_content": null,
+ "align_items": null,
+ "align_self": null,
+ "border": null,
+ "bottom": null,
+ "display": null,
+ "flex": null,
+ "flex_flow": null,
+ "grid_area": null,
+ "grid_auto_columns": null,
+ "grid_auto_flow": null,
+ "grid_auto_rows": null,
+ "grid_column": null,
+ "grid_gap": null,
+ "grid_row": null,
+ "grid_template_areas": null,
+ "grid_template_columns": null,
+ "grid_template_rows": null,
+ "height": null,
+ "justify_content": null,
+ "justify_items": null,
+ "left": null,
+ "margin": null,
+ "max_height": null,
+ "max_width": null,
+ "min_height": null,
+ "min_width": null,
+ "object_fit": null,
+ "object_position": null,
+ "order": null,
+ "overflow": null,
+ "overflow_x": null,
+ "overflow_y": null,
+ "padding": null,
+ "right": null,
+ "top": null,
+ "visibility": null,
+ "width": null
+ }
+ },
+ "7c6c63382e934b509d42b2f9c6f14540": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "1.5.0",
+ "model_name": "DescriptionStyleModel",
+ "state": {
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "1.5.0",
+ "_model_name": "DescriptionStyleModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "1.2.0",
+ "_view_name": "StyleView",
+ "description_width": ""
+ }
+ },
+ "805573d19962489a9ae73b224640c32d": {
+ "model_module": "@jupyter-widgets/base",
+ "model_module_version": "1.2.0",
+ "model_name": "LayoutModel",
+ "state": {
+ "_model_module": "@jupyter-widgets/base",
+ "_model_module_version": "1.2.0",
+ "_model_name": "LayoutModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "1.2.0",
+ "_view_name": "LayoutView",
+ "align_content": null,
+ "align_items": null,
+ "align_self": null,
+ "border": null,
+ "bottom": null,
+ "display": null,
+ "flex": null,
+ "flex_flow": null,
+ "grid_area": null,
+ "grid_auto_columns": null,
+ "grid_auto_flow": null,
+ "grid_auto_rows": null,
+ "grid_column": null,
+ "grid_gap": null,
+ "grid_row": null,
+ "grid_template_areas": null,
+ "grid_template_columns": null,
+ "grid_template_rows": null,
+ "height": null,
+ "justify_content": null,
+ "justify_items": null,
+ "left": null,
+ "margin": null,
+ "max_height": null,
+ "max_width": null,
+ "min_height": null,
+ "min_width": null,
+ "object_fit": null,
+ "object_position": null,
+ "order": null,
+ "overflow": null,
+ "overflow_x": null,
+ "overflow_y": null,
+ "padding": null,
+ "right": null,
+ "top": null,
+ "visibility": null,
+ "width": null
+ }
+ },
+ "838c964bbd7f4c8d9f2c6c74101b7910": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "1.5.0",
+ "model_name": "IntProgressModel",
+ "state": {
+ "_dom_classes": [],
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "1.5.0",
+ "_model_name": "IntProgressModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/controls",
+ "_view_module_version": "1.5.0",
+ "_view_name": "ProgressView",
+ "bar_style": "success",
+ "description": "",
+ "description_tooltip": null,
+ "layout": "IPY_MODEL_edc354a8cb6b4c93ab80301bbbebffa6",
+ "max": 2000,
+ "min": 0,
+ "orientation": "horizontal",
+ "style": "IPY_MODEL_bf299a0f514048aa9b25df89637fa3bb",
+ "value": 2000
+ }
+ },
+ "8468b9602d9f45eb8fac9eafaf2cc596": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "1.5.0",
+ "model_name": "IntProgressModel",
+ "state": {
+ "_dom_classes": [],
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "1.5.0",
+ "_model_name": "IntProgressModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/controls",
+ "_view_module_version": "1.5.0",
+ "_view_name": "ProgressView",
+ "bar_style": "success",
+ "description": "",
+ "description_tooltip": null,
+ "layout": "IPY_MODEL_9163fc6bd9c3420bb8bc911c78eb317c",
+ "max": 10000,
+ "min": 0,
+ "orientation": "horizontal",
+ "style": "IPY_MODEL_e8c99f52be4b41e2b9b0421d46941707",
+ "value": 10000
+ }
+ },
+ "87ae1a7a3c9b48f79f7a1cb1652f613c": {
+ "model_module": "@jupyter-widgets/base",
+ "model_module_version": "1.2.0",
+ "model_name": "LayoutModel",
+ "state": {
+ "_model_module": "@jupyter-widgets/base",
+ "_model_module_version": "1.2.0",
+ "_model_name": "LayoutModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "1.2.0",
+ "_view_name": "LayoutView",
+ "align_content": null,
+ "align_items": null,
+ "align_self": null,
+ "border": null,
+ "bottom": null,
+ "display": null,
+ "flex": null,
+ "flex_flow": null,
+ "grid_area": null,
+ "grid_auto_columns": null,
+ "grid_auto_flow": null,
+ "grid_auto_rows": null,
+ "grid_column": null,
+ "grid_gap": null,
+ "grid_row": null,
+ "grid_template_areas": null,
+ "grid_template_columns": null,
+ "grid_template_rows": null,
+ "height": null,
+ "justify_content": null,
+ "justify_items": null,
+ "left": null,
+ "margin": null,
+ "max_height": null,
+ "max_width": null,
+ "min_height": null,
+ "min_width": null,
+ "object_fit": null,
+ "object_position": null,
+ "order": null,
+ "overflow": null,
+ "overflow_x": null,
+ "overflow_y": null,
+ "padding": null,
+ "right": null,
+ "top": null,
+ "visibility": null,
+ "width": null
+ }
+ },
+ "8cb5cf5fa780495ca62e9dd284a39539": {
+ "model_module": "@jupyter-widgets/base",
+ "model_module_version": "1.2.0",
+ "model_name": "LayoutModel",
+ "state": {
+ "_model_module": "@jupyter-widgets/base",
+ "_model_module_version": "1.2.0",
+ "_model_name": "LayoutModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "1.2.0",
+ "_view_name": "LayoutView",
+ "align_content": null,
+ "align_items": null,
+ "align_self": null,
+ "border": null,
+ "bottom": null,
+ "display": null,
+ "flex": null,
+ "flex_flow": null,
+ "grid_area": null,
+ "grid_auto_columns": null,
+ "grid_auto_flow": null,
+ "grid_auto_rows": null,
+ "grid_column": null,
+ "grid_gap": null,
+ "grid_row": null,
+ "grid_template_areas": null,
+ "grid_template_columns": null,
+ "grid_template_rows": null,
+ "height": null,
+ "justify_content": null,
+ "justify_items": null,
+ "left": null,
+ "margin": null,
+ "max_height": null,
+ "max_width": null,
+ "min_height": null,
+ "min_width": null,
+ "object_fit": null,
+ "object_position": null,
+ "order": null,
+ "overflow": null,
+ "overflow_x": null,
+ "overflow_y": null,
+ "padding": null,
+ "right": null,
+ "top": null,
+ "visibility": null,
+ "width": null
+ }
+ },
+ "9163fc6bd9c3420bb8bc911c78eb317c": {
+ "model_module": "@jupyter-widgets/base",
+ "model_module_version": "1.2.0",
+ "model_name": "LayoutModel",
+ "state": {
+ "_model_module": "@jupyter-widgets/base",
+ "_model_module_version": "1.2.0",
+ "_model_name": "LayoutModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "1.2.0",
+ "_view_name": "LayoutView",
+ "align_content": null,
+ "align_items": null,
+ "align_self": null,
+ "border": null,
+ "bottom": null,
+ "display": null,
+ "flex": null,
+ "flex_flow": null,
+ "grid_area": null,
+ "grid_auto_columns": null,
+ "grid_auto_flow": null,
+ "grid_auto_rows": null,
+ "grid_column": null,
+ "grid_gap": null,
+ "grid_row": null,
+ "grid_template_areas": null,
+ "grid_template_columns": null,
+ "grid_template_rows": null,
+ "height": null,
+ "justify_content": null,
+ "justify_items": null,
+ "left": null,
+ "margin": null,
+ "max_height": null,
+ "max_width": null,
+ "min_height": null,
+ "min_width": null,
+ "object_fit": null,
+ "object_position": null,
+ "order": null,
+ "overflow": null,
+ "overflow_x": null,
+ "overflow_y": null,
+ "padding": null,
+ "right": null,
+ "top": null,
+ "visibility": null,
+ "width": null
+ }
+ },
+ "96183dd411c941fe95ee9fdf27b53e34": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "1.5.0",
+ "model_name": "ProgressStyleModel",
+ "state": {
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "1.5.0",
+ "_model_name": "ProgressStyleModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "1.2.0",
+ "_view_name": "StyleView",
+ "bar_color": null,
+ "description_width": ""
+ }
+ },
+ "af021a898708423b88f8418f69fad55e": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "1.5.0",
+ "model_name": "HBoxModel",
+ "state": {
+ "_dom_classes": [],
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "1.5.0",
+ "_model_name": "HBoxModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/controls",
+ "_view_module_version": "1.5.0",
+ "_view_name": "HBoxView",
+ "box_style": "",
+ "children": [
+ "IPY_MODEL_0b009166dfde4b37950925f2a90cb56f",
+ "IPY_MODEL_3f112798212e438c835992489e901c0c"
+ ],
+ "layout": "IPY_MODEL_fd06e30d7c514c66a23889f9cee4c7f6"
+ }
+ },
+ "afa3bc7dc3ee45758b0d9edbae514f5a": {
+ "model_module": "@jupyter-widgets/base",
+ "model_module_version": "1.2.0",
+ "model_name": "LayoutModel",
+ "state": {
+ "_model_module": "@jupyter-widgets/base",
+ "_model_module_version": "1.2.0",
+ "_model_name": "LayoutModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "1.2.0",
+ "_view_name": "LayoutView",
+ "align_content": null,
+ "align_items": null,
+ "align_self": null,
+ "border": null,
+ "bottom": null,
+ "display": null,
+ "flex": null,
+ "flex_flow": null,
+ "grid_area": null,
+ "grid_auto_columns": null,
+ "grid_auto_flow": null,
+ "grid_auto_rows": null,
+ "grid_column": null,
+ "grid_gap": null,
+ "grid_row": null,
+ "grid_template_areas": null,
+ "grid_template_columns": null,
+ "grid_template_rows": null,
+ "height": null,
+ "justify_content": null,
+ "justify_items": null,
+ "left": null,
+ "margin": null,
+ "max_height": null,
+ "max_width": null,
+ "min_height": null,
+ "min_width": null,
+ "object_fit": null,
+ "object_position": null,
+ "order": null,
+ "overflow": null,
+ "overflow_x": null,
+ "overflow_y": null,
+ "padding": null,
+ "right": null,
+ "top": null,
+ "visibility": null,
+ "width": null
+ }
+ },
+ "b39529fe3149414a8643320754f56f5f": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "1.5.0",
+ "model_name": "HBoxModel",
+ "state": {
+ "_dom_classes": [],
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "1.5.0",
+ "_model_name": "HBoxModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/controls",
+ "_view_module_version": "1.5.0",
+ "_view_name": "HBoxView",
+ "box_style": "",
+ "children": [
+ "IPY_MODEL_b7bd33b176c04f8fb4dc339a4d7ba001",
+ "IPY_MODEL_ea365edc947e4ffaafe4ade2c060d3a8"
+ ],
+ "layout": "IPY_MODEL_d44549eeeb85491fb80605c67ad820a4"
+ }
+ },
+ "b7bd33b176c04f8fb4dc339a4d7ba001": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "1.5.0",
+ "model_name": "IntProgressModel",
+ "state": {
+ "_dom_classes": [],
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "1.5.0",
+ "_model_name": "IntProgressModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/controls",
+ "_view_module_version": "1.5.0",
+ "_view_name": "ProgressView",
+ "bar_style": "success",
+ "description": "",
+ "description_tooltip": null,
+ "layout": "IPY_MODEL_77429d4856194b3083b80bf4479caea2",
+ "max": 10000,
+ "min": 0,
+ "orientation": "horizontal",
+ "style": "IPY_MODEL_0d59382dc9a741a0877f290ffae90364",
+ "value": 10000
+ }
+ },
+ "bc97f74081834de09453bb2fbc424e53": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "1.5.0",
+ "model_name": "HTMLModel",
+ "state": {
+ "_dom_classes": [],
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "1.5.0",
+ "_model_name": "HTMLModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/controls",
+ "_view_module_version": "1.5.0",
+ "_view_name": "HTMLView",
+ "description": "",
+ "description_tooltip": null,
+ "layout": "IPY_MODEL_2cf0ec36c28146b7b7911d8115fd9c08",
+ "placeholder": "",
+ "style": "IPY_MODEL_7c6c63382e934b509d42b2f9c6f14540",
+ "value": "100% 8000/8000 [00:48<00:00, 163.55it/s]"
+ }
+ },
+ "bdf6f69aeeb145dd8ce850b2624b1ca7": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "1.5.0",
+ "model_name": "HTMLModel",
+ "state": {
+ "_dom_classes": [],
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "1.5.0",
+ "_model_name": "HTMLModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/controls",
+ "_view_module_version": "1.5.0",
+ "_view_name": "HTMLView",
+ "description": "",
+ "description_tooltip": null,
+ "layout": "IPY_MODEL_805573d19962489a9ae73b224640c32d",
+ "placeholder": "",
+ "style": "IPY_MODEL_f4dc764a214645daad6e3a8bc0ca5db3",
+ "value": "100% 10000/10000 [00:58<00:00, 169.89it/s]"
+ }
+ },
+ "bf299a0f514048aa9b25df89637fa3bb": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "1.5.0",
+ "model_name": "ProgressStyleModel",
+ "state": {
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "1.5.0",
+ "_model_name": "ProgressStyleModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "1.2.0",
+ "_view_name": "StyleView",
+ "bar_color": null,
+ "description_width": ""
+ }
+ },
+ "c2a3af8a55c14d5fac09fd5d3670b9d6": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "1.5.0",
+ "model_name": "DescriptionStyleModel",
+ "state": {
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "1.5.0",
+ "_model_name": "DescriptionStyleModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "1.2.0",
+ "_view_name": "StyleView",
+ "description_width": ""
+ }
+ },
+ "d44549eeeb85491fb80605c67ad820a4": {
+ "model_module": "@jupyter-widgets/base",
+ "model_module_version": "1.2.0",
+ "model_name": "LayoutModel",
+ "state": {
+ "_model_module": "@jupyter-widgets/base",
+ "_model_module_version": "1.2.0",
+ "_model_name": "LayoutModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "1.2.0",
+ "_view_name": "LayoutView",
+ "align_content": null,
+ "align_items": null,
+ "align_self": null,
+ "border": null,
+ "bottom": null,
+ "display": null,
+ "flex": null,
+ "flex_flow": null,
+ "grid_area": null,
+ "grid_auto_columns": null,
+ "grid_auto_flow": null,
+ "grid_auto_rows": null,
+ "grid_column": null,
+ "grid_gap": null,
+ "grid_row": null,
+ "grid_template_areas": null,
+ "grid_template_columns": null,
+ "grid_template_rows": null,
+ "height": null,
+ "justify_content": null,
+ "justify_items": null,
+ "left": null,
+ "margin": null,
+ "max_height": null,
+ "max_width": null,
+ "min_height": null,
+ "min_width": null,
+ "object_fit": null,
+ "object_position": null,
+ "order": null,
+ "overflow": null,
+ "overflow_x": null,
+ "overflow_y": null,
+ "padding": null,
+ "right": null,
+ "top": null,
+ "visibility": null,
+ "width": null
+ }
+ },
+ "e8c99f52be4b41e2b9b0421d46941707": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "1.5.0",
+ "model_name": "ProgressStyleModel",
+ "state": {
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "1.5.0",
+ "_model_name": "ProgressStyleModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "1.2.0",
+ "_view_name": "StyleView",
+ "bar_color": null,
+ "description_width": ""
+ }
+ },
+ "ea365edc947e4ffaafe4ade2c060d3a8": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "1.5.0",
+ "model_name": "HTMLModel",
+ "state": {
+ "_dom_classes": [],
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "1.5.0",
+ "_model_name": "HTMLModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/controls",
+ "_view_module_version": "1.5.0",
+ "_view_name": "HTMLView",
+ "description": "",
+ "description_tooltip": null,
+ "layout": "IPY_MODEL_8cb5cf5fa780495ca62e9dd284a39539",
+ "placeholder": "",
+ "style": "IPY_MODEL_eda854ec06cc44fea21cab1c9b6e0e16",
+ "value": "100% 10000/10000 [00:59<00:00, 168.73it/s]"
+ }
+ },
+ "eda854ec06cc44fea21cab1c9b6e0e16": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "1.5.0",
+ "model_name": "DescriptionStyleModel",
+ "state": {
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "1.5.0",
+ "_model_name": "DescriptionStyleModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "1.2.0",
+ "_view_name": "StyleView",
+ "description_width": ""
+ }
+ },
+ "edc354a8cb6b4c93ab80301bbbebffa6": {
+ "model_module": "@jupyter-widgets/base",
+ "model_module_version": "1.2.0",
+ "model_name": "LayoutModel",
+ "state": {
+ "_model_module": "@jupyter-widgets/base",
+ "_model_module_version": "1.2.0",
+ "_model_name": "LayoutModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "1.2.0",
+ "_view_name": "LayoutView",
+ "align_content": null,
+ "align_items": null,
+ "align_self": null,
+ "border": null,
+ "bottom": null,
+ "display": null,
+ "flex": null,
+ "flex_flow": null,
+ "grid_area": null,
+ "grid_auto_columns": null,
+ "grid_auto_flow": null,
+ "grid_auto_rows": null,
+ "grid_column": null,
+ "grid_gap": null,
+ "grid_row": null,
+ "grid_template_areas": null,
+ "grid_template_columns": null,
+ "grid_template_rows": null,
+ "height": null,
+ "justify_content": null,
+ "justify_items": null,
+ "left": null,
+ "margin": null,
+ "max_height": null,
+ "max_width": null,
+ "min_height": null,
+ "min_width": null,
+ "object_fit": null,
+ "object_position": null,
+ "order": null,
+ "overflow": null,
+ "overflow_x": null,
+ "overflow_y": null,
+ "padding": null,
+ "right": null,
+ "top": null,
+ "visibility": null,
+ "width": null
+ }
+ },
+ "f4dc764a214645daad6e3a8bc0ca5db3": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "1.5.0",
+ "model_name": "DescriptionStyleModel",
+ "state": {
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "1.5.0",
+ "_model_name": "DescriptionStyleModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "1.2.0",
+ "_view_name": "StyleView",
+ "description_width": ""
+ }
+ },
+ "fd06e30d7c514c66a23889f9cee4c7f6": {
+ "model_module": "@jupyter-widgets/base",
+ "model_module_version": "1.2.0",
+ "model_name": "LayoutModel",
+ "state": {
+ "_model_module": "@jupyter-widgets/base",
+ "_model_module_version": "1.2.0",
+ "_model_name": "LayoutModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "1.2.0",
+ "_view_name": "LayoutView",
+ "align_content": null,
+ "align_items": null,
+ "align_self": null,
+ "border": null,
+ "bottom": null,
+ "display": null,
+ "flex": null,
+ "flex_flow": null,
+ "grid_area": null,
+ "grid_auto_columns": null,
+ "grid_auto_flow": null,
+ "grid_auto_rows": null,
+ "grid_column": null,
+ "grid_gap": null,
+ "grid_row": null,
+ "grid_template_areas": null,
+ "grid_template_columns": null,
+ "grid_template_rows": null,
+ "height": null,
+ "justify_content": null,
+ "justify_items": null,
+ "left": null,
+ "margin": null,
+ "max_height": null,
+ "max_width": null,
+ "min_height": null,
+ "min_width": null,
+ "object_fit": null,
+ "object_position": null,
+ "order": null,
+ "overflow": null,
+ "overflow_x": null,
+ "overflow_y": null,
+ "padding": null,
+ "right": null,
+ "top": null,
+ "visibility": null,
+ "width": null
+ }
+ }
+ },
+ "version_major": 2,
+ "version_minor": 0
+ }
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/notebooks/analyze-mb-comment-structure.ipynb b/notebooks/analyze-mb-comment-structure.ipynb
new file mode 100644
index 0000000..99aaf69
--- /dev/null
+++ b/notebooks/analyze-mb-comment-structure.ipynb
@@ -0,0 +1,2267 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Analyzing the structure of Media Bridge–submitted comments\n",
+ "\n",
+ "This notebook analyzes the comments uploaded by Media Bridge to FCC Docket 17-108, with a focus on understanding the structure behind the algorithmically-generated ones."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Load the comments"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "import re\n",
+ "import json\n",
+ "import math\n",
+ "from functools import reduce"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Media Bridge uploaded 1.9 million comments in total:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "1856553"
+ ]
+ },
+ "execution_count": 2,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "mb_comments = (\n",
+ " pd.read_csv(\n",
+ " \"../data/bulk-uploads-17-108-with-uuids.csv\",\n",
+ " usecols = [ \"uploader\", \"comments\", \"email_address\" ],\n",
+ " dtype = str,\n",
+ " )\n",
+ " .loc[lambda df: df[\"uploader\"] == \"shane@mediabridgellc.com\"]\n",
+ " .assign(\n",
+ " comments = lambda df: df[\"comments\"].str.replace(u\"\\xa0\", \" \")\n",
+ " )\n",
+ ")\n",
+ "\n",
+ "len(mb_comments)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Some, however, are duplicates. There are 1.5 million unique comments, where uniqueness is defined as the combination of the comment text and the email address associated with the comment:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "1501759"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "mb_deduped = (\n",
+ " mb_comments\n",
+ " .drop_duplicates(subset = [ \"comments\", \"email_address\" ])\n",
+ ")\n",
+ "\n",
+ "len(mb_deduped)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Separate randomized vs. non-randomized comments\n",
+ "\n",
+ "About 472,000 of the comments have no internal randomization; they come from one of five pre-written variations. (One of those five has two sub-variations that differ only in formatting; as a result, there are six strings listed below.)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "non_randomized = [\n",
+ " \"The Title II order created a gaping gap in privacy protections by taking the best cop, the FTC, off the beat. That is reason enough to support Chairman Pai's proposal to restore Internet freedom. Restore privacy by repealing Net Neutrality.\",\n",
+ " \"Title II is a Depression-era regulatory framework designed for a telephone monopoly that no longer exists. It was wrong to apply it to the Internet and the FCC should repeal it and go back to the free-market approach that worked so well.\",\n",
+ " \"The free-market Internet was an incredible engine of economic growth, innovation, and job creation since the 1990s and has already been substantially slowed by the 2015 Net Neutrality rules. The slowdown in investment is destroying jobs and risks a big future tax hike to make up for lost private investment. Save American jobs by repealing Net Neutrality.\",\n",
+ " \"The FCC's Net Neutrality rules were written in the Obama White House by political staff and Tech Industry special interests who overruled the FCC's own experts. The FCC's own chief economist Tim Brennan called the rules \\\"an economics-free zone.\\\" They should be repealed.\",\n",
+ " \"Obama's Net Neutrality order was the corrupt result of a corrupt process controlled by Silicon Valley special interests. It gives some of the biggest companies in the world a free ride at the expense of consumers and should be immediately repealed!\",\n",
+ " ' \"Obama\\'s Net Neutrality order was the corrupt result of a corrupt process controlled by Silicon Valley special interests. It gives some of the biggest companies in the world a free ride at the expense of consumers and should be immediately repealed!\"',\n",
+ "]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "471677"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "mb_deduped_nonrandom = (\n",
+ " mb_deduped\n",
+ " .loc[lambda df: df[\"comments\"].isin(non_randomized)]\n",
+ ")\n",
+ "\n",
+ "len(mb_deduped_nonrandom)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " count | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " Title II is a Depression-era regulatory framework designed for a telephone monopoly that no longer exists. It was wrong to apply it to the Internet and the FCC should repeal it and go back to the free-market approach that worked so well. | \n",
+ " 127501 | \n",
+ "
\n",
+ " \n",
+ " The Title II order created a gaping gap in privacy protections by taking the best cop, the FTC, off the beat. That is reason enough to support Chairman Pai's proposal to restore Internet freedom. Restore privacy by repealing Net Neutrality. | \n",
+ " 92884 | \n",
+ "
\n",
+ " \n",
+ " The free-market Internet was an incredible engine of economic growth, innovation, and job creation since the 1990s and has already been substantially slowed by the 2015 Net Neutrality rules. The slowdown in investment is destroying jobs and risks a big future tax hike to make up for lost private investment. Save American jobs by repealing Net Neutrality. | \n",
+ " 83072 | \n",
+ "
\n",
+ " \n",
+ " Obama's Net Neutrality order was the corrupt result of a corrupt process controlled by Silicon Valley special interests. It gives some of the biggest companies in the world a free ride at the expense of consumers and should be immediately repealed! | \n",
+ " 74809 | \n",
+ "
\n",
+ " \n",
+ " The FCC's Net Neutrality rules were written in the Obama White House by political staff and Tech Industry special interests who overruled the FCC's own experts. The FCC's own chief economist Tim Brennan called the rules \"an economics-free zone.\" They should be repealed. | \n",
+ " 62635 | \n",
+ "
\n",
+ " \n",
+ " \"Obama's Net Neutrality order was the corrupt result of a corrupt process controlled by Silicon Valley special interests. It gives some of the biggest companies in the world a free ride at the expense of consumers and should be immediately repealed!\" | \n",
+ " 30776 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " count\n",
+ "Title II is a Depression-era regulatory framewo... 127501\n",
+ "The Title II order created a gaping gap in priv... 92884\n",
+ "The free-market Internet was an incredible engi... 83072\n",
+ "Obama's Net Neutrality order was the corrupt re... 74809\n",
+ "The FCC's Net Neutrality rules were written in ... 62635\n",
+ " \"Obama's Net Neutrality order was the corrupt ... 30776"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "(\n",
+ " mb_deduped_nonrandom\n",
+ " [\"comments\"]\n",
+ " .value_counts()\n",
+ " .to_frame(\"count\")\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The remaining 1 million comments are, at least on their surface, unique: No two are exactly the same."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "1030082"
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "mb_deduped_random = (\n",
+ " mb_deduped\n",
+ " .loc[lambda df: ~df[\"comments\"].isin(non_randomized)]\n",
+ ")\n",
+ "\n",
+ "len(mb_deduped_random)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# If two or more comments were the same, this cell would throw an error\n",
+ "assert mb_deduped_random[\"comments\"].value_counts().max() == 1"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Examples:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Dear Chairman Pai, I would like to comment on Internet regulation. I strongly recommend Chairman Pai to repeal Obama's scheme to regulate the web. Americans, as opposed to Washington bureaucrats, should purchase the products they prefer. Obama's scheme to regulate the web is a betrayal of the open Internet. It stopped a free-market system that functioned supremely well for decades with broad bipartisan backing.\n",
+ "\n",
+ "To the Federal Communications Commission: I'm concerned about network neutrality regulations. I'd like to request the government to undo The previous administration's order to control the web. Individual citizens, not the FCC, should enjoy whatever products they desire. The previous administration's order to control the web is a exploitation of net neutrality. It broke a market-based framework that functioned remarkably smoothly for many years with nearly universal backing.\n",
+ "\n",
+ "Chairman Pai: My comments re: regulations on the Internet. I'd like to suggest Ajit Pai to rescind Obama's scheme to take over the Internet. Internet users, rather than the FCC, should be free to purchase the products they choose. Obama's scheme to take over the Internet is a corruption of the open Internet. It stopped a free-market system that functioned very, very smoothly for decades with both parties' approval.\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(\"\\n\\n\".join(\n",
+ " mb_deduped_random\n",
+ " [\"comments\"]\n",
+ " .sample(3, random_state = 0)\n",
+ "))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Reverse-engineer the structure of the randomized comments\n",
+ "\n",
+ "The following code represents BuzzFeed News' best estimate of how the randomized comments were generated.\n",
+ "\n",
+ "Each sub-list contains the possible variations, which appear to be selected (with equal weighting) at random. Sub-lists with only one item are \"fixed\"; they don't change from comment to comment.\n",
+ "\n",
+ "One exception is a repeated phrase at the beginning of the fourth sentence of each comment; it repeats whatever happens to have been randomly selected in a particular part of the second sentence. More details on that below."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "segments = [\n",
+ " [\n",
+ " \"To whom it may concern: \",\n",
+ " \"To the Federal Communications Commission: \",\n",
+ " \"FCC: \",\n",
+ " \"To the FCC: \",\n",
+ " \"Dear Commissioners: \",\n",
+ " \"Dear Mr. Pai, \",\n",
+ " \"Dear Chairman Pai, \",\n",
+ " \"Dear FCC, \",\n",
+ " \"Mr Pai: \",\n",
+ " \"FCC commissioners, \",\n",
+ " \"Chairman Pai: \",\n",
+ " \"\",\n",
+ " ],\n",
+ "\n",
+ " [\n",
+ " \"I'm concerned about\",\n",
+ " \"I am concerned about\",\n",
+ " \"I have concerns about\",\n",
+ " \"I'm very concerned about\",\n",
+ " \"I'd like to share my thoughts on\",\n",
+ " \"Hi, I'd like to comment on\",\n",
+ " \"I would like to comment on\",\n",
+ " \"I want to give my opinion on\",\n",
+ " \"I have thoughts on\",\n",
+ " \"I'm contacting you about\",\n",
+ " \"I'm very worried about\",\n",
+ " \"My comments re:\",\n",
+ " \"In reference to\",\n",
+ " \"I am a voter worried about\",\n",
+ " \"I'm a voter worried about\",\n",
+ " \"Regarding\",\n",
+ " \"With respect to\",\n",
+ " \"In the matter of\",\n",
+ " ],\n",
+ "\n",
+ " [ \" \" ],\n",
+ " \n",
+ " [\n",
+ " \"the FCC's so-called Open Internet order\",\n",
+ " \"Internet regulation and net neutrality\",\n",
+ " \"the Obama takeover of the Internet\", \n",
+ " \"the FCC regulations on the Internet\",\n",
+ " \"network neutrality regulations\",\n",
+ " \"the FCC's Open Internet order\",\n",
+ " \"the FCC rules on the Internet\",\n",
+ " \"net neutrality and Title II\",\n",
+ " \"Net Neutrality and Title II\",\n",
+ " \"regulations on the Internet\",\n",
+ " \"restoring Internet freedom\",\n",
+ " \"net neutrality regulations\",\n",
+ " \"Title 2 and net neutrality\",\n",
+ " \"the future of the Internet\",\n",
+ " \"the Open Internet order\",\n",
+ " \"internet regulations\",\n",
+ " \"net neutrality rules\",\n",
+ " \"Internet regulation\",\n",
+ " \"Network Neutrality\",\n",
+ " \"an open Internet\",\n",
+ " \"Internet freedom\",\n",
+ " \"Internet Freedom\",\n",
+ " \"Net neutrality\",\n",
+ " \"net neutrality\",\n",
+ " \"NET NEUTRALITY\",\n",
+ " \"Title II rules\",\n",
+ " ],\n",
+ " \n",
+ " [ \". I\" ],\n",
+ "\n",
+ " [\n",
+ " \"'d like to\",\n",
+ " \" would like to\",\n",
+ " \" want to\",\n",
+ " \" strongly\",\n",
+ " \"\",\n",
+ " ],\n",
+ " \n",
+ " [\n",
+ " \" \"\n",
+ " ],\n",
+ " \n",
+ " [\n",
+ " \"implore\",\n",
+ " \"ask\",\n",
+ " \"request\",\n",
+ " \"urge\",\n",
+ " \"encourage\",\n",
+ " \"recommend\",\n",
+ " \"suggest\",\n",
+ " \"demand\",\n",
+ " \"advocate\",\n",
+ " ],\n",
+ " \n",
+ " [ \" \" ],\n",
+ "\n",
+ " [\n",
+ " \"you\",\n",
+ " \"the FCC\",\n",
+ " \"the Federal Communications Commission\",\n",
+ " \"the commissioners\",\n",
+ " \"the commission\",\n",
+ " \"Chairman Pai\",\n",
+ " \"Ajit Pai\",\n",
+ " \"the government\"\n",
+ " ],\n",
+ " \n",
+ " [ \" to \" ],\n",
+ " \n",
+ " [\n",
+ " \"undo\",\n",
+ " \"reverse\",\n",
+ " \"repeal\",\n",
+ " \"overturn\",\n",
+ " \"rescind\",\n",
+ " ],\n",
+ "\n",
+ " [ \" \" ],\n",
+ " \n",
+ " [\n",
+ " \"The previous administration's\",\n",
+ " \"The Obama/Wheeler\",\n",
+ " \"President Obama's\",\n",
+ " \"Barack Obama's\",\n",
+ " \"Tom Wheeler's\",\n",
+ " \"Obama's\",\n",
+ " ],\n",
+ "\n",
+ " [ \" \" ],\n",
+ " \n",
+ " [\n",
+ " \"decision\",\n",
+ " \"scheme\",\n",
+ " \"policy\",\n",
+ " \"order\",\n",
+ " \"power grab\",\n",
+ " \"plan\",\n",
+ " ],\n",
+ " \n",
+ " [ \" to \" ],\n",
+ " \n",
+ " [\n",
+ " \"regulate\",\n",
+ " \"control\",\n",
+ " \"take over\",\n",
+ " ],\n",
+ "\n",
+ " [ \" \" ],\n",
+ "\n",
+ " \n",
+ " [\n",
+ " \"broadband\",\n",
+ " \"the web\",\n",
+ " \"Internet access\",\n",
+ " \"the Internet\",\n",
+ " ],\n",
+ " \n",
+ " [ \". \" ],\n",
+ " \n",
+ " [\n",
+ " \"Internet users\",\n",
+ " \"Individual citizens\",\n",
+ " \"People like me\",\n",
+ " \"Citizens\",\n",
+ " \"Individual Americans\",\n",
+ " \"Americans\",\n",
+ " \"Individuals\",\n",
+ " ],\n",
+ " \n",
+ " [ \", \" ],\n",
+ " \n",
+ " [\n",
+ " \"rather than\",\n",
+ " \"as opposed to\",\n",
+ " \"not\",\n",
+ " ],\n",
+ " \n",
+ " [ \" \" ],\n",
+ " \n",
+ " [\n",
+ " \"Washington bureaucrats\",\n",
+ " \"Washington\",\n",
+ " \"big government\",\n",
+ " \"so-called experts\",\n",
+ " \"unelected bureaucrats\",\n",
+ " \"the FCC Enforcement Bureau\",\n",
+ " \"the FCC\",\n",
+ " ],\n",
+ " \n",
+ " [ \", \" ],\n",
+ " \n",
+ " [\n",
+ " \"should be able to\",\n",
+ " \"should be empowered to\",\n",
+ " \"should be free to\",\n",
+ " \"ought to\",\n",
+ " \"deserve to\",\n",
+ " \"should\",\n",
+ " ],\n",
+ " \n",
+ " [\n",
+ " \" \",\n",
+ " ],\n",
+ " \n",
+ " [\n",
+ " \"use\",\n",
+ " \"enjoy\",\n",
+ " \"purchase\",\n",
+ " \"buy\",\n",
+ " \"select\",\n",
+ " ],\n",
+ " \n",
+ " [ \" \" ],\n",
+ " \n",
+ " [\n",
+ " \"the\",\n",
+ " \"whichever\",\n",
+ " \"whatever\",\n",
+ " \"which\",\n",
+ " ],\n",
+ " \n",
+ " [ \" \" ],\n",
+ " \n",
+ " [\n",
+ " \"products\",\n",
+ " \"applications\",\n",
+ " \"services\",\n",
+ " ],\n",
+ " \n",
+ " [ \" \" ],\n",
+ "\n",
+ " [\n",
+ " \"they\",\n",
+ " \"we\",\n",
+ " ],\n",
+ " \n",
+ " [ \" \" ],\n",
+ " \n",
+ " [\n",
+ " \"want\",\n",
+ " \"desire\",\n",
+ " \"prefer\",\n",
+ " \"choose\",\n",
+ " ],\n",
+ " \n",
+ " [ \". \" ],\n",
+ " \n",
+ " [\n",
+ " \"The previous administration's\",\n",
+ " \"The Obama/Wheeler\",\n",
+ " \"President Obama's\",\n",
+ " \"Barack Obama's\",\n",
+ " \"Tom Wheeler's\",\n",
+ " \"Obama's\",\n",
+ " ],\n",
+ "\n",
+ " [ \" \" ],\n",
+ " \n",
+ " [\n",
+ " \"decision\",\n",
+ " \"scheme\",\n",
+ " \"policy\",\n",
+ " \"order\",\n",
+ " \"power grab\",\n",
+ " \"plan\",\n",
+ " ],\n",
+ " \n",
+ " [ \" to \" ],\n",
+ " \n",
+ " [\n",
+ " \"regulate\",\n",
+ " \"control\",\n",
+ " \"take over\",\n",
+ " ],\n",
+ " \n",
+ " [ \" \" ],\n",
+ " \n",
+ " [\n",
+ " \"broadband\",\n",
+ " \"the web\",\n",
+ " \"Internet access\",\n",
+ " \"the Internet\",\n",
+ " ],\n",
+ " \n",
+ " [ \" is a \" ],\n",
+ " \n",
+ " [\n",
+ " \"exploitation \",\n",
+ " \"distortion\",\n",
+ " \"perversion\",\n",
+ " \"corruption\",\n",
+ " \"betrayal\",\n",
+ " ],\n",
+ " \n",
+ " [ \" of \" ],\n",
+ " \n",
+ " [\n",
+ " \"net neutrality\",\n",
+ " \"the open Internet\",\n",
+ " ],\n",
+ " \n",
+ " [ \". It \" ],\n",
+ " \n",
+ " [\n",
+ " \"disrupted\",\n",
+ " \"undid\",\n",
+ " \"reversed\",\n",
+ " \"ended\",\n",
+ " \"broke\",\n",
+ " \"stopped\",\n",
+ " ],\n",
+ " \n",
+ " [ \" a \" ],\n",
+ " \n",
+ " [\n",
+ " \"light-touch\",\n",
+ " \"pro-consumer\",\n",
+ " \"hands-off\",\n",
+ " \"free-market\",\n",
+ " \"market-based\",\n",
+ " ],\n",
+ " \n",
+ " [ \" \" ],\n",
+ " \n",
+ " [\n",
+ " \"policy\",\n",
+ " \"system\",\n",
+ " \"approach\",\n",
+ " \"framework\",\n",
+ " ],\n",
+ " \n",
+ " [ \" that \" ],\n",
+ " \n",
+ " [\n",
+ " \"functioned\",\n",
+ " \"performed\",\n",
+ " \"worked\",\n",
+ " ],\n",
+ " \n",
+ " [ \" \" ],\n",
+ " \n",
+ " [\n",
+ " \"supremely\",\n",
+ " \"very, very\",\n",
+ " \"very\",\n",
+ " \"remarkably\",\n",
+ " \"fabulously\",\n",
+ " \"exceptionally\",\n",
+ " ],\n",
+ " \n",
+ " [ \" \" ],\n",
+ " \n",
+ " [\n",
+ " \"well\",\n",
+ " \"successfully\",\n",
+ " \"smoothly\",\n",
+ " ],\n",
+ " \n",
+ " [ \" for \" ],\n",
+ " \n",
+ " [\n",
+ " \"many years\",\n",
+ " \"decades\",\n",
+ " \"a long time\",\n",
+ " \"two decades\",\n",
+ " ],\n",
+ " \n",
+ " [ \" with \" ],\n",
+ " \n",
+ " [\n",
+ " \"nearly universal\",\n",
+ " \"broad bipartisan\",\n",
+ " \"bipartisan\",\n",
+ " \"both parties'\",\n",
+ " \"Republican and Democrat\",\n",
+ " ],\n",
+ "\n",
+ " [ \" \" ],\n",
+ " \n",
+ " [\n",
+ " \"support\",\n",
+ " \"consensus\",\n",
+ " \"approval\",\n",
+ " \"backing\",\n",
+ " ],\n",
+ " \n",
+ " [ \".\" ]\n",
+ " \n",
+ "]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Check that pattern fully matches comments\n",
+ "\n",
+ "Here, we compile the comment segments into a single regular expression, which we use to check whether comments match the reverse-engineered model."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def segments_to_pattern(segments):\n",
+ " return re.compile(r\"^\" + r\"\".join(\n",
+ " r\"(\" + r\"|\".join(re.escape(option) for option in seg) + r\")\"\n",
+ " for seg in segments) + r\"$\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "pattern = segments_to_pattern(segments)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "All comments match (otherwise, the result would be greater than zero):"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0"
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "sum(re.match(pattern, x) is None for x in mb_deduped_random[\"comments\"].values)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Check that there are no superfluous permutations\n",
+ "\n",
+ "Although the model above succeeds in matching all comments, so would a model that contained, for example, the entire English language. So here we check whether any individual part of the pattern is superfluous, by incrementally removing each one, and seeing whether the comments still match the pattern. (Here we use a random sample of comments, to speed up the process.)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sample_comments = (\n",
+ " mb_deduped_random\n",
+ " [\"comments\"]\n",
+ " .sample(1000, random_state = 0)\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# A lack of output for this cell is a good thing;\n",
+ "# it means no part of the model is superfluous\n",
+ "\n",
+ "for i, segment in enumerate(segments):\n",
+ " # For each sub-part of the each segment ...\n",
+ " for j, option in enumerate(segment):\n",
+ " \n",
+ " # Replace the sub-part with \"###\", and then test\n",
+ " # whether the pattern-matching fails. It should fail;\n",
+ " # if it does not, then the sub-part is superfluous.\n",
+ " segments_copy = list([ list(o) for o in segments ])\n",
+ " segments_copy[i][j] = \"###\"\n",
+ " new_pattern = segments_to_pattern(segments_copy)\n",
+ " \n",
+ " num_nonmatching_comments = sum((re.match(new_pattern, x) is None)\n",
+ " for x in sample_comments.values)\n",
+ " \n",
+ " # If all of the comments still match after the \"###\" \n",
+ " # substitution, then the replaced sub-part isn't necessary\n",
+ " # to the model.\n",
+ " if num_nonmatching_comments == 0:\n",
+ " print(i, j, option)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Check that segments are randomized independently\n",
+ "\n",
+ "In some text-generation models, the value of one segment may influence the possible values (or weights for those values) of subsequent segments. Here, we check whether that appears to be true for the actual model that generated these comments.\n",
+ "\n",
+ "First, we extract the bits of text that each comment has used for each section, skipping the \"fixed\" segments. (Here again we use a random sample of comments, to speed things up.)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[2, 4, 6]"
+ ]
+ },
+ "execution_count": 16,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "FIXED_SEGMENT_INDEX = [ i for i, x in enumerate(segments) if len(x) == 1 ]\n",
+ "FIXED_SEGMENT_INDEX[:3]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def extract_permutations(comment):\n",
+ " permutations = [ (i, g) for i, g in enumerate(re.match(pattern, comment).groups())\n",
+ " if i not in FIXED_SEGMENT_INDEX ]\n",
+ " \n",
+ " return pd.DataFrame(\n",
+ " permutations,\n",
+ " columns = [ \"seg_i\", \"option\" ], \n",
+ " )"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Example, for the first comment in the sample:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " seg_i | \n",
+ " option | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 0 | \n",
+ " Dear Chairman Pai, | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 1 | \n",
+ " I would like to comment on | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 3 | \n",
+ " Internet regulation | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 5 | \n",
+ " strongly | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 7 | \n",
+ " recommend | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " 9 | \n",
+ " Chairman Pai | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " 11 | \n",
+ " repeal | \n",
+ "
\n",
+ " \n",
+ " 7 | \n",
+ " 13 | \n",
+ " Obama's | \n",
+ "
\n",
+ " \n",
+ " 8 | \n",
+ " 15 | \n",
+ " scheme | \n",
+ "
\n",
+ " \n",
+ " 9 | \n",
+ " 17 | \n",
+ " regulate | \n",
+ "
\n",
+ " \n",
+ " 10 | \n",
+ " 19 | \n",
+ " the web | \n",
+ "
\n",
+ " \n",
+ " 11 | \n",
+ " 21 | \n",
+ " Americans | \n",
+ "
\n",
+ " \n",
+ " 12 | \n",
+ " 23 | \n",
+ " as opposed to | \n",
+ "
\n",
+ " \n",
+ " 13 | \n",
+ " 25 | \n",
+ " Washington bureaucrats | \n",
+ "
\n",
+ " \n",
+ " 14 | \n",
+ " 27 | \n",
+ " should | \n",
+ "
\n",
+ " \n",
+ " 15 | \n",
+ " 29 | \n",
+ " purchase | \n",
+ "
\n",
+ " \n",
+ " 16 | \n",
+ " 31 | \n",
+ " the | \n",
+ "
\n",
+ " \n",
+ " 17 | \n",
+ " 33 | \n",
+ " products | \n",
+ "
\n",
+ " \n",
+ " 18 | \n",
+ " 35 | \n",
+ " they | \n",
+ "
\n",
+ " \n",
+ " 19 | \n",
+ " 37 | \n",
+ " prefer | \n",
+ "
\n",
+ " \n",
+ " 20 | \n",
+ " 39 | \n",
+ " Obama's | \n",
+ "
\n",
+ " \n",
+ " 21 | \n",
+ " 41 | \n",
+ " scheme | \n",
+ "
\n",
+ " \n",
+ " 22 | \n",
+ " 43 | \n",
+ " regulate | \n",
+ "
\n",
+ " \n",
+ " 23 | \n",
+ " 45 | \n",
+ " the web | \n",
+ "
\n",
+ " \n",
+ " 24 | \n",
+ " 47 | \n",
+ " betrayal | \n",
+ "
\n",
+ " \n",
+ " 25 | \n",
+ " 49 | \n",
+ " the open Internet | \n",
+ "
\n",
+ " \n",
+ " 26 | \n",
+ " 51 | \n",
+ " stopped | \n",
+ "
\n",
+ " \n",
+ " 27 | \n",
+ " 53 | \n",
+ " free-market | \n",
+ "
\n",
+ " \n",
+ " 28 | \n",
+ " 55 | \n",
+ " system | \n",
+ "
\n",
+ " \n",
+ " 29 | \n",
+ " 57 | \n",
+ " functioned | \n",
+ "
\n",
+ " \n",
+ " 30 | \n",
+ " 59 | \n",
+ " supremely | \n",
+ "
\n",
+ " \n",
+ " 31 | \n",
+ " 61 | \n",
+ " well | \n",
+ "
\n",
+ " \n",
+ " 32 | \n",
+ " 63 | \n",
+ " decades | \n",
+ "
\n",
+ " \n",
+ " 33 | \n",
+ " 65 | \n",
+ " broad bipartisan | \n",
+ "
\n",
+ " \n",
+ " 34 | \n",
+ " 67 | \n",
+ " backing | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " seg_i option\n",
+ "0 0 Dear Chairman Pai, \n",
+ "1 1 I would like to comment on\n",
+ "2 3 Internet regulation\n",
+ "3 5 strongly\n",
+ "4 7 recommend\n",
+ "5 9 Chairman Pai\n",
+ "6 11 repeal\n",
+ "7 13 Obama's\n",
+ "8 15 scheme\n",
+ "9 17 regulate\n",
+ "10 19 the web\n",
+ "11 21 Americans\n",
+ "12 23 as opposed to\n",
+ "13 25 Washington bureaucrats\n",
+ "14 27 should\n",
+ "15 29 purchase\n",
+ "16 31 the\n",
+ "17 33 products\n",
+ "18 35 they\n",
+ "19 37 prefer\n",
+ "20 39 Obama's\n",
+ "21 41 scheme\n",
+ "22 43 regulate\n",
+ "23 45 the web\n",
+ "24 47 betrayal\n",
+ "25 49 the open Internet\n",
+ "26 51 stopped\n",
+ "27 53 free-market\n",
+ "28 55 system\n",
+ "29 57 functioned\n",
+ "30 59 supremely\n",
+ "31 61 well\n",
+ "32 63 decades\n",
+ "33 65 broad bipartisan\n",
+ "34 67 backing"
+ ]
+ },
+ "execution_count": 18,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "extract_permutations(sample_comments.iloc[0])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Here, we create a DataFrame of all extracted segments:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " seg_i | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 5 | \n",
+ " 7 | \n",
+ " 9 | \n",
+ " 11 | \n",
+ " 13 | \n",
+ " 15 | \n",
+ " 17 | \n",
+ " ... | \n",
+ " 49 | \n",
+ " 51 | \n",
+ " 53 | \n",
+ " 55 | \n",
+ " 57 | \n",
+ " 59 | \n",
+ " 61 | \n",
+ " 63 | \n",
+ " 65 | \n",
+ " 67 | \n",
+ "
\n",
+ " \n",
+ " comment_i | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " Dear Chairman Pai, | \n",
+ " I would like to comment on | \n",
+ " Internet regulation | \n",
+ " strongly | \n",
+ " recommend | \n",
+ " Chairman Pai | \n",
+ " repeal | \n",
+ " Obama's | \n",
+ " scheme | \n",
+ " regulate | \n",
+ " ... | \n",
+ " the open Internet | \n",
+ " stopped | \n",
+ " free-market | \n",
+ " system | \n",
+ " functioned | \n",
+ " supremely | \n",
+ " well | \n",
+ " decades | \n",
+ " broad bipartisan | \n",
+ " backing | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " To the Federal Communications Commission: | \n",
+ " I'm concerned about | \n",
+ " network neutrality regulations | \n",
+ " 'd like to | \n",
+ " request | \n",
+ " the government | \n",
+ " undo | \n",
+ " The previous administration's | \n",
+ " order | \n",
+ " control | \n",
+ " ... | \n",
+ " net neutrality | \n",
+ " broke | \n",
+ " market-based | \n",
+ " framework | \n",
+ " functioned | \n",
+ " remarkably | \n",
+ " smoothly | \n",
+ " many years | \n",
+ " nearly universal | \n",
+ " backing | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " Chairman Pai: | \n",
+ " My comments re: | \n",
+ " regulations on the Internet | \n",
+ " 'd like to | \n",
+ " suggest | \n",
+ " Ajit Pai | \n",
+ " rescind | \n",
+ " Obama's | \n",
+ " scheme | \n",
+ " take over | \n",
+ " ... | \n",
+ " the open Internet | \n",
+ " stopped | \n",
+ " free-market | \n",
+ " system | \n",
+ " functioned | \n",
+ " very, very | \n",
+ " smoothly | \n",
+ " decades | \n",
+ " both parties' | \n",
+ " approval | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " Dear Mr. Pai, | \n",
+ " Hi, I'd like to comment on | \n",
+ " the FCC rules on the Internet | \n",
+ " | \n",
+ " ask | \n",
+ " Ajit Pai | \n",
+ " reverse | \n",
+ " The Obama/Wheeler | \n",
+ " scheme | \n",
+ " regulate | \n",
+ " ... | \n",
+ " the open Internet | \n",
+ " reversed | \n",
+ " hands-off | \n",
+ " policy | \n",
+ " functioned | \n",
+ " remarkably | \n",
+ " smoothly | \n",
+ " many years | \n",
+ " Republican and Democrat | \n",
+ " consensus | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " Mr Pai: | \n",
+ " I'm contacting you about | \n",
+ " the FCC's Open Internet order | \n",
+ " | \n",
+ " request | \n",
+ " the FCC | \n",
+ " repeal | \n",
+ " The Obama/Wheeler | \n",
+ " plan | \n",
+ " take over | \n",
+ " ... | \n",
+ " the open Internet | \n",
+ " reversed | \n",
+ " light-touch | \n",
+ " system | \n",
+ " performed | \n",
+ " very, very | \n",
+ " smoothly | \n",
+ " many years | \n",
+ " Republican and Democrat | \n",
+ " backing | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 35 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ "seg_i 0 \\\n",
+ "comment_i \n",
+ "0 Dear Chairman Pai, \n",
+ "1 To the Federal Communications Commission: \n",
+ "2 Chairman Pai: \n",
+ "3 Dear Mr. Pai, \n",
+ "4 Mr Pai: \n",
+ "\n",
+ "seg_i 1 3 \\\n",
+ "comment_i \n",
+ "0 I would like to comment on Internet regulation \n",
+ "1 I'm concerned about network neutrality regulations \n",
+ "2 My comments re: regulations on the Internet \n",
+ "3 Hi, I'd like to comment on the FCC rules on the Internet \n",
+ "4 I'm contacting you about the FCC's Open Internet order \n",
+ "\n",
+ "seg_i 5 7 9 11 \\\n",
+ "comment_i \n",
+ "0 strongly recommend Chairman Pai repeal \n",
+ "1 'd like to request the government undo \n",
+ "2 'd like to suggest Ajit Pai rescind \n",
+ "3 ask Ajit Pai reverse \n",
+ "4 request the FCC repeal \n",
+ "\n",
+ "seg_i 13 15 17 ... \\\n",
+ "comment_i ... \n",
+ "0 Obama's scheme regulate ... \n",
+ "1 The previous administration's order control ... \n",
+ "2 Obama's scheme take over ... \n",
+ "3 The Obama/Wheeler scheme regulate ... \n",
+ "4 The Obama/Wheeler plan take over ... \n",
+ "\n",
+ "seg_i 49 51 53 55 57 \\\n",
+ "comment_i \n",
+ "0 the open Internet stopped free-market system functioned \n",
+ "1 net neutrality broke market-based framework functioned \n",
+ "2 the open Internet stopped free-market system functioned \n",
+ "3 the open Internet reversed hands-off policy functioned \n",
+ "4 the open Internet reversed light-touch system performed \n",
+ "\n",
+ "seg_i 59 61 63 65 \\\n",
+ "comment_i \n",
+ "0 supremely well decades broad bipartisan \n",
+ "1 remarkably smoothly many years nearly universal \n",
+ "2 very, very smoothly decades both parties' \n",
+ "3 remarkably smoothly many years Republican and Democrat \n",
+ "4 very, very smoothly many years Republican and Democrat \n",
+ "\n",
+ "seg_i 67 \n",
+ "comment_i \n",
+ "0 backing \n",
+ "1 backing \n",
+ "2 approval \n",
+ "3 consensus \n",
+ "4 backing \n",
+ "\n",
+ "[5 rows x 35 columns]"
+ ]
+ },
+ "execution_count": 19,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "extracted = (\n",
+ " pd.concat([ extract_permutations(x).assign(comment_i = i)\n",
+ " for i, x in enumerate(sample_comments) ])\n",
+ " .set_index([\n",
+ " \"comment_i\",\n",
+ " \"seg_i\",\n",
+ " ])\n",
+ " [\"option\"]\n",
+ " .unstack()\n",
+ ")\n",
+ "\n",
+ "extracted.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "To test for the independence of randomization, we calculate the correlation between any two segments in a comment:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " seg_a | \n",
+ " seg_b | \n",
+ " corr | \n",
+ " seg_int_a | \n",
+ " seg_int_b | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 2508 | \n",
+ " 0_ | \n",
+ " 1_Hi, I'd like to comment on | \n",
+ " 0.035640 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 2509 | \n",
+ " 0_Chairman Pai: | \n",
+ " 1_Hi, I'd like to comment on | \n",
+ " -0.043523 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 2510 | \n",
+ " 0_Dear Chairman Pai, | \n",
+ " 1_Hi, I'd like to comment on | \n",
+ " -0.030419 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 2511 | \n",
+ " 0_Dear Commissioners: | \n",
+ " 1_Hi, I'd like to comment on | \n",
+ " -0.040624 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 2512 | \n",
+ " 0_Dear FCC, | \n",
+ " 1_Hi, I'd like to comment on | \n",
+ " -0.005508 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " seg_a seg_b corr \\\n",
+ "2508 0_ 1_Hi, I'd like to comment on 0.035640 \n",
+ "2509 0_Chairman Pai: 1_Hi, I'd like to comment on -0.043523 \n",
+ "2510 0_Dear Chairman Pai, 1_Hi, I'd like to comment on -0.030419 \n",
+ "2511 0_Dear Commissioners: 1_Hi, I'd like to comment on -0.040624 \n",
+ "2512 0_Dear FCC, 1_Hi, I'd like to comment on -0.005508 \n",
+ "\n",
+ " seg_int_a seg_int_b \n",
+ "2508 0 1 \n",
+ "2509 0 1 \n",
+ "2510 0 1 \n",
+ "2511 0 1 \n",
+ "2512 0 1 "
+ ]
+ },
+ "execution_count": 20,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "segment_correlations = (\n",
+ " # Turn each permutation into a dummy variable\n",
+ " extracted\n",
+ " .pipe(pd.get_dummies)\n",
+ " \n",
+ " # Calculate the correlations between them\n",
+ " .corr()\n",
+ " .reset_index()\n",
+ " .rename(columns = { \"index\": \"seg_a\" })\n",
+ " \n",
+ " # Melt the correlation matrix into a long/tidy DataFrame\n",
+ " .melt(\n",
+ " id_vars = [ \"seg_a\" ],\n",
+ " var_name = \"seg_b\",\n",
+ " value_name = \"corr\",\n",
+ " )\n",
+ " .assign(\n",
+ " seg_int_a = lambda df: df[\"seg_a\"].str.extract(r\"^(\\d+)\", expand = False).astype(int),\n",
+ " seg_int_b = lambda df: df[\"seg_b\"].str.extract(r\"^(\\d+)\", expand = False).astype(int),\n",
+ " )\n",
+ " \n",
+ " # Take only the first correlation (A•B instead of both A•B and B•A)\n",
+ " # and ignore self-correlations\n",
+ " .loc[lambda df: df[\"seg_a\"] < df[\"seg_b\"]]\n",
+ " \n",
+ " # Ignore correlations within the same segment, since they are mutually exclusive\n",
+ " .loc[lambda df: df[\"seg_int_a\"] != df[\"seg_int_b\"]]\n",
+ ")\n",
+ "\n",
+ "segment_correlations.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The output below demonstrates that there are only a handful of pairs with a correlation above 0.15; they are all perfect correlations, meaning that the first segment choice guarantees the second. In this case, whatever is chosen for segments `13-19` is repeated for segments `39-45`. (Segments 14, 16, etc. are all fixed segments, and don't vary at all.)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " seg_a | \n",
+ " seg_b | \n",
+ " corr | \n",
+ " seg_int_a | \n",
+ " seg_int_b | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 29970 | \n",
+ " 13_Barack Obama's | \n",
+ " 39_Barack Obama's | \n",
+ " 1.0 | \n",
+ " 13 | \n",
+ " 39 | \n",
+ "
\n",
+ " \n",
+ " 30180 | \n",
+ " 13_Obama's | \n",
+ " 39_Obama's | \n",
+ " 1.0 | \n",
+ " 13 | \n",
+ " 39 | \n",
+ "
\n",
+ " \n",
+ " 30390 | \n",
+ " 13_President Obama's | \n",
+ " 39_President Obama's | \n",
+ " 1.0 | \n",
+ " 13 | \n",
+ " 39 | \n",
+ "
\n",
+ " \n",
+ " 30600 | \n",
+ " 13_The Obama/Wheeler | \n",
+ " 39_The Obama/Wheeler | \n",
+ " 1.0 | \n",
+ " 13 | \n",
+ " 39 | \n",
+ "
\n",
+ " \n",
+ " 30810 | \n",
+ " 13_The previous administration's | \n",
+ " 39_The previous administration's | \n",
+ " 1.0 | \n",
+ " 13 | \n",
+ " 39 | \n",
+ "
\n",
+ " \n",
+ " 31020 | \n",
+ " 13_Tom Wheeler's | \n",
+ " 39_Tom Wheeler's | \n",
+ " 1.0 | \n",
+ " 13 | \n",
+ " 39 | \n",
+ "
\n",
+ " \n",
+ " 31230 | \n",
+ " 15_decision | \n",
+ " 41_decision | \n",
+ " 1.0 | \n",
+ " 15 | \n",
+ " 41 | \n",
+ "
\n",
+ " \n",
+ " 31440 | \n",
+ " 15_order | \n",
+ " 41_order | \n",
+ " 1.0 | \n",
+ " 15 | \n",
+ " 41 | \n",
+ "
\n",
+ " \n",
+ " 31650 | \n",
+ " 15_plan | \n",
+ " 41_plan | \n",
+ " 1.0 | \n",
+ " 15 | \n",
+ " 41 | \n",
+ "
\n",
+ " \n",
+ " 31860 | \n",
+ " 15_policy | \n",
+ " 41_policy | \n",
+ " 1.0 | \n",
+ " 15 | \n",
+ " 41 | \n",
+ "
\n",
+ " \n",
+ " 32070 | \n",
+ " 15_power grab | \n",
+ " 41_power grab | \n",
+ " 1.0 | \n",
+ " 15 | \n",
+ " 41 | \n",
+ "
\n",
+ " \n",
+ " 32280 | \n",
+ " 15_scheme | \n",
+ " 41_scheme | \n",
+ " 1.0 | \n",
+ " 15 | \n",
+ " 41 | \n",
+ "
\n",
+ " \n",
+ " 32490 | \n",
+ " 17_control | \n",
+ " 43_control | \n",
+ " 1.0 | \n",
+ " 17 | \n",
+ " 43 | \n",
+ "
\n",
+ " \n",
+ " 32700 | \n",
+ " 17_regulate | \n",
+ " 43_regulate | \n",
+ " 1.0 | \n",
+ " 17 | \n",
+ " 43 | \n",
+ "
\n",
+ " \n",
+ " 32910 | \n",
+ " 17_take over | \n",
+ " 43_take over | \n",
+ " 1.0 | \n",
+ " 17 | \n",
+ " 43 | \n",
+ "
\n",
+ " \n",
+ " 33120 | \n",
+ " 19_Internet access | \n",
+ " 45_Internet access | \n",
+ " 1.0 | \n",
+ " 19 | \n",
+ " 45 | \n",
+ "
\n",
+ " \n",
+ " 33330 | \n",
+ " 19_broadband | \n",
+ " 45_broadband | \n",
+ " 1.0 | \n",
+ " 19 | \n",
+ " 45 | \n",
+ "
\n",
+ " \n",
+ " 33540 | \n",
+ " 19_the Internet | \n",
+ " 45_the Internet | \n",
+ " 1.0 | \n",
+ " 19 | \n",
+ " 45 | \n",
+ "
\n",
+ " \n",
+ " 33750 | \n",
+ " 19_the web | \n",
+ " 45_the web | \n",
+ " 1.0 | \n",
+ " 19 | \n",
+ " 45 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " seg_a seg_b \\\n",
+ "29970 13_Barack Obama's 39_Barack Obama's \n",
+ "30180 13_Obama's 39_Obama's \n",
+ "30390 13_President Obama's 39_President Obama's \n",
+ "30600 13_The Obama/Wheeler 39_The Obama/Wheeler \n",
+ "30810 13_The previous administration's 39_The previous administration's \n",
+ "31020 13_Tom Wheeler's 39_Tom Wheeler's \n",
+ "31230 15_decision 41_decision \n",
+ "31440 15_order 41_order \n",
+ "31650 15_plan 41_plan \n",
+ "31860 15_policy 41_policy \n",
+ "32070 15_power grab 41_power grab \n",
+ "32280 15_scheme 41_scheme \n",
+ "32490 17_control 43_control \n",
+ "32700 17_regulate 43_regulate \n",
+ "32910 17_take over 43_take over \n",
+ "33120 19_Internet access 45_Internet access \n",
+ "33330 19_broadband 45_broadband \n",
+ "33540 19_the Internet 45_the Internet \n",
+ "33750 19_the web 45_the web \n",
+ "\n",
+ " corr seg_int_a seg_int_b \n",
+ "29970 1.0 13 39 \n",
+ "30180 1.0 13 39 \n",
+ "30390 1.0 13 39 \n",
+ "30600 1.0 13 39 \n",
+ "30810 1.0 13 39 \n",
+ "31020 1.0 13 39 \n",
+ "31230 1.0 15 41 \n",
+ "31440 1.0 15 41 \n",
+ "31650 1.0 15 41 \n",
+ "31860 1.0 15 41 \n",
+ "32070 1.0 15 41 \n",
+ "32280 1.0 15 41 \n",
+ "32490 1.0 17 43 \n",
+ "32700 1.0 17 43 \n",
+ "32910 1.0 17 43 \n",
+ "33120 1.0 19 45 \n",
+ "33330 1.0 19 45 \n",
+ "33540 1.0 19 45 \n",
+ "33750 1.0 19 45 "
+ ]
+ },
+ "execution_count": 21,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "(\n",
+ " segment_correlations\n",
+ " .loc[lambda df: df[\"corr\"] > 0.15]\n",
+ " .sort_values(\"seg_a\")\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The output below demonstrates that there are no segment pairs with a correlation below -0.15, other than the possibilities inherently excluded by the perfect correlations above."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " seg_a | \n",
+ " seg_b | \n",
+ " corr | \n",
+ " seg_int_a | \n",
+ " seg_int_b | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 31015 | \n",
+ " 13_Barack Obama's | \n",
+ " 39_Tom Wheeler's | \n",
+ " -0.203366 | \n",
+ " 13 | \n",
+ " 39 | \n",
+ "
\n",
+ " \n",
+ " 30806 | \n",
+ " 13_Barack Obama's | \n",
+ " 39_The previous administration's | \n",
+ " -0.191720 | \n",
+ " 13 | \n",
+ " 39 | \n",
+ "
\n",
+ " \n",
+ " 30179 | \n",
+ " 13_Barack Obama's | \n",
+ " 39_Obama's | \n",
+ " -0.204086 | \n",
+ " 13 | \n",
+ " 39 | \n",
+ "
\n",
+ " \n",
+ " 30597 | \n",
+ " 13_Barack Obama's | \n",
+ " 39_The Obama/Wheeler | \n",
+ " -0.204086 | \n",
+ " 13 | \n",
+ " 39 | \n",
+ "
\n",
+ " \n",
+ " 30388 | \n",
+ " 13_Barack Obama's | \n",
+ " 39_President Obama's | \n",
+ " -0.200477 | \n",
+ " 13 | \n",
+ " 39 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 33331 | \n",
+ " 19_the Internet | \n",
+ " 45_broadband | \n",
+ " -0.327781 | \n",
+ " 19 | \n",
+ " 45 | \n",
+ "
\n",
+ " \n",
+ " 33749 | \n",
+ " 19_the Internet | \n",
+ " 45_the web | \n",
+ " -0.337228 | \n",
+ " 19 | \n",
+ " 45 | \n",
+ "
\n",
+ " \n",
+ " 33123 | \n",
+ " 19_the web | \n",
+ " 45_Internet access | \n",
+ " -0.338160 | \n",
+ " 19 | \n",
+ " 45 | \n",
+ "
\n",
+ " \n",
+ " 33332 | \n",
+ " 19_the web | \n",
+ " 45_broadband | \n",
+ " -0.355864 | \n",
+ " 19 | \n",
+ " 45 | \n",
+ "
\n",
+ " \n",
+ " 33541 | \n",
+ " 19_the web | \n",
+ " 45_the Internet | \n",
+ " -0.337228 | \n",
+ " 19 | \n",
+ " 45 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
78 rows × 5 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " seg_a seg_b corr \\\n",
+ "31015 13_Barack Obama's 39_Tom Wheeler's -0.203366 \n",
+ "30806 13_Barack Obama's 39_The previous administration's -0.191720 \n",
+ "30179 13_Barack Obama's 39_Obama's -0.204086 \n",
+ "30597 13_Barack Obama's 39_The Obama/Wheeler -0.204086 \n",
+ "30388 13_Barack Obama's 39_President Obama's -0.200477 \n",
+ "... ... ... ... \n",
+ "33331 19_the Internet 45_broadband -0.327781 \n",
+ "33749 19_the Internet 45_the web -0.337228 \n",
+ "33123 19_the web 45_Internet access -0.338160 \n",
+ "33332 19_the web 45_broadband -0.355864 \n",
+ "33541 19_the web 45_the Internet -0.337228 \n",
+ "\n",
+ " seg_int_a seg_int_b \n",
+ "31015 13 39 \n",
+ "30806 13 39 \n",
+ "30179 13 39 \n",
+ "30597 13 39 \n",
+ "30388 13 39 \n",
+ "... ... ... \n",
+ "33331 19 45 \n",
+ "33749 19 45 \n",
+ "33123 19 45 \n",
+ "33332 19 45 \n",
+ "33541 19 45 \n",
+ "\n",
+ "[78 rows x 5 columns]"
+ ]
+ },
+ "execution_count": 22,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "(\n",
+ " segment_correlations\n",
+ " .loc[lambda df: df[\"corr\"] < -0.15]\n",
+ " .sort_values(\"seg_a\")\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " seg_a | \n",
+ " seg_b | \n",
+ " corr | \n",
+ " seg_int_a | \n",
+ " seg_int_b | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ "Empty DataFrame\n",
+ "Columns: [seg_a, seg_b, corr, seg_int_a, seg_int_b]\n",
+ "Index: []"
+ ]
+ },
+ "execution_count": 23,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "(\n",
+ " segment_correlations\n",
+ " .loc[lambda df: df[\"corr\"] < -0.15]\n",
+ " .loc[lambda df: ~df[\"seg_int_a\"].isin([ 13, 15, 17, 19 ])]\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Show the repeated segments"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Segments `13-19`:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[\n",
+ " [\n",
+ " \"The previous administration's\",\n",
+ " \"The Obama/Wheeler\",\n",
+ " \"President Obama's\",\n",
+ " \"Barack Obama's\",\n",
+ " \"Tom Wheeler's\",\n",
+ " \"Obama's\"\n",
+ " ],\n",
+ " [\n",
+ " \" \"\n",
+ " ],\n",
+ " [\n",
+ " \"decision\",\n",
+ " \"scheme\",\n",
+ " \"policy\",\n",
+ " \"order\",\n",
+ " \"power grab\",\n",
+ " \"plan\"\n",
+ " ],\n",
+ " [\n",
+ " \" to \"\n",
+ " ],\n",
+ " [\n",
+ " \"regulate\",\n",
+ " \"control\",\n",
+ " \"take over\"\n",
+ " ],\n",
+ " [\n",
+ " \" \"\n",
+ " ],\n",
+ " [\n",
+ " \"broadband\",\n",
+ " \"the web\",\n",
+ " \"Internet access\",\n",
+ " \"the Internet\"\n",
+ " ]\n",
+ "]\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(json.dumps(segments[13:20], indent = 2))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Segments `39-45`:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[\n",
+ " [\n",
+ " \"The previous administration's\",\n",
+ " \"The Obama/Wheeler\",\n",
+ " \"President Obama's\",\n",
+ " \"Barack Obama's\",\n",
+ " \"Tom Wheeler's\",\n",
+ " \"Obama's\"\n",
+ " ],\n",
+ " [\n",
+ " \" \"\n",
+ " ],\n",
+ " [\n",
+ " \"decision\",\n",
+ " \"scheme\",\n",
+ " \"policy\",\n",
+ " \"order\",\n",
+ " \"power grab\",\n",
+ " \"plan\"\n",
+ " ],\n",
+ " [\n",
+ " \" to \"\n",
+ " ],\n",
+ " [\n",
+ " \"regulate\",\n",
+ " \"control\",\n",
+ " \"take over\"\n",
+ " ],\n",
+ " [\n",
+ " \" \"\n",
+ " ],\n",
+ " [\n",
+ " \"broadband\",\n",
+ " \"the web\",\n",
+ " \"Internet access\",\n",
+ " \"the Internet\"\n",
+ " ]\n",
+ "]\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(json.dumps(segments[39:46], indent = 2))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Calculate possible permutations\n",
+ "\n",
+ "Below, we calculate the total number of possible permutations, taking care to exclude the perfectly correlated segments (which we do by simply removing them from the calculation)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def calculate_permutations(segments):\n",
+ " count = reduce(lambda x, y: x * y, map(len, segments))\n",
+ " print(f\"Total permutations: {count:,d}\")\n",
+ " \n",
+ " log = math.log10(count)\n",
+ " print(f\"Log10: {log:.2f}\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def remove_segments(segments, indices):\n",
+ " return [ s for i, s in enumerate(segments) if i not in indices ]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 28,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Total permutations: 9,584,250,725,597,184,000,000\n",
+ "Log10: 21.98\n"
+ ]
+ }
+ ],
+ "source": [
+ "calculate_permutations(remove_segments(segments, [ 39, 41, 43, 45 ]))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "---\n",
+ "\n",
+ "---\n",
+ "\n",
+ "---"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.4"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}