From 8f10873fa3e6d91bbb2396485d159a27c8e8e775 Mon Sep 17 00:00:00 2001
From: Jeremy Singer-Vine <jsvine@gmail.com>
Date: Thu, 3 Oct 2019 08:24:32 -0400
Subject: [PATCH] Initial commit

---
 .gitignore                                   |   63 +
 Pipfile                                      |   17 +
 Pipfile.lock                                 |  495 ++
 README.md                                    |  117 +
 data/.keep                                   |    0
 notebooks/analyze-fcc-comments.ipynb         | 5491 ++++++++++++++++++
 notebooks/analyze-mb-comment-structure.ipynb | 2267 ++++++++
 7 files changed, 8450 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 Pipfile
 create mode 100644 Pipfile.lock
 create mode 100644 README.md
 create mode 100644 data/.keep
 create mode 100644 notebooks/analyze-fcc-comments.ipynb
 create mode 100644 notebooks/analyze-mb-comment-structure.ipynb

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..d6e556f
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,63 @@
+# Custom list:
+.ipynb_checkpoints
+.DS_Store
+
+#### joe made this: http://goel.io/joe
+
+#####=== Python ===#####
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+env/
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.cache
+nosetests.xml
+coverage.xml
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
diff --git a/Pipfile b/Pipfile
new file mode 100644
index 0000000..8778c61
--- /dev/null
+++ b/Pipfile
@@ -0,0 +1,17 @@
+[[source]]
+name = "pypi"
+url = "https://pypi.org/simple"
+verify_ssl = true
+
+[dev-packages]
+
+[packages]
+requests = "*"
+pandas = "*"
+jupyter = "*"
+requests-cache = "*"
+tqdm = "*"
+nbexec = "*"
+
+[requires]
+python_version = "3.6"
diff --git a/Pipfile.lock b/Pipfile.lock
new file mode 100644
index 0000000..3b9bf3c
--- /dev/null
+++ b/Pipfile.lock
@@ -0,0 +1,495 @@
+{
+    "_meta": {
+        "hash": {
+            "sha256": "682395d97dfd62d238e9bd70cf5d6cab49754a43ce3d5acac41efd94b6c1ac6e"
+        },
+        "pipfile-spec": 6,
+        "requires": {
+            "python_version": "3.6"
+        },
+        "sources": [
+            {
+                "name": "pypi",
+                "url": "https://pypi.org/simple",
+                "verify_ssl": true
+            }
+        ]
+    },
+    "default": {
+        "appnope": {
+            "hashes": [
+                "sha256:5b26757dc6f79a3b7dc9fab95359328d5747fcb2409d331ea66d0272b90ab2a0",
+                "sha256:8b995ffe925347a2138d7ac0fe77155e4311a0ea6d6da4f5128fe4b3cbe5ed71"
+            ],
+            "markers": "sys_platform == 'darwin'",
+            "version": "==0.1.0"
+        },
+        "attrs": {
+            "hashes": [
+                "sha256:69c0dbf2ed392de1cb5ec704444b08a5ef81680a61cb899dc08127123af36a79",
+                "sha256:f0b870f674851ecbfbbbd364d6b5cbdff9dcedbc7f3f5e18a6891057f21fe399"
+            ],
+            "version": "==19.1.0"
+        },
+        "backcall": {
+            "hashes": [
+                "sha256:38ecd85be2c1e78f77fd91700c76e14667dc21e2713b63876c0eb901196e01e4",
+                "sha256:bbbf4b1e5cd2bdb08f915895b51081c041bac22394fdfcfdfbe9f14b77c08bf2"
+            ],
+            "version": "==0.1.0"
+        },
+        "bleach": {
+            "hashes": [
+                "sha256:213336e49e102af26d9cde77dd2d0397afabc5a6bf2fed985dc35b5d1e285a16",
+                "sha256:3fdf7f77adcf649c9911387df51254b813185e32b2c6619f690b593a617e19fa"
+            ],
+            "version": "==3.1.0"
+        },
+        "certifi": {
+            "hashes": [
+                "sha256:e4f3620cfea4f83eedc95b24abd9cd56f3c4b146dd0177e83a21b4eb49e21e50",
+                "sha256:fd7c7c74727ddcf00e9acd26bba8da604ffec95bf1c2144e67aff7a8b50e6cef"
+            ],
+            "version": "==2019.9.11"
+        },
+        "chardet": {
+            "hashes": [
+                "sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae",
+                "sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691"
+            ],
+            "version": "==3.0.4"
+        },
+        "decorator": {
+            "hashes": [
+                "sha256:86156361c50488b84a3f148056ea716ca587df2f0de1d34750d35c21312725de",
+                "sha256:f069f3a01830ca754ba5258fde2278454a0b5b79e0d7f5c13b3b97e57d4acff6"
+            ],
+            "version": "==4.4.0"
+        },
+        "defusedxml": {
+            "hashes": [
+                "sha256:6687150770438374ab581bb7a1b327a847dd9c5749e396102de3fad4e8a3ef93",
+                "sha256:f684034d135af4c6cbb949b8a4d2ed61634515257a67299e5f940fbaa34377f5"
+            ],
+            "version": "==0.6.0"
+        },
+        "entrypoints": {
+            "hashes": [
+                "sha256:589f874b313739ad35be6e0cd7efde2a4e9b6fea91edcc34e58ecbb8dbe56d19",
+                "sha256:c70dd71abe5a8c85e55e12c19bd91ccfeec11a6e99044204511f9ed547d48451"
+            ],
+            "version": "==0.3"
+        },
+        "idna": {
+            "hashes": [
+                "sha256:c357b3f628cf53ae2c4c05627ecc484553142ca23264e593d327bcde5e9c3407",
+                "sha256:ea8b7f6188e6fa117537c3df7da9fc686d485087abf6ac197f9c46432f7e4a3c"
+            ],
+            "version": "==2.8"
+        },
+        "ipykernel": {
+            "hashes": [
+                "sha256:167c3ef08450f5e060b76c749905acb0e0fbef9365899377a4a1eae728864383",
+                "sha256:b503913e0b4cce7ed2de965457dfb2edd633e8234161a60e23f2fe2161345d12"
+            ],
+            "version": "==5.1.2"
+        },
+        "ipython": {
+            "hashes": [
+                "sha256:c4ab005921641e40a68e405e286e7a1fcc464497e14d81b6914b4fd95e5dee9b",
+                "sha256:dd76831f065f17bddd7eaa5c781f5ea32de5ef217592cf019e34043b56895aa1"
+            ],
+            "markers": "python_version >= '3.3'",
+            "version": "==7.8.0"
+        },
+        "ipython-genutils": {
+            "hashes": [
+                "sha256:72dd37233799e619666c9f639a9da83c34013a73e8bbc79a7a6348d93c61fab8",
+                "sha256:eb2e116e75ecef9d4d228fdc66af54269afa26ab4463042e33785b887c628ba8"
+            ],
+            "version": "==0.2.0"
+        },
+        "ipywidgets": {
+            "hashes": [
+                "sha256:13ffeca438e0c0f91ae583dc22f50379b9d6b28390ac7be8b757140e9a771516",
+                "sha256:e945f6e02854a74994c596d9db83444a1850c01648f1574adf144fbbabe05c97"
+            ],
+            "version": "==7.5.1"
+        },
+        "jedi": {
+            "hashes": [
+                "sha256:786b6c3d80e2f06fd77162a07fed81b8baa22dde5d62896a790a331d6ac21a27",
+                "sha256:ba859c74fa3c966a22f2aeebe1b74ee27e2a462f56d3f5f7ca4a59af61bfe42e"
+            ],
+            "version": "==0.15.1"
+        },
+        "jinja2": {
+            "hashes": [
+                "sha256:065c4f02ebe7f7cf559e49ee5a95fb800a9e4528727aec6f24402a5374c65013",
+                "sha256:14dd6caf1527abb21f08f86c784eac40853ba93edb79552aa1e4b8aef1b61c7b"
+            ],
+            "version": "==2.10.1"
+        },
+        "jsonschema": {
+            "hashes": [
+                "sha256:5f9c0a719ca2ce14c5de2fd350a64fd2d13e8539db29836a86adc990bb1a068f",
+                "sha256:8d4a2b7b6c2237e0199c8ea1a6d3e05bf118e289ae2b9d7ba444182a2959560d"
+            ],
+            "version": "==3.0.2"
+        },
+        "jupyter": {
+            "hashes": [
+                "sha256:3e1f86076bbb7c8c207829390305a2b1fe836d471ed54be66a3b8c41e7f46cc7",
+                "sha256:5b290f93b98ffbc21c0c7e749f054b3267782166d72fa5e3ed1ed4eaf34a2b78",
+                "sha256:d9dc4b3318f310e34c82951ea5d6683f67bed7def4b259fafbfe4f1beb1d8e5f"
+            ],
+            "index": "pypi",
+            "version": "==1.0.0"
+        },
+        "jupyter-client": {
+            "hashes": [
+                "sha256:73a809a2964afa07adcc1521537fddb58c2ffbb7e84d53dc5901cf80480465b3",
+                "sha256:98e8af5edff5d24e4d31e73bc21043130ae9d955a91aa93fc0bc3b1d0f7b5880"
+            ],
+            "version": "==5.3.1"
+        },
+        "jupyter-console": {
+            "hashes": [
+                "sha256:308ce876354924fb6c540b41d5d6d08acfc946984bf0c97777c1ddcb42e0b2f5",
+                "sha256:cc80a97a5c389cbd30252ffb5ce7cefd4b66bde98219edd16bf5cb6f84bb3568"
+            ],
+            "version": "==6.0.0"
+        },
+        "jupyter-core": {
+            "hashes": [
+                "sha256:2c6e7c1e9f2ac45b5c2ceea5730bc9008d92fe59d0725eac57b04c0edfba24f7",
+                "sha256:f4fa22d6cf25f34807c995f22d2923693575c70f02557bcbfbe59bd5ec8d8b84"
+            ],
+            "version": "==4.5.0"
+        },
+        "markupsafe": {
+            "hashes": [
+                "sha256:00bc623926325b26bb9605ae9eae8a215691f33cae5df11ca5424f06f2d1f473",
+                "sha256:09027a7803a62ca78792ad89403b1b7a73a01c8cb65909cd876f7fcebd79b161",
+                "sha256:09c4b7f37d6c648cb13f9230d847adf22f8171b1ccc4d5682398e77f40309235",
+                "sha256:1027c282dad077d0bae18be6794e6b6b8c91d58ed8a8d89a89d59693b9131db5",
+                "sha256:24982cc2533820871eba85ba648cd53d8623687ff11cbb805be4ff7b4c971aff",
+                "sha256:29872e92839765e546828bb7754a68c418d927cd064fd4708fab9fe9c8bb116b",
+                "sha256:43a55c2930bbc139570ac2452adf3d70cdbb3cfe5912c71cdce1c2c6bbd9c5d1",
+                "sha256:46c99d2de99945ec5cb54f23c8cd5689f6d7177305ebff350a58ce5f8de1669e",
+                "sha256:500d4957e52ddc3351cabf489e79c91c17f6e0899158447047588650b5e69183",
+                "sha256:535f6fc4d397c1563d08b88e485c3496cf5784e927af890fb3c3aac7f933ec66",
+                "sha256:62fe6c95e3ec8a7fad637b7f3d372c15ec1caa01ab47926cfdf7a75b40e0eac1",
+                "sha256:6dd73240d2af64df90aa7c4e7481e23825ea70af4b4922f8ede5b9e35f78a3b1",
+                "sha256:717ba8fe3ae9cc0006d7c451f0bb265ee07739daf76355d06366154ee68d221e",
+                "sha256:79855e1c5b8da654cf486b830bd42c06e8780cea587384cf6545b7d9ac013a0b",
+                "sha256:7c1699dfe0cf8ff607dbdcc1e9b9af1755371f92a68f706051cc8c37d447c905",
+                "sha256:88e5fcfb52ee7b911e8bb6d6aa2fd21fbecc674eadd44118a9cc3863f938e735",
+                "sha256:8defac2f2ccd6805ebf65f5eeb132adcf2ab57aa11fdf4c0dd5169a004710e7d",
+                "sha256:98c7086708b163d425c67c7a91bad6e466bb99d797aa64f965e9d25c12111a5e",
+                "sha256:9add70b36c5666a2ed02b43b335fe19002ee5235efd4b8a89bfcf9005bebac0d",
+                "sha256:9bf40443012702a1d2070043cb6291650a0841ece432556f784f004937f0f32c",
+                "sha256:ade5e387d2ad0d7ebf59146cc00c8044acbd863725f887353a10df825fc8ae21",
+                "sha256:b00c1de48212e4cc9603895652c5c410df699856a2853135b3967591e4beebc2",
+                "sha256:b1282f8c00509d99fef04d8ba936b156d419be841854fe901d8ae224c59f0be5",
+                "sha256:b2051432115498d3562c084a49bba65d97cf251f5a331c64a12ee7e04dacc51b",
+                "sha256:ba59edeaa2fc6114428f1637ffff42da1e311e29382d81b339c1817d37ec93c6",
+                "sha256:c8716a48d94b06bb3b2524c2b77e055fb313aeb4ea620c8dd03a105574ba704f",
+                "sha256:cd5df75523866410809ca100dc9681e301e3c27567cf498077e8551b6d20e42f",
+                "sha256:e249096428b3ae81b08327a63a485ad0878de3fb939049038579ac0ef61e17e7"
+            ],
+            "version": "==1.1.1"
+        },
+        "mistune": {
+            "hashes": [
+                "sha256:59a3429db53c50b5c6bcc8a07f8848cb00d7dc8bdb431a4ab41920d201d4756e",
+                "sha256:88a1051873018da288eee8538d476dffe1262495144b33ecb586c4ab266bb8d4"
+            ],
+            "version": "==0.8.4"
+        },
+        "nbconvert": {
+            "hashes": [
+                "sha256:427a468ec26e7d68a529b95f578d5cbf018cb4c1f889e897681c2b6d11897695",
+                "sha256:48d3c342057a2cf21e8df820d49ff27ab9f25fc72b8f15606bd47967333b2709"
+            ],
+            "version": "==5.6.0"
+        },
+        "nbexec": {
+            "hashes": [
+                "sha256:e367bac4a5c7cbd12e5b73b5e1011a4227b11d9e8e5a3811d8b8b9987eb235d0"
+            ],
+            "index": "pypi",
+            "version": "==0.0.2"
+        },
+        "nbformat": {
+            "hashes": [
+                "sha256:b9a0dbdbd45bb034f4f8893cafd6f652ea08c8c1674ba83f2dc55d3955743b0b",
+                "sha256:f7494ef0df60766b7cabe0a3651556345a963b74dbc16bc7c18479041170d402"
+            ],
+            "version": "==4.4.0"
+        },
+        "notebook": {
+            "hashes": [
+                "sha256:660976fe4fe45c7aa55e04bf4bccb9f9566749ff637e9020af3422f9921f9a5d",
+                "sha256:b0a290f5cc7792d50a21bec62b3c221dd820bf00efa916ce9aeec4b5354bde20"
+            ],
+            "version": "==6.0.1"
+        },
+        "numpy": {
+            "hashes": [
+                "sha256:05dbfe72684cc14b92568de1bc1f41e5f62b00f714afc9adee42f6311738091f",
+                "sha256:0d82cb7271a577529d07bbb05cb58675f2deb09772175fab96dc8de025d8ac05",
+                "sha256:10132aa1fef99adc85a905d82e8497a580f83739837d7cbd234649f2e9b9dc58",
+                "sha256:12322df2e21f033a60c80319c25011194cd2a21294cc66fee0908aeae2c27832",
+                "sha256:16f19b3aa775dddc9814e02a46b8e6ae6a54ed8cf143962b4e53f0471dbd7b16",
+                "sha256:3d0b0989dd2d066db006158de7220802899a1e5c8cf622abe2d0bd158fd01c2c",
+                "sha256:438a3f0e7b681642898fd7993d38e2bf140a2d1eafaf3e89bb626db7f50db355",
+                "sha256:5fd214f482ab53f2cea57414c5fb3e58895b17df6e6f5bca5be6a0bb6aea23bb",
+                "sha256:73615d3edc84dd7c4aeb212fa3748fb83217e00d201875a47327f55363cef2df",
+                "sha256:7bd355ad7496f4ce1d235e9814ec81ee3d28308d591c067ce92e49f745ba2c2f",
+                "sha256:7d077f2976b8f3de08a0dcf5d72083f4af5411e8fddacd662aae27baa2601196",
+                "sha256:a4092682778dc48093e8bda8d26ee8360153e2047826f95a3f5eae09f0ae3abf",
+                "sha256:b458de8624c9f6034af492372eb2fee41a8e605f03f4732f43fc099e227858b2",
+                "sha256:e70fc8ff03a961f13363c2c95ef8285e0cf6a720f8271836f852cc0fa64e97c8",
+                "sha256:ee8e9d7cad5fe6dde50ede0d2e978d81eafeaa6233fb0b8719f60214cf226578",
+                "sha256:f4a4f6aba148858a5a5d546a99280f71f5ee6ec8182a7d195af1a914195b21a2"
+            ],
+            "version": "==1.17.2"
+        },
+        "pandas": {
+            "hashes": [
+                "sha256:18d91a9199d1dfaa01ad645f7540370ba630bdcef09daaf9edf45b4b1bca0232",
+                "sha256:3f26e5da310a0c0b83ea50da1fd397de2640b02b424aa69be7e0784228f656c9",
+                "sha256:4182e32f4456d2c64619e97c58571fa5ca0993d1e8c2d9ca44916185e1726e15",
+                "sha256:426e590e2eb0e60f765271d668a30cf38b582eaae5ec9b31229c8c3c10c5bc21",
+                "sha256:5eb934a8f0dc358f0e0cdf314072286bbac74e4c124b64371395e94644d5d919",
+                "sha256:717928808043d3ea55b9bcde636d4a52d2236c246f6df464163a66ff59980ad8",
+                "sha256:8145f97c5ed71827a6ec98ceaef35afed1377e2d19c4078f324d209ff253ecb5",
+                "sha256:8744c84c914dcc59cbbb2943b32b7664df1039d99e834e1034a3372acb89ea4d",
+                "sha256:c1ac1d9590d0c9314ebf01591bd40d4c03d710bfc84a3889e5263c97d7891dee",
+                "sha256:cb2e197b7b0687becb026b84d3c242482f20cbb29a9981e43604eb67576da9f6",
+                "sha256:d4001b71ad2c9b84ff18b182cea22b7b6cbf624216da3ea06fb7af28d1f93165",
+                "sha256:d8930772adccb2882989ab1493fa74bd87d47c8ac7417f5dd3dd834ba8c24dc9",
+                "sha256:dfbb0173ee2399bc4ed3caf2d236e5c0092f948aafd0a15fbe4a0e77ee61a958",
+                "sha256:eebfbba048f4fa8ac711b22c78516e16ff8117d05a580e7eeef6b0c2be554c18",
+                "sha256:f1b21bc5cf3dbea53d33615d1ead892dfdae9d7052fa8898083bec88be20dcd2"
+            ],
+            "index": "pypi",
+            "version": "==0.25.1"
+        },
+        "pandocfilters": {
+            "hashes": [
+                "sha256:b3dd70e169bb5449e6bc6ff96aea89c5eea8c5f6ab5e207fc2f521a2cf4a0da9"
+            ],
+            "version": "==1.4.2"
+        },
+        "parso": {
+            "hashes": [
+                "sha256:63854233e1fadb5da97f2744b6b24346d2750b85965e7e399bec1620232797dc",
+                "sha256:666b0ee4a7a1220f65d367617f2cd3ffddff3e205f3f16a0284df30e774c2a9c"
+            ],
+            "version": "==0.5.1"
+        },
+        "pexpect": {
+            "hashes": [
+                "sha256:2094eefdfcf37a1fdbfb9aa090862c1a4878e5c7e0e7e7088bdb511c558e5cd1",
+                "sha256:9e2c1fd0e6ee3a49b28f95d4b33bc389c89b20af6a1255906e90ff1262ce62eb"
+            ],
+            "markers": "sys_platform != 'win32'",
+            "version": "==4.7.0"
+        },
+        "pickleshare": {
+            "hashes": [
+                "sha256:87683d47965c1da65cdacaf31c8441d12b8044cdec9aca500cd78fc2c683afca",
+                "sha256:9649af414d74d4df115d5d718f82acb59c9d418196b7b4290ed47a12ce62df56"
+            ],
+            "version": "==0.7.5"
+        },
+        "prometheus-client": {
+            "hashes": [
+                "sha256:71cd24a2b3eb335cb800c7159f423df1bd4dcd5171b234be15e3f31ec9f622da"
+            ],
+            "version": "==0.7.1"
+        },
+        "prompt-toolkit": {
+            "hashes": [
+                "sha256:11adf3389a996a6d45cc277580d0d53e8a5afd281d0c9ec71b28e6f121463780",
+                "sha256:2519ad1d8038fd5fc8e770362237ad0364d16a7650fb5724af6997ed5515e3c1",
+                "sha256:977c6583ae813a37dc1c2e1b715892461fcbdaa57f6fc62f33a528c4886c8f55"
+            ],
+            "version": "==2.0.9"
+        },
+        "ptyprocess": {
+            "hashes": [
+                "sha256:923f299cc5ad920c68f2bc0bc98b75b9f838b93b599941a6b63ddbc2476394c0",
+                "sha256:d7cc528d76e76342423ca640335bd3633420dc1366f258cb31d05e865ef5ca1f"
+            ],
+            "markers": "os_name != 'nt'",
+            "version": "==0.6.0"
+        },
+        "pygments": {
+            "hashes": [
+                "sha256:71e430bc85c88a430f000ac1d9b331d2407f681d6f6aec95e8bcfbc3df5b0127",
+                "sha256:881c4c157e45f30af185c1ffe8d549d48ac9127433f2c380c24b84572ad66297"
+            ],
+            "version": "==2.4.2"
+        },
+        "pyrsistent": {
+            "hashes": [
+                "sha256:34b47fa169d6006b32e99d4b3c4031f155e6e68ebcc107d6454852e8e0ee6533"
+            ],
+            "version": "==0.15.4"
+        },
+        "python-dateutil": {
+            "hashes": [
+                "sha256:7e6584c74aeed623791615e26efd690f29817a27c73085b78e4bad02493df2fb",
+                "sha256:c89805f6f4d64db21ed966fda138f8a5ed7a4fdbc1a8ee329ce1b74e3c74da9e"
+            ],
+            "version": "==2.8.0"
+        },
+        "pytz": {
+            "hashes": [
+                "sha256:26c0b32e437e54a18161324a2fca3c4b9846b74a8dccddd843113109e1116b32",
+                "sha256:c894d57500a4cd2d5c71114aaab77dbab5eabd9022308ce5ac9bb93a60a6f0c7"
+            ],
+            "version": "==2019.2"
+        },
+        "pyzmq": {
+            "hashes": [
+                "sha256:01636e95a88d60118479041c6aaaaf5419c6485b7b1d37c9c4dd424b7b9f1121",
+                "sha256:021dba0d1436516092c624359e5da51472b11ba8edffa334218912f7e8b65467",
+                "sha256:0463bd941b6aead494d4035f7eebd70035293dd6caf8425993e85ad41de13fa3",
+                "sha256:05fd51edd81eed798fccafdd49c936b6c166ffae7b32482e4d6d6a2e196af4e6",
+                "sha256:1fadc8fbdf3d22753c36d4172169d184ee6654f8d6539e7af25029643363c490",
+                "sha256:22efa0596cf245a78a99060fe5682c4cd00c58bb7614271129215c889062db80",
+                "sha256:260c70b7c018905ec3659d0f04db735ac830fe27236e43b9dc0532cf7c9873ef",
+                "sha256:2762c45e289732d4450406cedca35a9d4d71e449131ba2f491e0bf473e3d2ff2",
+                "sha256:2fc6cada8dc53521c1189596f1898d45c5f68603194d3a6453d6db4b27f4e12e",
+                "sha256:343b9710a61f2b167673bea1974e70b5dccfe64b5ed10626798f08c1f7227e72",
+                "sha256:41bf96d5f554598a0632c3ec28e3026f1d6591a50f580df38eff0b8067efb9e7",
+                "sha256:856b2cdf7a1e2cbb84928e1e8db0ea4018709b39804103d3a409e5584f553f57",
+                "sha256:85b869abc894672de9aecdf032158ea8ad01e2f0c3b09ef60e3687fb79418096",
+                "sha256:93f44739db69234c013a16990e43db1aa0af3cf5a4b8b377d028ff24515fbeb3",
+                "sha256:98fa3e75ccb22c0dc99654e3dd9ff693b956861459e8c8e8734dd6247b89eb29",
+                "sha256:9a22c94d2e93af8bebd4fcf5fa38830f5e3b1ff0d4424e2912b07651eb1bafb4",
+                "sha256:a7d3f4b4bbb5d7866ae727763268b5c15797cbd7b63ea17f3b0ec1067da8994b",
+                "sha256:b645a49376547b3816433a7e2d2a99135c8e651e50497e7ecac3bd126e4bea16",
+                "sha256:cf0765822e78cf9e45451647a346d443f66792aba906bc340f4e0ac7870c169c",
+                "sha256:dc398e1e047efb18bfab7a8989346c6921a847feae2cad69fedf6ca12fb99e2c",
+                "sha256:dd5995ae2e80044e33b5077fb4bc2b0c1788ac6feaf15a6b87a00c14b4bdd682",
+                "sha256:e03fe5e07e70f245dc9013a9d48ae8cc4b10c33a1968039c5a3b64b5d01d083d",
+                "sha256:ea09a306144dff2795e48439883349819bef2c53c0ee62a3c2fae429451843bb",
+                "sha256:f4e37f33da282c3c319849877e34f97f0a3acec09622ec61b7333205bdd13b52",
+                "sha256:fa4bad0d1d173dee3e8ef3c3eb6b2bb6c723fc7a661eeecc1ecb2fa99860dd45"
+            ],
+            "version": "==18.1.0"
+        },
+        "qtconsole": {
+            "hashes": [
+                "sha256:40d5d8e00d070ea266dbf6f0da74c4b9597b8b8d67cd8233c3ffd8debf923703",
+                "sha256:b91e7412587e6cfe1644696538f73baf5611e837be5406633218443b2827c6d9"
+            ],
+            "version": "==4.5.5"
+        },
+        "requests": {
+            "hashes": [
+                "sha256:11e007a8a2aa0323f5a921e9e6a2d7e4e67d9877e85773fba9ba6419025cbeb4",
+                "sha256:9cf5292fcd0f598c671cfc1e0d7d1a7f13bb8085e9a590f48c010551dc6c4b31"
+            ],
+            "index": "pypi",
+            "version": "==2.22.0"
+        },
+        "requests-cache": {
+            "hashes": [
+                "sha256:813023269686045f8e01e2289cc1e7e9ae5ab22ddd1e2849a9093ab3ab7270eb",
+                "sha256:81e13559baee64677a7d73b85498a5a8f0639e204517b5d05ff378e44a57831a"
+            ],
+            "index": "pypi",
+            "version": "==0.5.2"
+        },
+        "send2trash": {
+            "hashes": [
+                "sha256:60001cc07d707fe247c94f74ca6ac0d3255aabcb930529690897ca2a39db28b2",
+                "sha256:f1691922577b6fa12821234aeb57599d887c4900b9ca537948d2dac34aea888b"
+            ],
+            "version": "==1.5.0"
+        },
+        "six": {
+            "hashes": [
+                "sha256:3350809f0555b11f552448330d0b52d5f24c91a322ea4a15ef22629740f3761c",
+                "sha256:d16a0141ec1a18405cd4ce8b4613101da75da0e9a7aec5bdd4fa804d0e0eba73"
+            ],
+            "version": "==1.12.0"
+        },
+        "terminado": {
+            "hashes": [
+                "sha256:d9d012de63acb8223ac969c17c3043337c2fcfd28f3aea1ee429b345d01ef460",
+                "sha256:de08e141f83c3a0798b050ecb097ab6259c3f0331b2f7b7750c9075ced2c20c2"
+            ],
+            "version": "==0.8.2"
+        },
+        "testpath": {
+            "hashes": [
+                "sha256:46c89ebb683f473ffe2aab0ed9f12581d4d078308a3cb3765d79c6b2317b0109",
+                "sha256:b694b3d9288dbd81685c5d2e7140b81365d46c29f5db4bc659de5aa6b98780f8"
+            ],
+            "version": "==0.4.2"
+        },
+        "tornado": {
+            "hashes": [
+                "sha256:349884248c36801afa19e342a77cc4458caca694b0eda633f5878e458a44cb2c",
+                "sha256:398e0d35e086ba38a0427c3b37f4337327231942e731edaa6e9fd1865bbd6f60",
+                "sha256:4e73ef678b1a859f0cb29e1d895526a20ea64b5ffd510a2307b5998c7df24281",
+                "sha256:559bce3d31484b665259f50cd94c5c28b961b09315ccd838f284687245f416e5",
+                "sha256:abbe53a39734ef4aba061fca54e30c6b4639d3e1f59653f0da37a0003de148c7",
+                "sha256:c845db36ba616912074c5b1ee897f8e0124df269468f25e4fe21fe72f6edd7a9",
+                "sha256:c9399267c926a4e7c418baa5cbe91c7d1cf362d505a1ef898fde44a07c9dd8a5"
+            ],
+            "version": "==6.0.3"
+        },
+        "tqdm": {
+            "hashes": [
+                "sha256:1be3e4e3198f2d0e47b928e9d9a8ec1b63525db29095cec1467f4c5a4ea8ebf9",
+                "sha256:7e39a30e3d34a7a6539378e39d7490326253b7ee354878a92255656dc4284457"
+            ],
+            "index": "pypi",
+            "version": "==4.35.0"
+        },
+        "traitlets": {
+            "hashes": [
+                "sha256:9c4bd2d267b7153df9152698efb1050a5d84982d3384a37b2c1f7723ba3e7835",
+                "sha256:c6cb5e6f57c5a9bdaa40fa71ce7b4af30298fbab9ece9815b5d995ab6217c7d9"
+            ],
+            "version": "==4.3.2"
+        },
+        "urllib3": {
+            "hashes": [
+                "sha256:b246607a25ac80bedac05c6f282e3cdaf3afb65420fd024ac94435cabe6e18d1",
+                "sha256:dbe59173209418ae49d485b87d1681aefa36252ee85884c31346debd19463232"
+            ],
+            "version": "==1.25.3"
+        },
+        "wcwidth": {
+            "hashes": [
+                "sha256:3df37372226d6e63e1b1e1eda15c594bca98a22d33a23832a90998faa96bc65e",
+                "sha256:f4ebe71925af7b40a864553f761ed559b43544f8f71746c2d756c7fe788ade7c"
+            ],
+            "version": "==0.1.7"
+        },
+        "webencodings": {
+            "hashes": [
+                "sha256:a0af1213f3c2226497a97e2b3aa01a7e4bee4f403f95be16fc9acd2947514a78",
+                "sha256:b36a1c245f2d304965eb4e0a82848379241dc04b865afcc4aab16748587e1923"
+            ],
+            "version": "==0.5.1"
+        },
+        "widgetsnbextension": {
+            "hashes": [
+                "sha256:079f87d87270bce047512400efd70238820751a11d2d8cb137a5a5bdbaf255c7",
+                "sha256:bd314f8ceb488571a5ffea6cc5b9fc6cba0adaf88a9d2386b93a489751938bcd"
+            ],
+            "version": "==3.5.1"
+        }
+    },
+    "develop": {}
+}
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..d59aabe
--- /dev/null
+++ b/README.md
@@ -0,0 +1,117 @@
+# Analysis of comments submitted to three FCC public dockets
+
+This repository contains data, code, and methodology supporting [BuzzFeed News' analysis of comments submitted to three Federal Communications Commission (FCC) dockets](https://www.buzzfeednews.com/article/jsvine/net-neutrality-fcc-fake-comments-impersonation), published October 3, 2019:
+
+- 17-108 ("Restoring Internet Freedom")
+- 16-42 ("Expanding Consumers' Video Navigation Choices")
+- 14-28 ("Protecting and Promoting the Open Internet")
+
+Please see below for further details.
+
+## Data Sources
+
+The data in this repository comes from several sources:
+
+### The FCC's Electronic Comment Filing System (ECFS)
+
+The ECFS is the FCC's public portal for searching and accessing comments submitted to the commission's dockets. BuzzFeed News used the website to download each individually-listed comment for two of the dockets: [14-28](https://www.fcc.gov/ecfs/search/filings?date_disseminated=%5Bgte%5D2014-02-21%5Blte%5D2016-01-01&proceedings_name=14-28&sort=date_disseminated,ASC&submissiontype_description=COMMENT) and [16-42](https://www.fcc.gov/ecfs/search/filings?date_disseminated=%5Bgte%5D2016-02-23%5Blte%5D2018-10-01&proceedings_name=16-42&sort=date_disseminated,ASC&submissiontype_description=COMMENT). __Note__: Not all comments submitted to the FCC are individually listed; in some cases, an organization will submit a consolidated set of comments as a PDF, with signatures and/or commenters' information listed in that PDF. Because of the extraordinary variety and inconsistency of those files, BuzzFeed News did not disaggregate those comments.
+
+### The FCC's bulk download of Docket 17-108 comments
+
+On November 7, 2017, [the FCC released](https://ecfsapi.fcc.gov/file/11073095518421/DA-17-1089A1_Rcd.pdf) a "complete set of [Docket 17-108] filings submitted as of November 3, 2017"; BuzzFeed News used this download to examine docket-wide trends.
+
+### Bulk uploads to Docket 17-108, via FOIA
+
+In response to two FOIA requests, the FCC provided to BuzzFeed News the files submitted to the agency's [bulk-upload system for Docket 17-108](https://www.fcc.gov/restoring-internet-freedom-comments-wc-docket-no-17-108), plus associated metadata indicating the uploader's Box.com account and the time of the upload. According to the FCC, it provided all such files submitted. Although the agency provided a template for the uploads, some of the files — typically the smallest ones, containing just one comment each — do not conform to it and could not be incorporated easily. Those comments, which represent an exceedingly small percentage of all bulk-uploaded comments, have not been included in this repository's data; in many cases, the corresponding comments appear also not to have been added to the FCC's public comment portal. In certain other cases, the upload files use non-standard column names. In cases where the intention appeared to be clear, BuzzFeed News fixed the column names and included the data.
+
+### haveibeenpwned.com
+
+[Have I Been Pwned](https://haveibeenpwned.com/) is a website and service that identifies whether any given email address has been exposed in any of hundreds of major data breaches. BuzzFeed News used [HIBP's application programming interface](https://haveibeenpwned.com/API/v3) to determine the most common breaches associated with various groups of email addresses.
+
+## Personal Information Minimization
+
+Because it appears that many of the comments in the data above were submitted without the consent of the named commenters, we have taken the following steps:
+
+- Removing all raw personal-information columns (name, physical address, etc.).
+
+- Replacing each distinct email address with a randomly-assigned unique identifier. (Specifically, a [version 4 UUID](https://www.cryptosys.net/pki/uuid-rfc4122.html).)
+
+- Replacing each distinct email domain with a similar randomly-assigned unique identifier, except for very common domains. (Specifically, the 36 domains that are associated with 10,000 or more unique email addresses in the Docket 17-108 comments.)
+
+- Replacing each distinct combination of name + location (first line of street address, city, state, ZIP code) with another UUID. Before converting to UUIDs, ZIP codes are converted to zero-padded five-digit representations, and all strings are lowercased. For instance: `John Doe, 123 Smith Street, New York, NY 01111` will receive the same UUID as `john doe, 123 SMITH STREET, New York, ny 1111`, but neither will match submissions that put him at `123 Smith St.` (with the abbreviation).
+
+## Data Files
+
+The process above produces the files listed below. Several are too large to host on GitHub, so BuzzFeed News has [uploaded them here](https://archive.org/details/fcc-comments-and-bulk-uploads).
+
+### Comment data
+
+These files contain selected fields from the comment data listed above:
+
+- `bulk-uploads-17-108-with-uuids.csv`: Docket 17-108 bulk uploads, via FOIA
+- `comments-17-108-with-uuids.csv`: Docket 17-108, via FCC official download
+- `comments-14-28-with-uuids.csv`: Docket 14-28, via FCC online portal
+- `comments-16-42-with-uuids.csv`: Docket 16-42, via FCC online portal
+
+They contain the following columns:
+
+- `date`: The date of submission.
+- `id_submission`: The ID the FCC has assigned to the comment. __Note__: Not available in `bulk-uploads-17-108-with-uuids.csv`, because the FCC assigns the IDs *after* they are uploaded.
+- `comments`: The text of the comment. __Note__: This is sometimes modified by the FCC, for example by adding a filename or, as appears to be the case for some Docket 14-28 comments, removing boilerplate language. __Note__: Not included in `comments-17-108-with-uuids.csv` for file-size considerations, because this file is mainly used for domain-counts.
+- `name_and_location`: The UUID (see above) corresponding to the name and address information provided with the comment. __Note__: Not included in `comments-17-108-with-uuids.csv`.
+- `email_address`: The UUID (see above) corresponding to the email address provided with the comment. __Note__: In the FCC's commenting system, you don't have to control an email address to list it as the author of a comment.
+- `email_address_nonstandard`: If the email address contains nonstandard characters (such as `%`) or formatting (such as lacking an `@` symbol), this value will be `1`; otherwise, it will be `0`. This is used to filter out likely-invalid addresses before checking them on Have I Been Pwned.
+- `email_domain`: The domain of the email address, as a UUID unless it is one of the 36 domains described above.
+
+Additionally, `bulk-uploads-17-108-with-uuids.csv` contains the following columns:
+
+- `file`: The name of the file in which the comment was uploaded.
+- `uploader`: The email address associated with the Box.com account that uploaded the file.
+
+### Breach data
+
+These files list the breaches, per Have I Been Pwned, for email addresses in randomized samples of the comments bulk-uploaded to Docket 17-108:
+
+- `breaches-17-108-bulk-uploads-sample.csv`: 1,000-address sample of each of the eight bulk-uploaders whose Docket 17-108 uploads contained at least 10,000 unique email addresses.
+- `breaches-17-108-mb-sample.csv`: 10,000-address sample of Media Bridge's Docket 17-108 bulk-uploads.
+
+
+They contain the following columns:
+
+- `email_address`: The UUID (see above) corresponding to the email address examined.
+- `breach`: The name of the breach, [as returned by Have I Been Pwned](https://haveibeenpwned.com/API/v3).
+
+## Analysis
+
+The [`analyze-fcc-comments` notebook](notebooks/analyze-fcc-comments.ipynb) examines comments submitted to the three FCC dockets described above, the language used in them, and the timing of their submission. For Docket 17-108, the notebook also examines the email domains associated with the comments, as well as rates at which the email addresses in the bulk uploads overlap with those exposed in major data breaches. The notebook also examines the overlap between the contact information in Docket 16-42 and Docket 17-108.
+
+The [`analyze-mb-comment-structure` notebook](notebooks/analyze-mb-comment-structure.ipynb) examines the phrasing of the comments that Media Bridge submitted to Docket 17-108, and attempts to reverse-engineer the comments that use randomly-generated text.
+
+## Reproducibility
+
+The code running the analysis is written in Python 3, and requires the following Python libraries:
+
+- [jupyter](https://jupyter.org/) to run the notebook infrastructure
+- [pandas](https://pandas.pydata.org/) for data loading and analysis
+
+If you would like to reuse the code for fetching data from Have I Been Pwned's API, you will also need these Python libraries:
+
+- [requests](https://2.python-requests.org/en/master/) for HTTP requests
+- [requests-cache](https://requests-cache.readthedocs.io/en/latest/) for caching HTTP requests
+- [tqdm](https://tqdm.github.io) for progress bars
+
+If you use Pipenv, you can install all required libraries with `pipenv install`.
+
+As noted above, you will need to download the source data separately. Save the folder as this repository's `/data` directory.
+
+Execute the notebooks in the `notebooks/` directory to reproduce the findings.
+
+## Licensing
+
+All code in this repository is available under the [MIT License](https://opensource.org/licenses/MIT).
+
+## Questions / Feedback
+
+Contact Jeremy Singer-Vine at [jeremy.singer-vine@buzzfeed.com](mailto:jeremy.singer-vine@buzzfeed.com).
+
+Looking for more from BuzzFeed News? [Click here for a list of our open-sourced projects, data, and code.](https://github.com/BuzzFeedNews/everything)
diff --git a/data/.keep b/data/.keep
new file mode 100644
index 0000000..e69de29
diff --git a/notebooks/analyze-fcc-comments.ipynb b/notebooks/analyze-fcc-comments.ipynb
new file mode 100644
index 0000000..b51f557
--- /dev/null
+++ b/notebooks/analyze-fcc-comments.ipynb
@@ -0,0 +1,5491 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Analysis of comments in three FCC dockets\n",
+    "\n",
+    "This notebook contains Python code that runs the following steps:\n",
+    "\n",
+    "- Loading the four comment datasets under analysis (published comments for FCC dockets 14-28, 16-42, 17-108, plus bulk-uploaded comments for docket 17-108).\n",
+    "\n",
+    "\n",
+    "- Classifying the comments for dockets 14-28 and 16-42, based on the language used in them.\n",
+    "\n",
+    "\n",
+    "- Examining:\n",
+    "\n",
+    "    - How often email addresses in the 17-108 bulk uploads appear in data breaches identified by [Have I Been Pwned](https://haveibeenpwned.com/)\n",
+    "\n",
+    "    - The overlap between comments in docket 16-42 and bulk-uploaded comments in docket 17-108\n",
+    "\n",
+    "    - The comments attributed to Annie Reeves vis-a-vis the timing and language used in American Commitment's docket 14-28 and docket 16-42 mass-comment campaigns.\n",
+    "    \n",
+    "__Please see this repository's landing page and associated BuzzFeed News article (linked on the landing page) for context before continuing.__"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "---"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Import Python libraries and set key variables"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Standard libraries\n",
+    "import os\n",
+    "import sys\n",
+    "import time\n",
+    "import re\n",
+    "\n",
+    "# External libraries\n",
+    "import pandas as pd"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Change this to True if you plan to reuse this notebook\n",
+    "# and want to make HTTP requests to Have I Been Pwned's API\n",
+    "\n",
+    "MAKE_HTTP_REQUESTS = False\n",
+    "\n",
+    "if MAKE_HTTP_REQUESTS:\n",
+    "    from tqdm.auto import tqdm\n",
+    "    import requests\n",
+    "    import requests_cache\n",
+    "    \n",
+    "    # This is the API key for Have I Been Pwned\n",
+    "    HIBP_KEY = open(\"../hibp-key.txt\").read().strip()\n",
+    "    \n",
+    "    # Enables graphical progress bars when fetching HIBP data\n",
+    "    tqdm.pandas()\n",
+    "    \n",
+    "    # For caching HTTP requests\n",
+    "    requests_cache.install_cache(\n",
+    "        \"../hibp-requests-cache\",\n",
+    "        allowable_codes = (200, 404),\n",
+    "    )"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "BASE_PATH = \"../data/\"\n",
+    "\n",
+    "# In the sampling procedures below, we use this \"random state\"\n",
+    "# to make the samples reproducible. \n",
+    "RANDOM_STATE = 0"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Load comments"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def load_comments(path, **kwargs):\n",
+    "    return (\n",
+    "        pd.read_csv(\n",
+    "            path,\n",
+    "            dtype = str,\n",
+    "            **kwargs\n",
+    "        )\n",
+    "        .astype({\n",
+    "            \"email_address_nonstandard\": int\n",
+    "        })\n",
+    "    )"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Docket 17-108, bulk uploads"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>date</th>\n",
+       "      <th>comments</th>\n",
+       "      <th>file</th>\n",
+       "      <th>uploader</th>\n",
+       "      <th>email_address_nonstandard</th>\n",
+       "      <th>email_address</th>\n",
+       "      <th>email_domain</th>\n",
+       "      <th>name_and_location</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <td>0</td>\n",
+       "      <td>5/8/2017</td>\n",
+       "      <td>Dear FCC, I am am writing today to SUPPORT net...</td>\n",
+       "      <td>ecfs-input-template-17-108 (209).csv</td>\n",
+       "      <td>kathleenkintz@gmail.com</td>\n",
+       "      <td>0</td>\n",
+       "      <td>9f664e24-96aa-4d96-b453-24d926658b47</td>\n",
+       "      <td>gmail.com</td>\n",
+       "      <td>5100f64f-b025-467f-9aa6-0100fa615ae6</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1</td>\n",
+       "      <td>12/31/2017</td>\n",
+       "      <td>Dear FCC, I am writing you today because I spe...</td>\n",
+       "      <td>ecfs-input-template-17-108 (120).csv</td>\n",
+       "      <td>vgboy522@gmail.com</td>\n",
+       "      <td>0</td>\n",
+       "      <td>818761bf-4c51-4970-95e6-11b01bac631f</td>\n",
+       "      <td>gmail.com</td>\n",
+       "      <td>dda3bd6b-9ad2-42d0-af15-f12c0b8a9354</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2</td>\n",
+       "      <td>5/16/17</td>\n",
+       "      <td>Obama's Federal Communications Commission (FCC...</td>\n",
+       "      <td>TPA_3911_2017526.csv</td>\n",
+       "      <td>esmisc@mac.com</td>\n",
+       "      <td>0</td>\n",
+       "      <td>f2cf802f-0c01-4d1f-b28f-0efef2a053ba</td>\n",
+       "      <td>hotmail.com</td>\n",
+       "      <td>d9b96c36-796e-45d1-97d8-00647ae09d89</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3</td>\n",
+       "      <td>5/16/17</td>\n",
+       "      <td>Obama's Federal Communications Commission (FCC...</td>\n",
+       "      <td>TPA_3911_2017526.csv</td>\n",
+       "      <td>esmisc@mac.com</td>\n",
+       "      <td>0</td>\n",
+       "      <td>6966ae39-6da6-4a47-a1ec-7dc854030634</td>\n",
+       "      <td>gmail.com</td>\n",
+       "      <td>f6d75f39-e952-41ff-b7a9-3d86da811496</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4</td>\n",
+       "      <td>5/16/17</td>\n",
+       "      <td>Obama's Federal Communications Commission (FCC...</td>\n",
+       "      <td>TPA_3911_2017526.csv</td>\n",
+       "      <td>esmisc@mac.com</td>\n",
+       "      <td>0</td>\n",
+       "      <td>610afa24-f0df-44ff-b621-f545d371efab</td>\n",
+       "      <td>gmail.com</td>\n",
+       "      <td>1b3050d5-6f3a-495e-a67f-b3b61040fe02</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "         date                                           comments  \\\n",
+       "0    5/8/2017  Dear FCC, I am am writing today to SUPPORT net...   \n",
+       "1  12/31/2017  Dear FCC, I am writing you today because I spe...   \n",
+       "2     5/16/17  Obama's Federal Communications Commission (FCC...   \n",
+       "3     5/16/17  Obama's Federal Communications Commission (FCC...   \n",
+       "4     5/16/17  Obama's Federal Communications Commission (FCC...   \n",
+       "\n",
+       "                                   file                 uploader  \\\n",
+       "0  ecfs-input-template-17-108 (209).csv  kathleenkintz@gmail.com   \n",
+       "1  ecfs-input-template-17-108 (120).csv       vgboy522@gmail.com   \n",
+       "2                  TPA_3911_2017526.csv           esmisc@mac.com   \n",
+       "3                  TPA_3911_2017526.csv           esmisc@mac.com   \n",
+       "4                  TPA_3911_2017526.csv           esmisc@mac.com   \n",
+       "\n",
+       "   email_address_nonstandard                         email_address  \\\n",
+       "0                          0  9f664e24-96aa-4d96-b453-24d926658b47   \n",
+       "1                          0  818761bf-4c51-4970-95e6-11b01bac631f   \n",
+       "2                          0  f2cf802f-0c01-4d1f-b28f-0efef2a053ba   \n",
+       "3                          0  6966ae39-6da6-4a47-a1ec-7dc854030634   \n",
+       "4                          0  610afa24-f0df-44ff-b621-f545d371efab   \n",
+       "\n",
+       "  email_domain                     name_and_location  \n",
+       "0    gmail.com  5100f64f-b025-467f-9aa6-0100fa615ae6  \n",
+       "1    gmail.com  dda3bd6b-9ad2-42d0-af15-f12c0b8a9354  \n",
+       "2  hotmail.com  d9b96c36-796e-45d1-97d8-00647ae09d89  \n",
+       "3    gmail.com  f6d75f39-e952-41ff-b7a9-3d86da811496  \n",
+       "4    gmail.com  1b3050d5-6f3a-495e-a67f-b3b61040fe02  "
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "bulk_uploads_17_108 = load_comments(BASE_PATH + \"bulk-uploads-17-108-with-uuids.csv\")\n",
+    "\n",
+    "bulk_uploads_17_108.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Examine bulk-uploader metrics for 17-108"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>submissions</th>\n",
+       "      <th>unique_emails</th>\n",
+       "      <th>prop_with_email</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>uploader</th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <td>esmisc@mac.com</td>\n",
+       "      <td>4347979</td>\n",
+       "      <td>3966016</td>\n",
+       "      <td>1.0000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>shane@mediabridgellc.com</td>\n",
+       "      <td>1856553</td>\n",
+       "      <td>1501145</td>\n",
+       "      <td>1.0000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>mike@fightforthefuture.org</td>\n",
+       "      <td>1464423</td>\n",
+       "      <td>129682</td>\n",
+       "      <td>0.2464</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>karen@momsrising.org</td>\n",
+       "      <td>1069368</td>\n",
+       "      <td>17870</td>\n",
+       "      <td>0.0362</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>dutch@freepress.net</td>\n",
+       "      <td>528607</td>\n",
+       "      <td>3</td>\n",
+       "      <td>0.0000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>kurt@demandprogress.org</td>\n",
+       "      <td>412792</td>\n",
+       "      <td>290372</td>\n",
+       "      <td>1.0000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>fccfreedom@hmamail.com</td>\n",
+       "      <td>207007</td>\n",
+       "      <td>122252</td>\n",
+       "      <td>1.0000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>advocacy@mozilla.com</td>\n",
+       "      <td>82926</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.0000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>action@aclu.org</td>\n",
+       "      <td>48733</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.0000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>meaghan@mandatemedia.com</td>\n",
+       "      <td>17317</td>\n",
+       "      <td>16267</td>\n",
+       "      <td>1.0000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>ncatalano@ofa.us</td>\n",
+       "      <td>12230</td>\n",
+       "      <td>12230</td>\n",
+       "      <td>1.0000</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                            submissions  unique_emails  prop_with_email\n",
+       "uploader                                                               \n",
+       "esmisc@mac.com                  4347979        3966016           1.0000\n",
+       "shane@mediabridgellc.com        1856553        1501145           1.0000\n",
+       "mike@fightforthefuture.org      1464423         129682           0.2464\n",
+       "karen@momsrising.org            1069368          17870           0.0362\n",
+       "dutch@freepress.net              528607              3           0.0000\n",
+       "kurt@demandprogress.org          412792         290372           1.0000\n",
+       "fccfreedom@hmamail.com           207007         122252           1.0000\n",
+       "advocacy@mozilla.com              82926              0           0.0000\n",
+       "action@aclu.org                   48733              0           0.0000\n",
+       "meaghan@mandatemedia.com          17317          16267           1.0000\n",
+       "ncatalano@ofa.us                  12230          12230           1.0000"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "uploader_metrics = (\n",
+    "    bulk_uploads_17_108\n",
+    "    .assign(\n",
+    "        prop_with_email = lambda df: df[\"email_address\"].notnull()\n",
+    "    )\n",
+    "    .groupby(\"uploader\")\n",
+    "    .pipe(lambda grp: pd.DataFrame({\n",
+    "        \"submissions\": grp.size(),\n",
+    "        \"unique_emails\": grp[\"email_address\"].nunique(),        \n",
+    "        \"prop_with_email\": grp[\"prop_with_email\"].mean().round(4),\n",
+    "    }))\n",
+    ")\n",
+    "    \n",
+    "(\n",
+    "    uploader_metrics\n",
+    "    .sort_values(\"submissions\", ascending = False)\n",
+    "    .loc[lambda df: df[\"submissions\"] >= 10000]\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Docket 17-108, all comments"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>id_submission</th>\n",
+       "      <th>date</th>\n",
+       "      <th>email_address_nonstandard</th>\n",
+       "      <th>email_address</th>\n",
+       "      <th>email_domain</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <td>0</td>\n",
+       "      <td>04272972619149</td>\n",
+       "      <td>2017-04-27</td>\n",
+       "      <td>0</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1</td>\n",
+       "      <td>0427547924954</td>\n",
+       "      <td>2017-04-27</td>\n",
+       "      <td>0</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2</td>\n",
+       "      <td>10427918117987</td>\n",
+       "      <td>2017-04-27</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1f4830aa-726c-4206-9bef-cb3f2a57bb20</td>\n",
+       "      <td>gmail.com</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3</td>\n",
+       "      <td>10427080530667</td>\n",
+       "      <td>2017-04-27</td>\n",
+       "      <td>0</td>\n",
+       "      <td>f10d9c2b-2c98-44c2-9c7a-fe57b96930d8</td>\n",
+       "      <td>gmail.com</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4</td>\n",
+       "      <td>1042709110034</td>\n",
+       "      <td>2017-04-27</td>\n",
+       "      <td>0</td>\n",
+       "      <td>a6609a29-4b4c-4857-9e42-a886f61b8aaa</td>\n",
+       "      <td>d6b158e4-d116-4944-ab88-73091f1fc465</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "    id_submission        date  email_address_nonstandard  \\\n",
+       "0  04272972619149  2017-04-27                          0   \n",
+       "1   0427547924954  2017-04-27                          0   \n",
+       "2  10427918117987  2017-04-27                          0   \n",
+       "3  10427080530667  2017-04-27                          0   \n",
+       "4   1042709110034  2017-04-27                          0   \n",
+       "\n",
+       "                          email_address                          email_domain  \n",
+       "0                                   NaN                                   NaN  \n",
+       "1                                   NaN                                   NaN  \n",
+       "2  1f4830aa-726c-4206-9bef-cb3f2a57bb20                             gmail.com  \n",
+       "3  f10d9c2b-2c98-44c2-9c7a-fe57b96930d8                             gmail.com  \n",
+       "4  a6609a29-4b4c-4857-9e42-a886f61b8aaa  d6b158e4-d116-4944-ab88-73091f1fc465  "
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "comments_17_108 = (\n",
+    "    load_comments(BASE_PATH + \"comments-17-108-with-uuids.csv\")\n",
+    "    .assign(date = lambda df: df[\"date\"].str.slice(0, 10))\n",
+    ")\n",
+    "\n",
+    "comments_17_108.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Examine email domains attributed to these comments"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>count</th>\n",
+       "      <th>unique_addresses</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>email_domain</th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <td>gmail.com</td>\n",
+       "      <td>5000687</td>\n",
+       "      <td>4160788</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>yahoo.com</td>\n",
+       "      <td>2536892</td>\n",
+       "      <td>2126544</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>hotmail.com</td>\n",
+       "      <td>673018</td>\n",
+       "      <td>571156</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>aol.com</td>\n",
+       "      <td>632971</td>\n",
+       "      <td>508087</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>pornhub.com</td>\n",
+       "      <td>1030003</td>\n",
+       "      <td>233516</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>comcast.net</td>\n",
+       "      <td>208512</td>\n",
+       "      <td>158939</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>icloud.com</td>\n",
+       "      <td>106442</td>\n",
+       "      <td>91091</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>msn.com</td>\n",
+       "      <td>110056</td>\n",
+       "      <td>89398</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>hurra.de</td>\n",
+       "      <td>363357</td>\n",
+       "      <td>88571</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>outlook.com</td>\n",
+       "      <td>79411</td>\n",
+       "      <td>67890</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>att.net</td>\n",
+       "      <td>79823</td>\n",
+       "      <td>60640</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>live.com</td>\n",
+       "      <td>70139</td>\n",
+       "      <td>59210</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>sbcglobal.net</td>\n",
+       "      <td>70126</td>\n",
+       "      <td>51206</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>yahoo.fr</td>\n",
+       "      <td>93389</td>\n",
+       "      <td>48034</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>ymail.com</td>\n",
+       "      <td>45036</td>\n",
+       "      <td>37515</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>bellsouth.net</td>\n",
+       "      <td>40432</td>\n",
+       "      <td>32155</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>cox.net</td>\n",
+       "      <td>40137</td>\n",
+       "      <td>31260</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>verizon.net</td>\n",
+       "      <td>41933</td>\n",
+       "      <td>29236</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>yahoo.de</td>\n",
+       "      <td>97977</td>\n",
+       "      <td>28310</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>mail.ru</td>\n",
+       "      <td>60608</td>\n",
+       "      <td>24570</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>me.com</td>\n",
+       "      <td>26000</td>\n",
+       "      <td>19559</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>charter.net</td>\n",
+       "      <td>24425</td>\n",
+       "      <td>18487</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>einrot.com</td>\n",
+       "      <td>793148</td>\n",
+       "      <td>17091</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>gustr.com</td>\n",
+       "      <td>769010</td>\n",
+       "      <td>16813</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>rhyta.com</td>\n",
+       "      <td>773757</td>\n",
+       "      <td>16756</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>jourrapide.com</td>\n",
+       "      <td>782650</td>\n",
+       "      <td>16746</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>armyspy.com</td>\n",
+       "      <td>780664</td>\n",
+       "      <td>16741</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>dayrep.com</td>\n",
+       "      <td>770023</td>\n",
+       "      <td>16733</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>superrito.com</td>\n",
+       "      <td>767495</td>\n",
+       "      <td>16684</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>teleworm.us</td>\n",
+       "      <td>765488</td>\n",
+       "      <td>16673</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>cuvox.de</td>\n",
+       "      <td>775904</td>\n",
+       "      <td>16623</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>fleckens.hu</td>\n",
+       "      <td>776092</td>\n",
+       "      <td>16600</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>mail.com</td>\n",
+       "      <td>16392</td>\n",
+       "      <td>14657</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>rocketmail.com</td>\n",
+       "      <td>17112</td>\n",
+       "      <td>14266</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>windstream.net</td>\n",
+       "      <td>13496</td>\n",
+       "      <td>11107</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>earthlink.net</td>\n",
+       "      <td>18068</td>\n",
+       "      <td>11088</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                  count  unique_addresses\n",
+       "email_domain                             \n",
+       "gmail.com       5000687           4160788\n",
+       "yahoo.com       2536892           2126544\n",
+       "hotmail.com      673018            571156\n",
+       "aol.com          632971            508087\n",
+       "pornhub.com     1030003            233516\n",
+       "comcast.net      208512            158939\n",
+       "icloud.com       106442             91091\n",
+       "msn.com          110056             89398\n",
+       "hurra.de         363357             88571\n",
+       "outlook.com       79411             67890\n",
+       "att.net           79823             60640\n",
+       "live.com          70139             59210\n",
+       "sbcglobal.net     70126             51206\n",
+       "yahoo.fr          93389             48034\n",
+       "ymail.com         45036             37515\n",
+       "bellsouth.net     40432             32155\n",
+       "cox.net           40137             31260\n",
+       "verizon.net       41933             29236\n",
+       "yahoo.de          97977             28310\n",
+       "mail.ru           60608             24570\n",
+       "me.com            26000             19559\n",
+       "charter.net       24425             18487\n",
+       "einrot.com       793148             17091\n",
+       "gustr.com        769010             16813\n",
+       "rhyta.com        773757             16756\n",
+       "jourrapide.com   782650             16746\n",
+       "armyspy.com      780664             16741\n",
+       "dayrep.com       770023             16733\n",
+       "superrito.com    767495             16684\n",
+       "teleworm.us      765488             16673\n",
+       "cuvox.de         775904             16623\n",
+       "fleckens.hu      776092             16600\n",
+       "mail.com          16392             14657\n",
+       "rocketmail.com    17112             14266\n",
+       "windstream.net    13496             11107\n",
+       "earthlink.net     18068             11088"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "email_domains_17_108 = (\n",
+    "    comments_17_108\n",
+    "    .groupby([ \"email_domain\" ])\n",
+    "    .pipe(lambda grp: pd.DataFrame({\n",
+    "        \"count\": grp.size(),\n",
+    "        \"unique_addresses\": grp[\"email_address\"].nunique()\n",
+    "    }))\n",
+    "    .sort_values([ \"count\", \"unique_addresses\" ], ascending = False)\n",
+    ")\n",
+    "\n",
+    "(\n",
+    "    email_domains_17_108\n",
+    "    .loc[lambda df: df[\"unique_addresses\"] >= 10000]\n",
+    "    .sort_values(\"unique_addresses\", ascending = False)\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Here we count the comments and unique email addresses associated with FakeMailGenerator.com:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "FAKEMAIL_DOMAINS = [\n",
+    "    \"einrot.com\",\n",
+    "    \"jourrapide.com\",\n",
+    "    \"armyspy.com\",\n",
+    "    \"fleckens.hu\",\n",
+    "    \"cuvox.de\",\n",
+    "    \"rhyta.com\",\n",
+    "    \"dayrep.com\",\n",
+    "    \"gustr.com\",\n",
+    "    \"superrito.com\",\n",
+    "    \"teleworm.us\",\n",
+    "]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>count</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <td>count</td>\n",
+       "      <td>7754231</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>unique_addresses</td>\n",
+       "      <td>167460</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                    count\n",
+       "count             7754231\n",
+       "unique_addresses   167460"
+      ]
+     },
+     "execution_count": 10,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "(\n",
+    "    email_domains_17_108\n",
+    "    .loc[FAKEMAIL_DOMAINS]\n",
+    "    [[\"count\", \"unique_addresses\"]]\n",
+    "    .sum()\n",
+    "    .to_frame(\"count\")\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Docket 16-42"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>date</th>\n",
+       "      <th>id_submission</th>\n",
+       "      <th>comments</th>\n",
+       "      <th>email_address_nonstandard</th>\n",
+       "      <th>email_address</th>\n",
+       "      <th>email_domain</th>\n",
+       "      <th>name_and_location</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <td>0</td>\n",
+       "      <td>2016-02-19</td>\n",
+       "      <td>60001483702</td>\n",
+       "      <td>60001515146.txtThank you!  Very pleased to see...</td>\n",
+       "      <td>0</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>8ad10c4e-1354-42ba-83f1-be6b3c89f331</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1</td>\n",
+       "      <td>2016-02-22</td>\n",
+       "      <td>60001484317</td>\n",
+       "      <td>60001843102.txt[5/23/2016 7:55:30 PM]The excha...</td>\n",
+       "      <td>0</td>\n",
+       "      <td>310d8308-43a0-4b84-93dc-6662acdef829</td>\n",
+       "      <td>gmail.com</td>\n",
+       "      <td>d984ccab-11bb-4994-bcfe-f0d407fd03b5</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2</td>\n",
+       "      <td>2016-02-25</td>\n",
+       "      <td>60001486876</td>\n",
+       "      <td>60001518518.txtPlease eliminate the cable TV b...</td>\n",
+       "      <td>0</td>\n",
+       "      <td>7e6087df-a7a2-414f-8ebb-3be229805bec</td>\n",
+       "      <td>yahoo.com</td>\n",
+       "      <td>298bb4d9-8130-4ced-86e9-6a5d0c740c66</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3</td>\n",
+       "      <td>2016-02-27</td>\n",
+       "      <td>60001489444</td>\n",
+       "      <td>I?support?the?FCC?allowing?homeowners?to?be?fr...</td>\n",
+       "      <td>0</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>12614861-1a8f-4313-aeff-2366bcf18ca8</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4</td>\n",
+       "      <td>2016-02-29</td>\n",
+       "      <td>60001492083</td>\n",
+       "      <td>60001523826.txtAs a consumer, I agree with the...</td>\n",
+       "      <td>0</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>10268573-9386-42c7-ab31-4d76641e76ed</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "         date id_submission  \\\n",
+       "0  2016-02-19   60001483702   \n",
+       "1  2016-02-22   60001484317   \n",
+       "2  2016-02-25   60001486876   \n",
+       "3  2016-02-27   60001489444   \n",
+       "4  2016-02-29   60001492083   \n",
+       "\n",
+       "                                            comments  \\\n",
+       "0  60001515146.txtThank you!  Very pleased to see...   \n",
+       "1  60001843102.txt[5/23/2016 7:55:30 PM]The excha...   \n",
+       "2  60001518518.txtPlease eliminate the cable TV b...   \n",
+       "3  I?support?the?FCC?allowing?homeowners?to?be?fr...   \n",
+       "4  60001523826.txtAs a consumer, I agree with the...   \n",
+       "\n",
+       "   email_address_nonstandard                         email_address  \\\n",
+       "0                          0                                   NaN   \n",
+       "1                          0  310d8308-43a0-4b84-93dc-6662acdef829   \n",
+       "2                          0  7e6087df-a7a2-414f-8ebb-3be229805bec   \n",
+       "3                          0                                   NaN   \n",
+       "4                          0                                   NaN   \n",
+       "\n",
+       "  email_domain                     name_and_location  \n",
+       "0          NaN  8ad10c4e-1354-42ba-83f1-be6b3c89f331  \n",
+       "1    gmail.com  d984ccab-11bb-4994-bcfe-f0d407fd03b5  \n",
+       "2    yahoo.com  298bb4d9-8130-4ced-86e9-6a5d0c740c66  \n",
+       "3          NaN  12614861-1a8f-4313-aeff-2366bcf18ca8  \n",
+       "4          NaN  10268573-9386-42c7-ab31-4d76641e76ed  "
+      ]
+     },
+     "execution_count": 11,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "comments_16_42 = (\n",
+    "    load_comments(BASE_PATH + \"comments-16-42-with-uuids.csv\")\n",
+    "    .assign(date = lambda df: df[\"date\"].str.slice(0, 10))\n",
+    ")    \n",
+    "\n",
+    "comments_16_42.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Docket 14-28"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>date</th>\n",
+       "      <th>id_submission</th>\n",
+       "      <th>comments</th>\n",
+       "      <th>email_address_nonstandard</th>\n",
+       "      <th>email_address</th>\n",
+       "      <th>email_domain</th>\n",
+       "      <th>name_and_location</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <td>0</td>\n",
+       "      <td>2014-02-21</td>\n",
+       "      <td>6017589853</td>\n",
+       "      <td>7521074305.txt Reclassify The Internet As A Co...</td>\n",
+       "      <td>0</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>a0fad65b-1482-427d-b300-da8e63d14272</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1</td>\n",
+       "      <td>2014-02-21</td>\n",
+       "      <td>6017589866</td>\n",
+       "      <td>7521074318.txt Reclassify The Internet As A Co...</td>\n",
+       "      <td>0</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>45954c7c-d52d-48f0-a252-343b4f82e509</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2</td>\n",
+       "      <td>2014-02-21</td>\n",
+       "      <td>6017589903</td>\n",
+       "      <td>7521074355.txt Reclassify The Internet As A Co...</td>\n",
+       "      <td>0</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>0a1dea8a-3ae6-434f-be03-4b6447c3190c</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3</td>\n",
+       "      <td>2014-02-21</td>\n",
+       "      <td>6017589904</td>\n",
+       "      <td>7521074356.txt Reclassify The Internet As A Co...</td>\n",
+       "      <td>0</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>5cb3b14f-71a4-4763-8bd1-40ad440c5eb8</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4</td>\n",
+       "      <td>2014-02-21</td>\n",
+       "      <td>6017589924</td>\n",
+       "      <td>7521074376.txt Reclassify The Internet As A Co...</td>\n",
+       "      <td>0</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>0de47b78-a256-4e36-93ad-1f4830b07c48</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "         date id_submission  \\\n",
+       "0  2014-02-21    6017589853   \n",
+       "1  2014-02-21    6017589866   \n",
+       "2  2014-02-21    6017589903   \n",
+       "3  2014-02-21    6017589904   \n",
+       "4  2014-02-21    6017589924   \n",
+       "\n",
+       "                                            comments  \\\n",
+       "0  7521074305.txt Reclassify The Internet As A Co...   \n",
+       "1  7521074318.txt Reclassify The Internet As A Co...   \n",
+       "2  7521074355.txt Reclassify The Internet As A Co...   \n",
+       "3  7521074356.txt Reclassify The Internet As A Co...   \n",
+       "4  7521074376.txt Reclassify The Internet As A Co...   \n",
+       "\n",
+       "   email_address_nonstandard email_address email_domain  \\\n",
+       "0                          0           NaN          NaN   \n",
+       "1                          0           NaN          NaN   \n",
+       "2                          0           NaN          NaN   \n",
+       "3                          0           NaN          NaN   \n",
+       "4                          0           NaN          NaN   \n",
+       "\n",
+       "                      name_and_location  \n",
+       "0  a0fad65b-1482-427d-b300-da8e63d14272  \n",
+       "1  45954c7c-d52d-48f0-a252-343b4f82e509  \n",
+       "2  0a1dea8a-3ae6-434f-be03-4b6447c3190c  \n",
+       "3  5cb3b14f-71a4-4763-8bd1-40ad440c5eb8  \n",
+       "4  0de47b78-a256-4e36-93ad-1f4830b07c48  "
+      ]
+     },
+     "execution_count": 12,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "comments_14_28 = (\n",
+    "    load_comments(BASE_PATH + \"comments-14-28-with-uuids.csv\")\n",
+    "    .assign(date = lambda df: df[\"date\"].str.slice(0, 10))\n",
+    ")    \n",
+    "\n",
+    "comments_14_28.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Classify comments\n",
+    "\n",
+    "In this step, we create derivative dataframes that classify each comment based on the language used in it. (Note: Because the formatting of comments can be inconsistent, the classification approach ignores whitespace.)\n",
+    "\n",
+    "The classifier takes a series of texts and a series of patterns to look for. Each text is labeled according to the __first__ pattern it matches, checking the patterns in sequential order; if the text matches no pattern, it is labeled `[other]`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def classify(texts, patterns):\n",
+    "    # Create a copy of the texts and remove whitespace\n",
+    "    s = texts.copy().str.replace(r\"\\s+\", \"\")\n",
+    "    \n",
+    "    # Remove whitespace from classification patterns\n",
+    "    without_whitespace = [ (re.sub(r\"\\s+\", \"\", pat), val)\n",
+    "        for pat, val in patterns ]\n",
+    "    \n",
+    "    # An empty series, indexed identically to the original texts.\n",
+    "    ix = pd.Series(None, index = texts.index)\n",
+    "\n",
+    "    # As we progress through the matching, we will gradually\n",
+    "    # fill `ix` in with the matches we've found.\n",
+    "    \n",
+    "    # Iterate through the classification patterns\n",
+    "    for pat, val in without_whitespace:\n",
+    "        # Determine which texts match\n",
+    "        search_result = s.str.contains(pat, na = False)\n",
+    "        matches = search_result.loc[lambda x: x == True]\n",
+    "        \n",
+    "        # For matches, update `ix` to indicate the pattern ID/description\n",
+    "        ix.loc[matches.index] = val\n",
+    "        \n",
+    "        # Subset `s` so that it only contains unmatched texts\n",
+    "        s = s.loc[s.index.difference(matches.index)]\n",
+    "\n",
+    "    return ix.fillna(\"[other]\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def add_classification(df, patterns):\n",
+    "    return (\n",
+    "        df\n",
+    "        .assign(group = lambda df: (\n",
+    "            df[\"comments\"]\n",
+    "            .pipe(classify, patterns)\n",
+    "        ))\n",
+    "    )"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def print_example_comments(df, n = 3, max_chars = 500):\n",
+    "    for grp, subdf in df.groupby(\"group\"):\n",
+    "        print(f\"=== {grp} ===\\n\")\n",
+    "        \n",
+    "        examples = (\n",
+    "            subdf[\"comments\"]\n",
+    "            .sample(n, random_state = RANDOM_STATE)\n",
+    "            .pipe(lambda x: pd.np.where(\n",
+    "                x.apply(len) > max_chars,\n",
+    "                x.str.slice(0, max_chars) + \"[...]\",\n",
+    "                x\n",
+    "            ))\n",
+    "        )\n",
+    "        \n",
+    "        print(\"\\n\\n\".join(examples) + \"\\n\\n\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Docket 14-28\n",
+    "\n",
+    "BuzzFeed News identified the phrases below based on extensive examination of the 14-28 docket, and by cross-referencing them with [this December 2014 Sunlight Foundation analysis](http://web.archive.org/web/20150301070951/http://sunlightfoundation.com/blog/2014/12/16/one-group-dominates-the-second-round-of-net-neutrality-comments/).\n",
+    "\n",
+    "The `AC-` comments use language from American Commitment's comment campaign. It is possible that entities other than American Commitment submitted comments that used the same language. \n",
+    "\n",
+    "Note: The final phrase in the list below also appears alongside some of the other permutations; but because it is the final phrase in the list, only comments that don't match the other phrasings receive this classification.\n",
+    "\n",
+    "Please see this repository's landing page, and the associated BuzzFeed News article, for additional context. (E.g., not all comments are individually retrievable from the FCC's public portal.)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ac_patterns_main = [\n",
+    "    # Earlier set of comments\n",
+    "    \"The federal government can use their power over the internet to direct content\", \n",
+    "    \"because of high barriers to entry and a disastrous lack of competition\", \n",
+    "    \"federal bureaucrats will slow down the process and protect prevailing interests first\", \n",
+    "    \"blossoming in America today, largely due to the internet\", \n",
+    "    \"Government will make it impossible for internet providers to upgrade service\", \n",
+    "    \"government will naturally favor entrenched special interests, rather than upstart companies\", \n",
+    "    \"will begin to be mismanaged, like many other government-run industries\", \n",
+    "    \"put directly in the hands of bureaucrats, instead of the free market\", \n",
+    "    \"will result in worse service, even as costs continue to skyrocket\", \n",
+    "    \"Our options for cheap, high-speed, high-performing internet providers\", \n",
+    "\n",
+    "    # Later set of comments\n",
+    "    \"Left-wing extremists have been crying wolf\", \n",
+    "    \"The federal government needs to keep its hands off the Internet\", \n",
+    "    \"Before the FCC places regulatory handcuffs on Internet providers\", \n",
+    "    \"The notion that the internet is broken and needs repair is simply not true\", \n",
+    "    \"will send the crown jewel of the US economy into an economic tailspin\", \n",
+    "    \"no longer acting in the interests of the American people\", \n",
+    "    \"just another slow-moving government-controlled mess\", \n",
+    "    \"defend ourselves against power-hungry bureaucrats\", \n",
+    "    \"simply another attempt by the federal government to take control of another sector of the economy\", \n",
+    "    \"Millions of liberal fools demanding you reduce the Internet\", \n",
+    "    \"FCC is clearly ignoring the will of the American people\", \n",
+    "    \"devastate private investment with the force of an atomic bomb\", \n",
+    "    \"without being slowed by bureaucratic inertia\", \n",
+    "    \"A small fringe of the extremist left has been demanding\", \n",
+    "    \"a tiny minority of far-left political activists\", \n",
+    "    \"ultimate goal is to get rid of the media capitalists\", \n",
+    "    \"created economic and human wreckage in their wake\", \n",
+    "    \"increase its own power at the expense of the free people\", \n",
+    "    \"it will have proven itself to be an unaccountable agency\", \n",
+    "    \"subjecting it to 1930s-style regulations meant for telephone monopolies\", \n",
+    "    \"Government regulation of Internet services would chase investment\", \n",
+    "    \"it will seriously degrade the Internet we have\", \n",
+    "    \"it can and should suffer the consequences\", \n",
+    "    \"taking such reckless actions to gain control over the Internet\", \n",
+    "    \"simply is no evidence to back up the dire claims of disaster\", \n",
+    "]\n",
+    "\n",
+    "ac_patterns_other = [\n",
+    "    \"Like many Americans, I believe that the internet should remain free of government\",\n",
+    "    \"As an American citizen, I wanted to voice my opposition to the FCC\",\n",
+    "]\n",
+    "\n",
+    "ac_pattern_desciptions = (\n",
+    "    [ (p, f\"AC-{i:02d}\") for i, p in enumerate(ac_patterns_main) ] +\n",
+    "    [ (p, f\"AC-other\") for p in ac_patterns_other ]\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Notes:\n",
+    "\n",
+    "- The `AC-XX` classification names below are based simply on the order in which they appear above. The numbers have no independent meaning.\n",
+    "\n",
+    "- The `AC-other` classification indicates that key language (the two phrases in `ac_patterns_other` above) from American Commitment appears in the comment, but not any of the other phrases."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "AC-00          1261\n",
+       "AC-01          1233\n",
+       "AC-02          1246\n",
+       "AC-03          1232\n",
+       "AC-04          1269\n",
+       "AC-05          1208\n",
+       "AC-06          1210\n",
+       "AC-07          1207\n",
+       "AC-08          1202\n",
+       "AC-09          1186\n",
+       "AC-10         25801\n",
+       "AC-11         25781\n",
+       "AC-12         25951\n",
+       "AC-13         26012\n",
+       "AC-14         25667\n",
+       "AC-15         25879\n",
+       "AC-16         25727\n",
+       "AC-17         26009\n",
+       "AC-18         25658\n",
+       "AC-19         25788\n",
+       "AC-20         25924\n",
+       "AC-21         25914\n",
+       "AC-22         25950\n",
+       "AC-23         25865\n",
+       "AC-24         25864\n",
+       "AC-25         25620\n",
+       "AC-26         26044\n",
+       "AC-27         25932\n",
+       "AC-28         25745\n",
+       "AC-29         26024\n",
+       "AC-30         25624\n",
+       "AC-31         25880\n",
+       "AC-32         25691\n",
+       "AC-33         25615\n",
+       "AC-34         25836\n",
+       "AC-other          6\n",
+       "[other]     1396620\n",
+       "Name: group, dtype: int64"
+      ]
+     },
+     "execution_count": 17,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "comments_14_28_classified = (\n",
+    "    comments_14_28\n",
+    "    .pipe(\n",
+    "        add_classification,\n",
+    "        ac_pattern_desciptions\n",
+    "    )\n",
+    ")\n",
+    "\n",
+    "(\n",
+    "    comments_14_28_classified\n",
+    "    [\"group\"]\n",
+    "    .value_counts()\n",
+    "    .sort_index()\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The total number of comments and unique email addresses for all `AC-`-classified comments above:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>count</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <td>comments</td>\n",
+       "      <td>658061</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>unique_email_addresses</td>\n",
+       "      <td>551855</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                         count\n",
+       "comments                658061\n",
+       "unique_email_addresses  551855"
+      ]
+     },
+     "execution_count": 18,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "(\n",
+    "    comments_14_28_classified\n",
+    "    .loc[lambda df: df[\"group\"] != \"[other]\"]\n",
+    "    .pipe(lambda df: pd.Series({\n",
+    "        \"comments\": len(df),\n",
+    "        \"unique_email_addresses\": df[\"email_address\"].nunique()\n",
+    "    }))\n",
+    "    .to_frame(\"count\")\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Dates submitted\n",
+    "\n",
+    "The analysis below demonstrates the following:\n",
+    "\n",
+    "- Comments `AC-00`-`AC-09` share a similar distribution of dates submitted\n",
+    "- Comments `AC-10`-`AC-34` also share a similar distribution of dates submitted, but distinct from `AC-00`-`AC-09`\n",
+    "\n",
+    "Additional notes:\n",
+    "\n",
+    "- Dates below are `MM-DD`, for 2014\n",
+    "\n",
+    "- Dates include only those with at least 200 total `AC-` classified comments (overall), to reduce noise of stray dates that contain relatively few matching comments"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th>date</th>\n",
+       "      <th>07-14</th>\n",
+       "      <th>07-16</th>\n",
+       "      <th>07-17</th>\n",
+       "      <th>09-11</th>\n",
+       "      <th>09-12</th>\n",
+       "      <th>09-13</th>\n",
+       "      <th>09-14</th>\n",
+       "      <th>09-15</th>\n",
+       "      <th>09-16</th>\n",
+       "      <th>09-17</th>\n",
+       "      <th>09-18</th>\n",
+       "      <th>09-19</th>\n",
+       "      <th>09-22</th>\n",
+       "      <th>09-23</th>\n",
+       "      <th>09-24</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>group</th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <td>AC-00</td>\n",
+       "      <td>45</td>\n",
+       "      <td>909</td>\n",
+       "      <td>305</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>AC-01</td>\n",
+       "      <td>39</td>\n",
+       "      <td>905</td>\n",
+       "      <td>287</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>AC-02</td>\n",
+       "      <td>52</td>\n",
+       "      <td>888</td>\n",
+       "      <td>304</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>AC-03</td>\n",
+       "      <td>45</td>\n",
+       "      <td>893</td>\n",
+       "      <td>293</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>AC-04</td>\n",
+       "      <td>80</td>\n",
+       "      <td>902</td>\n",
+       "      <td>284</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>AC-05</td>\n",
+       "      <td>36</td>\n",
+       "      <td>872</td>\n",
+       "      <td>300</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>AC-06</td>\n",
+       "      <td>47</td>\n",
+       "      <td>906</td>\n",
+       "      <td>257</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>AC-07</td>\n",
+       "      <td>48</td>\n",
+       "      <td>859</td>\n",
+       "      <td>299</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>AC-08</td>\n",
+       "      <td>45</td>\n",
+       "      <td>854</td>\n",
+       "      <td>303</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>AC-09</td>\n",
+       "      <td>44</td>\n",
+       "      <td>878</td>\n",
+       "      <td>263</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>AC-10</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>2478</td>\n",
+       "      <td>3896</td>\n",
+       "      <td>4591</td>\n",
+       "      <td>3510</td>\n",
+       "      <td>6458</td>\n",
+       "      <td>2167</td>\n",
+       "      <td>2143</td>\n",
+       "      <td>225</td>\n",
+       "      <td>63</td>\n",
+       "      <td>112</td>\n",
+       "      <td>80</td>\n",
+       "      <td>11</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>AC-11</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>2483</td>\n",
+       "      <td>3911</td>\n",
+       "      <td>4613</td>\n",
+       "      <td>3574</td>\n",
+       "      <td>6365</td>\n",
+       "      <td>2204</td>\n",
+       "      <td>2097</td>\n",
+       "      <td>231</td>\n",
+       "      <td>74</td>\n",
+       "      <td>118</td>\n",
+       "      <td>75</td>\n",
+       "      <td>12</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>AC-12</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>2481</td>\n",
+       "      <td>3897</td>\n",
+       "      <td>4577</td>\n",
+       "      <td>3550</td>\n",
+       "      <td>6527</td>\n",
+       "      <td>2207</td>\n",
+       "      <td>2197</td>\n",
+       "      <td>224</td>\n",
+       "      <td>76</td>\n",
+       "      <td>112</td>\n",
+       "      <td>79</td>\n",
+       "      <td>12</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>AC-13</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>2488</td>\n",
+       "      <td>3909</td>\n",
+       "      <td>4604</td>\n",
+       "      <td>3605</td>\n",
+       "      <td>6438</td>\n",
+       "      <td>2252</td>\n",
+       "      <td>2180</td>\n",
+       "      <td>241</td>\n",
+       "      <td>72</td>\n",
+       "      <td>123</td>\n",
+       "      <td>82</td>\n",
+       "      <td>12</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>AC-14</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>2470</td>\n",
+       "      <td>3911</td>\n",
+       "      <td>4528</td>\n",
+       "      <td>3541</td>\n",
+       "      <td>6278</td>\n",
+       "      <td>2234</td>\n",
+       "      <td>2168</td>\n",
+       "      <td>256</td>\n",
+       "      <td>71</td>\n",
+       "      <td>119</td>\n",
+       "      <td>75</td>\n",
+       "      <td>11</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>AC-15</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>2470</td>\n",
+       "      <td>3893</td>\n",
+       "      <td>4572</td>\n",
+       "      <td>3540</td>\n",
+       "      <td>6451</td>\n",
+       "      <td>2269</td>\n",
+       "      <td>2144</td>\n",
+       "      <td>254</td>\n",
+       "      <td>75</td>\n",
+       "      <td>117</td>\n",
+       "      <td>77</td>\n",
+       "      <td>11</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>AC-16</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>2477</td>\n",
+       "      <td>3917</td>\n",
+       "      <td>4562</td>\n",
+       "      <td>3541</td>\n",
+       "      <td>6385</td>\n",
+       "      <td>2196</td>\n",
+       "      <td>2144</td>\n",
+       "      <td>231</td>\n",
+       "      <td>69</td>\n",
+       "      <td>108</td>\n",
+       "      <td>83</td>\n",
+       "      <td>12</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>AC-17</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>2496</td>\n",
+       "      <td>3907</td>\n",
+       "      <td>4509</td>\n",
+       "      <td>3558</td>\n",
+       "      <td>6511</td>\n",
+       "      <td>2249</td>\n",
+       "      <td>2290</td>\n",
+       "      <td>231</td>\n",
+       "      <td>64</td>\n",
+       "      <td>105</td>\n",
+       "      <td>74</td>\n",
+       "      <td>11</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>AC-18</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>2498</td>\n",
+       "      <td>3903</td>\n",
+       "      <td>4569</td>\n",
+       "      <td>3584</td>\n",
+       "      <td>6352</td>\n",
+       "      <td>2190</td>\n",
+       "      <td>2067</td>\n",
+       "      <td>229</td>\n",
+       "      <td>55</td>\n",
+       "      <td>117</td>\n",
+       "      <td>80</td>\n",
+       "      <td>12</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>AC-19</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>2475</td>\n",
+       "      <td>3924</td>\n",
+       "      <td>4525</td>\n",
+       "      <td>3491</td>\n",
+       "      <td>6523</td>\n",
+       "      <td>2183</td>\n",
+       "      <td>2165</td>\n",
+       "      <td>258</td>\n",
+       "      <td>58</td>\n",
+       "      <td>94</td>\n",
+       "      <td>77</td>\n",
+       "      <td>14</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>AC-20</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>2481</td>\n",
+       "      <td>3910</td>\n",
+       "      <td>4568</td>\n",
+       "      <td>3572</td>\n",
+       "      <td>6562</td>\n",
+       "      <td>2153</td>\n",
+       "      <td>2150</td>\n",
+       "      <td>236</td>\n",
+       "      <td>86</td>\n",
+       "      <td>113</td>\n",
+       "      <td>79</td>\n",
+       "      <td>12</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>AC-21</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>2476</td>\n",
+       "      <td>3900</td>\n",
+       "      <td>4573</td>\n",
+       "      <td>3463</td>\n",
+       "      <td>6493</td>\n",
+       "      <td>2297</td>\n",
+       "      <td>2188</td>\n",
+       "      <td>239</td>\n",
+       "      <td>75</td>\n",
+       "      <td>113</td>\n",
+       "      <td>83</td>\n",
+       "      <td>12</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>AC-22</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>2486</td>\n",
+       "      <td>3886</td>\n",
+       "      <td>4557</td>\n",
+       "      <td>3522</td>\n",
+       "      <td>6507</td>\n",
+       "      <td>2225</td>\n",
+       "      <td>2240</td>\n",
+       "      <td>241</td>\n",
+       "      <td>82</td>\n",
+       "      <td>110</td>\n",
+       "      <td>81</td>\n",
+       "      <td>13</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>AC-23</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>2484</td>\n",
+       "      <td>3917</td>\n",
+       "      <td>4612</td>\n",
+       "      <td>3492</td>\n",
+       "      <td>6441</td>\n",
+       "      <td>2171</td>\n",
+       "      <td>2246</td>\n",
+       "      <td>235</td>\n",
+       "      <td>68</td>\n",
+       "      <td>108</td>\n",
+       "      <td>77</td>\n",
+       "      <td>12</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>AC-24</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>2469</td>\n",
+       "      <td>3908</td>\n",
+       "      <td>4637</td>\n",
+       "      <td>3511</td>\n",
+       "      <td>6507</td>\n",
+       "      <td>2175</td>\n",
+       "      <td>2169</td>\n",
+       "      <td>222</td>\n",
+       "      <td>75</td>\n",
+       "      <td>100</td>\n",
+       "      <td>78</td>\n",
+       "      <td>12</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>AC-25</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>2472</td>\n",
+       "      <td>3900</td>\n",
+       "      <td>4562</td>\n",
+       "      <td>3482</td>\n",
+       "      <td>6426</td>\n",
+       "      <td>2099</td>\n",
+       "      <td>2191</td>\n",
+       "      <td>240</td>\n",
+       "      <td>64</td>\n",
+       "      <td>95</td>\n",
+       "      <td>78</td>\n",
+       "      <td>11</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>AC-26</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>2491</td>\n",
+       "      <td>3916</td>\n",
+       "      <td>4581</td>\n",
+       "      <td>3575</td>\n",
+       "      <td>6462</td>\n",
+       "      <td>2257</td>\n",
+       "      <td>2210</td>\n",
+       "      <td>248</td>\n",
+       "      <td>82</td>\n",
+       "      <td>131</td>\n",
+       "      <td>78</td>\n",
+       "      <td>13</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>AC-27</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>2483</td>\n",
+       "      <td>3896</td>\n",
+       "      <td>4599</td>\n",
+       "      <td>3630</td>\n",
+       "      <td>6440</td>\n",
+       "      <td>2195</td>\n",
+       "      <td>2192</td>\n",
+       "      <td>241</td>\n",
+       "      <td>70</td>\n",
+       "      <td>95</td>\n",
+       "      <td>76</td>\n",
+       "      <td>13</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>AC-28</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>2475</td>\n",
+       "      <td>3914</td>\n",
+       "      <td>4566</td>\n",
+       "      <td>3542</td>\n",
+       "      <td>6366</td>\n",
+       "      <td>2233</td>\n",
+       "      <td>2140</td>\n",
+       "      <td>233</td>\n",
+       "      <td>62</td>\n",
+       "      <td>123</td>\n",
+       "      <td>75</td>\n",
+       "      <td>11</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>AC-29</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>2477</td>\n",
+       "      <td>3896</td>\n",
+       "      <td>4651</td>\n",
+       "      <td>3489</td>\n",
+       "      <td>6560</td>\n",
+       "      <td>2213</td>\n",
+       "      <td>2228</td>\n",
+       "      <td>244</td>\n",
+       "      <td>64</td>\n",
+       "      <td>112</td>\n",
+       "      <td>77</td>\n",
+       "      <td>12</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>AC-30</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>2501</td>\n",
+       "      <td>3896</td>\n",
+       "      <td>4585</td>\n",
+       "      <td>3554</td>\n",
+       "      <td>6332</td>\n",
+       "      <td>2184</td>\n",
+       "      <td>2100</td>\n",
+       "      <td>220</td>\n",
+       "      <td>56</td>\n",
+       "      <td>101</td>\n",
+       "      <td>78</td>\n",
+       "      <td>13</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>AC-31</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>2490</td>\n",
+       "      <td>3907</td>\n",
+       "      <td>4617</td>\n",
+       "      <td>3526</td>\n",
+       "      <td>6434</td>\n",
+       "      <td>2207</td>\n",
+       "      <td>2179</td>\n",
+       "      <td>245</td>\n",
+       "      <td>71</td>\n",
+       "      <td>107</td>\n",
+       "      <td>83</td>\n",
+       "      <td>13</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>AC-32</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>2477</td>\n",
+       "      <td>3896</td>\n",
+       "      <td>4655</td>\n",
+       "      <td>3462</td>\n",
+       "      <td>6367</td>\n",
+       "      <td>2175</td>\n",
+       "      <td>2161</td>\n",
+       "      <td>233</td>\n",
+       "      <td>56</td>\n",
+       "      <td>119</td>\n",
+       "      <td>78</td>\n",
+       "      <td>11</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>AC-33</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>2483</td>\n",
+       "      <td>3900</td>\n",
+       "      <td>4544</td>\n",
+       "      <td>3494</td>\n",
+       "      <td>6351</td>\n",
+       "      <td>2205</td>\n",
+       "      <td>2161</td>\n",
+       "      <td>218</td>\n",
+       "      <td>68</td>\n",
+       "      <td>98</td>\n",
+       "      <td>79</td>\n",
+       "      <td>12</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>AC-34</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>2477</td>\n",
+       "      <td>3886</td>\n",
+       "      <td>4588</td>\n",
+       "      <td>3572</td>\n",
+       "      <td>6441</td>\n",
+       "      <td>2182</td>\n",
+       "      <td>2173</td>\n",
+       "      <td>243</td>\n",
+       "      <td>74</td>\n",
+       "      <td>107</td>\n",
+       "      <td>81</td>\n",
+       "      <td>12</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>AC-other</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "date      07-14  07-16  07-17  09-11  09-12  09-13  09-14  09-15  09-16  \\\n",
+       "group                                                                     \n",
+       "AC-00        45    909    305      0      0      0      0      0      0   \n",
+       "AC-01        39    905    287      0      0      0      0      0      0   \n",
+       "AC-02        52    888    304      0      0      0      0      0      0   \n",
+       "AC-03        45    893    293      0      0      0      0      0      0   \n",
+       "AC-04        80    902    284      0      0      0      0      0      0   \n",
+       "AC-05        36    872    300      0      0      0      0      0      0   \n",
+       "AC-06        47    906    257      0      0      0      0      0      0   \n",
+       "AC-07        48    859    299      0      0      0      0      0      0   \n",
+       "AC-08        45    854    303      0      0      0      0      0      0   \n",
+       "AC-09        44    878    263      0      0      0      0      0      0   \n",
+       "AC-10         0      0      0   2478   3896   4591   3510   6458   2167   \n",
+       "AC-11         0      0      0   2483   3911   4613   3574   6365   2204   \n",
+       "AC-12         0      0      0   2481   3897   4577   3550   6527   2207   \n",
+       "AC-13         0      0      0   2488   3909   4604   3605   6438   2252   \n",
+       "AC-14         0      0      0   2470   3911   4528   3541   6278   2234   \n",
+       "AC-15         0      0      0   2470   3893   4572   3540   6451   2269   \n",
+       "AC-16         0      0      0   2477   3917   4562   3541   6385   2196   \n",
+       "AC-17         0      0      0   2496   3907   4509   3558   6511   2249   \n",
+       "AC-18         0      0      0   2498   3903   4569   3584   6352   2190   \n",
+       "AC-19         0      0      0   2475   3924   4525   3491   6523   2183   \n",
+       "AC-20         0      0      0   2481   3910   4568   3572   6562   2153   \n",
+       "AC-21         0      0      0   2476   3900   4573   3463   6493   2297   \n",
+       "AC-22         0      0      0   2486   3886   4557   3522   6507   2225   \n",
+       "AC-23         0      0      0   2484   3917   4612   3492   6441   2171   \n",
+       "AC-24         0      0      0   2469   3908   4637   3511   6507   2175   \n",
+       "AC-25         0      0      0   2472   3900   4562   3482   6426   2099   \n",
+       "AC-26         0      0      0   2491   3916   4581   3575   6462   2257   \n",
+       "AC-27         0      0      0   2483   3896   4599   3630   6440   2195   \n",
+       "AC-28         0      0      0   2475   3914   4566   3542   6366   2233   \n",
+       "AC-29         0      0      0   2477   3896   4651   3489   6560   2213   \n",
+       "AC-30         0      0      0   2501   3896   4585   3554   6332   2184   \n",
+       "AC-31         0      0      0   2490   3907   4617   3526   6434   2207   \n",
+       "AC-32         0      0      0   2477   3896   4655   3462   6367   2175   \n",
+       "AC-33         0      0      0   2483   3900   4544   3494   6351   2205   \n",
+       "AC-34         0      0      0   2477   3886   4588   3572   6441   2182   \n",
+       "AC-other      0      0      0      0      0      0      0      0      0   \n",
+       "\n",
+       "date      09-17  09-18  09-19  09-22  09-23  09-24  \n",
+       "group                                               \n",
+       "AC-00         0      0      0      0      0      0  \n",
+       "AC-01         0      0      0      0      0      0  \n",
+       "AC-02         0      0      0      0      0      0  \n",
+       "AC-03         0      0      0      0      0      0  \n",
+       "AC-04         0      0      0      0      0      0  \n",
+       "AC-05         0      0      0      0      0      0  \n",
+       "AC-06         0      0      0      0      0      0  \n",
+       "AC-07         0      0      0      0      0      0  \n",
+       "AC-08         0      0      0      0      0      0  \n",
+       "AC-09         0      0      0      0      0      0  \n",
+       "AC-10      2143    225     63    112     80     11  \n",
+       "AC-11      2097    231     74    118     75     12  \n",
+       "AC-12      2197    224     76    112     79     12  \n",
+       "AC-13      2180    241     72    123     82     12  \n",
+       "AC-14      2168    256     71    119     75     11  \n",
+       "AC-15      2144    254     75    117     77     11  \n",
+       "AC-16      2144    231     69    108     83     12  \n",
+       "AC-17      2290    231     64    105     74     11  \n",
+       "AC-18      2067    229     55    117     80     12  \n",
+       "AC-19      2165    258     58     94     77     14  \n",
+       "AC-20      2150    236     86    113     79     12  \n",
+       "AC-21      2188    239     75    113     83     12  \n",
+       "AC-22      2240    241     82    110     81     13  \n",
+       "AC-23      2246    235     68    108     77     12  \n",
+       "AC-24      2169    222     75    100     78     12  \n",
+       "AC-25      2191    240     64     95     78     11  \n",
+       "AC-26      2210    248     82    131     78     13  \n",
+       "AC-27      2192    241     70     95     76     13  \n",
+       "AC-28      2140    233     62    123     75     11  \n",
+       "AC-29      2228    244     64    112     77     12  \n",
+       "AC-30      2100    220     56    101     78     13  \n",
+       "AC-31      2179    245     71    107     83     13  \n",
+       "AC-32      2161    233     56    119     78     11  \n",
+       "AC-33      2161    218     68     98     79     12  \n",
+       "AC-34      2173    243     74    107     81     12  \n",
+       "AC-other      0      0      0      0      0      0  "
+      ]
+     },
+     "execution_count": 19,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "(\n",
+    "    comments_14_28_classified\n",
+    "    .loc[lambda df: df[\"group\"] != \"[other]\"]\n",
+    "    .assign(\n",
+    "        date = lambda df: df[\"date\"].str.slice(5, 10)\n",
+    "    )\n",
+    "    .groupby([\"group\", \"date\"])\n",
+    "    .size()\n",
+    "    .unstack()\n",
+    "    .fillna(0)\n",
+    "    .astype(int)\n",
+    "    .loc[:, lambda df: df.sum() >= 200]\n",
+    "    \n",
+    "    # Order columns by date\n",
+    "    .pipe(lambda df: df[[c for c in sorted(df.columns)]])\n",
+    "    \n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Example comments"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "metadata": {
+    "scrolled": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "=== AC-10 ===\n",
+      "\n",
+      "The Internet is not broken, and does not need to be fixed.  Left-wing extremists have been crying wolf for the past decade about the harm to the Internet if the Federal government didn?t regulate it.  Not only were they wrong, but the Internet has exploded with innovation.  Do not regulate the Internet.  The best way to keep it open and free is what has kept it open and free all along ? no government intervention.\n",
+      "\n",
+      "The Internet is not broken, and does not need to be fixed.  Left-wing extremists have been crying wolf for the past decade about the harm to the Internet if the Federal government didn?t regulate it.  Not only were they wrong, but the Internet has exploded with innovation.  Do not regulate the Internet.  The best way to keep it open and free is what has kept it open and free all along ? no government intervention.\n",
+      "\n",
+      "The Internet is not broken, and does not need to be fixed.  Left-wing extremists have been crying wolf for the past decade about the harm to the Internet if the Federal government didn?t regulate it.  Not only were they wrong, but the Internet has exploded with innovation.  Do not regulate the Internet.  The best way to keep it open and free is what has kept it open and free all along ? no government intervention.\n",
+      "\n",
+      "\n",
+      "=== AC-27 ===\n",
+      "\n",
+      "The American people are watching a Federal Communications Commission (FCC) that is not seeking to make the Internet better, but instead seeking to regulate it.  The FCC could improve broadband delivery by auctioning off much-needed spectrum.  Or it could get rid of some of its own burdensome rules that prevent companies from attracting investors and innovating.  But instead of doing these things that would improve the Internet, the FCC is wasting its time in an obsessive drive to regulate. This [...]\n",
+      "\n",
+      "The American people are watching a Federal Communications Commission (FCC) that is not seeking to make the Internet better, but instead seeking to regulate it.  The FCC could improve broadband delivery by auctioning off much-needed spectrum.  Or it could get rid of some of its own burdensome rules that prevent companies from attracting investors and innovating.  But instead of doing these things that would improve the Internet, the FCC is wasting its time in an obsessive drive to regulate.  This[...]\n",
+      "\n",
+      "7522706506.txtThe American people are watching a Federal Communications Commission (FCC) that is not seeking to make the Internet better, but instead seeking to regulate it.  The FCC could improve broadband delivery by auctioning off much-needed spectrum.  Or it could get rid of some of its own burdensome rules that prevent companies from attracting investors and innovating.  But instead of doing these things that would improve the Internet, the FCC is wasting its time in an obsessive drive to r[...]\n",
+      "\n",
+      "\n",
+      "=== [other] ===\n",
+      "\n",
+      "Dear Chairman Wheeler:We are writing to urge you to implement strong and unambiguous net neutrality rules that protect the Internet from discrimination and other practices that will impede its ability to serve our democracy, empower consumers, and fuel economic growth. Erecting toll booths or designating fast lanes on the information superhighway wouldstifle free speech, limit consumer choice, and thwart innovation.  The FCC must act in a clear and decisive way to ensure the Internet does not be[...]\n",
+      "\n",
+      "Dear Chairman Wheeler:We are writing to urge you to implement strong and unambiguous net neutrality rules that protect the Internet from discrimination and other practices that will impede its ability to serve our democracy, empower consumers, and fuel economic growth. Erecting toll booths or designating fast lanes on the information superhighway wouldstifle free speech, limit consumer choice, and thwart innovation.  The FCC must act in a clear and decisive way to ensure the Internet does not be[...]\n",
+      "\n",
+      "7522187451.txtDear Chairman Wheeler:We are writing to urge you to implement strong and unambiguous net neutrality rules that protect the Internet from discrimination and other practices that will impede its ability to serve our democracy, empower consumers, and fuel economic growth. Erecting toll booths or designating fast lanes on the information superhighway wouldstifle free speech, limit consumer choice, and thwart innovation.  The FCC must act in a clear and decisive way to ensure the Intern[...]\n",
+      "\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "print_example_comments(\n",
+    "    comments_14_28_classified\n",
+    "    .loc[lambda df: df[\"group\"].isin([\"AC-10\", \"AC-27\", \"[other]\"])],\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Compare the above timing and language for group `AC-27` to the [comment attributed to Annie Reeves](https://www.fcc.gov/ecfs/filing/6019076835), received by the FCC on September 15, 2014:\n",
+    "\n",
+    "> The American people are watching a Federal Communications Commission (FCC) that is not seeking to make the Internet better, but instead seeking to regulate it. The FCC could improve broadband delivery by auctioning off much-needed spectrum. Or it could get rid of some of its own burdensome rules that prevent companies from attracting investors and innovating. But instead of doing these things that would improve the Internet, the FCC is wasting its time in an obsessive drive to regulate. This tells the American people that once again, a Washington agency is working in a self-interested way to increase its own power at the expense of the free people it is meant to serve."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Docket 16-42\n",
+    "\n",
+    "Here, we identify two very large sets of comments in this docket, by searching for the short phrases below. Please see the associated BuzzFeed News article for context.\n",
+    "\n",
+    "The \"American Commitment\" set of comments is labeled as such because it uses language from [that organization's comment campaign](http://web.archive.org/web/20160403182941/https://www.americancommitment.org/cablebox-petition). (The text of the comments appears to be generated algorithmically, selecting randomly from sets of pre-selected words and phrases, but the phrase used here for classification is static — it does not change across the comments.) To be sure, it is possible the comments were submitted by entities other than American Commitment, using the same language; the FCC's public portal does not specify who submitted these comments. "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'over $200'            104816\n",
+       "American Commitment    101783\n",
+       "[other]                 75175\n",
+       "Name: group, dtype: int64"
+      ]
+     },
+     "execution_count": 21,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "comments_16_42_classified = (\n",
+    "    comments_16_42\n",
+    "    .pipe(\n",
+    "        add_classification,\n",
+    "        [\n",
+    "            (\"cloud-based video on demand, and apps providing news\", \"American Commitment\"),\n",
+    "            (\"A cable subscriber pays over \\$200\", \"'over $200'\"),\n",
+    "        ]\n",
+    "    )\n",
+    ")\n",
+    "\n",
+    "(\n",
+    "    comments_16_42_classified\n",
+    "    [\"group\"]\n",
+    "    .value_counts()\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Dates submitted, by two main groups of comments"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "2016-04-19    10499\n",
+       "2016-04-20    59247\n",
+       "2016-04-21    35070\n",
+       "Name: date, dtype: int64"
+      ]
+     },
+     "execution_count": 22,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "(\n",
+    "    comments_16_42_classified\n",
+    "    .loc[lambda df: df[\"group\"] == \"'over $200'\"]\n",
+    "    [\"date\"]\n",
+    "    .value_counts()\n",
+    "    .sort_index()\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "2016-02-22        1\n",
+       "2016-05-16    12293\n",
+       "2016-05-17    55852\n",
+       "2016-05-18    33637\n",
+       "Name: date, dtype: int64"
+      ]
+     },
+     "execution_count": 23,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "(\n",
+    "    comments_16_42_classified\n",
+    "    .loc[lambda df: df[\"group\"] == \"American Commitment\"]\n",
+    "    [\"date\"]\n",
+    "    .value_counts()\n",
+    "    .sort_index()\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "*Note: The 2016-02-22 comment above appears to stem from a data-entry mistake on the FCC's website. There, [the comment](https://www.fcc.gov/ecfs/filing/60001484317)'s text seems to suggest that the language actually came from a [comment with ID 60001843102](https://www.fcc.gov/ecfs/filing/60001843102); that comment, in turn, says it was received on May 18, 2016.*"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Example comments"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "=== 'over $200' ===\n",
+      "\n",
+      "60001650840.txtA cable subscriber pays over $200 per year to rent a box from the cable companies that are already protected by government. This kills competition, limits consumer choice, and lifts up cable profits that are already excessive.Allow the free market to work and unlock the box to open competition and end the monopoly that cable companies have over our televisions.Page 1\n",
+      "\n",
+      "60001633497.txtA cable subscriber pays over $200 per year to rent a box from the cable companies that are already protected by government. This kills competition, limits consumer choice, and lifts up cable profits that are already excessive.Allow the free market to work and unlock the box to open competition and end the monopoly that cable companies have over our televisions.Page 1\n",
+      "\n",
+      "60001621406.txtA cable subscriber pays over $200 per year to rent a box from the cable companies that are already protected by government. This kills competition, limits consumer choice, and lifts up cable profits that are already excessive.Allow the free market to work and unlock the box to open competition and end the monopoly that cable companies have over our televisions.Page 1\n",
+      "\n",
+      "\n",
+      "=== American Commitment ===\n",
+      "\n",
+      "60001870988.txtThe marketplace for video content is thriving and extremely competitive, offering a vast array of video streaming services, cloud-based video on demand, and apps providing news, cinema and programming. This market is rapidly innovating beyond thetraditional set-top box to new applications and devices with more choices than ever. Past government attempts to control set-top boxes have been a complete failure.  Yet another failed attempt at top-down government regulation will only pu[...]\n",
+      "\n",
+      "60001888486.txtThe exchange for video content is booming and incredibly competitive, offering a wide array of video streaming services, cloud-based video on demand, and apps providing news, cinema and programming. This market is swiftly innovating beyond thetraditional set-top box to new applications and devices with more options than ever. Past Commission attempts to control set-top boxes have been a complete failure.  Yet another failed attempt at one-size-fits-all government regulation will o[...]\n",
+      "\n",
+      "60001883996.txtThe exchange for video content is roaring and incredibly competitive, offering a vast array of video streaming services, cloud-based video on demand, and apps providing news, cinema and programming. This market is quickly innovating beyond thetraditional set-top box to new applications and devices with more options than ever. Past FCC attempts to regulate set-top boxes have been a complete failure.  Yet another failed attempt at heavy-handed government regulation will only put the[...]\n",
+      "\n",
+      "\n",
+      "=== [other] ===\n",
+      "\n",
+      "60001976192.txtI oppose unnecessary set-top box regulations that will mean higher bills, fewer choices, and less privacy on TV. The television and video market today is full of great choices, why put such a healthy market at risk with complex and unnecessary new mandates?Page 1\n",
+      "\n",
+      "60001962194.txtI oppose unnecessary set-top box regulations that will mean higher bills, fewer choices, and less privacy on TV. The television and video market today is full of great choices, why put such a healthy market at risk with complex and unnecessary new mandates?Page 1\n",
+      "\n",
+      "60001991447.txtI oppose unnecessary set-top box regulations that will mean higher bills, fewer choices, and less privacy on TV. The television and video market today is full of great choices, why put such a healthy market at risk with complex and unnecessary new mandates?Page 1\n",
+      "\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "print_example_comments(comments_16_42_classified)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Compare the above timing and language to the [comment attributed to Annie Reeves](https://www.fcc.gov/ecfs/filing/60001803771), received by the FCC on May 17, 2016:\n",
+    "\n",
+    "> The market for video content is booming and extremely competitive, offering a vast array of video streaming services, cloud-based video on demand, and apps providing news, cinema and programming. This market is swiftly innovating beyond the traditional set-top box to new applications and devices with more options than ever. Past Commission attempts to regulate set-top boxes have been a complete failure. Yet another failed attempt at heavy-handed government regulation will only stifle innovation and benefit companies with political influence rather than companies that provide what consumers want. We don't need the federal government to fix what isn't broken -- I urge you to reject the proposed rule."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Examine email address crossover between 16-42 and bulk-uploaded 17-108 comments\n",
+    "\n",
+    "Here, we calculate the proportion of commenters from docket 16-42 that later appeared in comments bulk-uploaded to docket 17-108, and observe a very high rate of overlap between the email addresses associated with comments that used American Commitment's language in docket 16-42 and the email addresses listed in comments bulk-uploaded by Media Bridge. We find the same for commenters' full names plus physical addresses."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>num_emails</th>\n",
+       "      <th>email_isin_17_108_nonmb</th>\n",
+       "      <th>email_isin_17_108_mb</th>\n",
+       "      <th>name_and_location_isin_17_108_mb</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>group</th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <td>American Commitment</td>\n",
+       "      <td>100252</td>\n",
+       "      <td>0.0231</td>\n",
+       "      <td>0.9987</td>\n",
+       "      <td>0.9987</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>'over $200'</td>\n",
+       "      <td>100482</td>\n",
+       "      <td>0.0243</td>\n",
+       "      <td>0.0601</td>\n",
+       "      <td>0.0566</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                     num_emails  email_isin_17_108_nonmb  \\\n",
+       "group                                                      \n",
+       "American Commitment      100252                   0.0231   \n",
+       "'over $200'              100482                   0.0243   \n",
+       "\n",
+       "                     email_isin_17_108_mb  name_and_location_isin_17_108_mb  \n",
+       "group                                                                        \n",
+       "American Commitment                0.9987                            0.9987  \n",
+       "'over $200'                        0.0601                            0.0566  "
+      ]
+     },
+     "execution_count": 25,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "(\n",
+    "    comments_16_42_classified\n",
+    "    [[\n",
+    "        \"email_address\",\n",
+    "        \"name_and_location\",\n",
+    "        \"group\",\n",
+    "    ]]\n",
+    "    .drop_duplicates()\n",
+    "    .dropna()\n",
+    "    .assign(\n",
+    "        email_isin_17_108_nonmb = lambda df: (\n",
+    "            df[\"email_address\"].notnull() & df[\"email_address\"].isin(\n",
+    "                bulk_uploads_17_108\n",
+    "                .loc[lambda df: df[\"uploader\"] != \"shane@mediabridgellc.com\"]\n",
+    "                [\"email_address\"]\n",
+    "            )\n",
+    "        ),\n",
+    "        email_isin_17_108_mb = lambda df: (\n",
+    "            df[\"email_address\"].notnull() & df[\"email_address\"].isin(\n",
+    "                bulk_uploads_17_108\n",
+    "                .loc[lambda df: df[\"uploader\"] == \"shane@mediabridgellc.com\"]\n",
+    "                [\"email_address\"]\n",
+    "            )\n",
+    "        ),\n",
+    "        name_and_location_isin_17_108_mb = lambda df: (\n",
+    "            df[\"name_and_location\"].isin(\n",
+    "                bulk_uploads_17_108\n",
+    "                .loc[lambda df: df[\"uploader\"] == \"shane@mediabridgellc.com\"]\n",
+    "                [\"name_and_location\"]\n",
+    "            )\n",
+    "        ),\n",
+    "    )\n",
+    "    .groupby(\"group\")\n",
+    "    .pipe(lambda grp: pd.DataFrame({\n",
+    "        \"num_emails\": grp.size(),\n",
+    "        \"email_isin_17_108_nonmb\": grp[\"email_isin_17_108_nonmb\"].mean().round(4),\n",
+    "        \"email_isin_17_108_mb\": grp[\"email_isin_17_108_mb\"].mean().round(4),\n",
+    "        \"name_and_location_isin_17_108_mb\": grp[\"name_and_location_isin_17_108_mb\"].mean().round(4),\n",
+    "    }))\n",
+    "    .loc[lambda df: df[\"num_emails\"] >= 1000]\n",
+    "    .sort_values(\"email_isin_17_108_mb\", ascending = False)\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Analyze 17-108 bulk-uploads vis-a-vis Have I Been Pwned\n",
+    "\n",
+    "In this section, we take random samples of email addresses from the comments bulk-uploaded to Docket 17-108, and calculate the rates at which they have appeared in the data breaches tracked by Have I Been Pwned. We focus on the accounts that uploaded comments containing 10,000+ distinct email addresses.\n",
+    "\n",
+    "*Note: The HIBP data has already been fetched and saved, but the code used to fetch the data is included here for reference, and for reuse by other researchers.*"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "try:\n",
+    "    uuid_lookup = pd.read_csv(\n",
+    "        BASE_PATH + \"bulk-uploads-17-108-uuid-lookup.csv\",\n",
+    "        dtype = str,\n",
+    "    )\n",
+    "    \n",
+    "    assert uuid_lookup[\"email_address_uuid\"].value_counts().max() == 1\n",
+    "    print(f\"{len(uuid_lookup):,d}\")\n",
+    "except:\n",
+    "    uuid_lookup = pd.DataFrame(None, columns = [ \"email_address\", \"email_address_uuid\" ])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "BASE_HIBP_URL = \"https://haveibeenpwned.com/api/v3/breachedaccount/\"\n",
+    "from json import JSONDecodeError\n",
+    "\n",
+    "def fetch_hipb_results(email_address):\n",
+    "    while True:\n",
+    "        try:\n",
+    "            res = requests.get(\n",
+    "                f\"{BASE_HIBP_URL}{email_address.strip()}\",\n",
+    "                headers = {\n",
+    "                    'hibp-api-key': HIBP_KEY,\n",
+    "                },\n",
+    "            )\n",
+    "            if res.from_cache == False:\n",
+    "                time.sleep(1.5)\n",
+    "\n",
+    "            # Check that JSON is parseable\n",
+    "            if res.content != b\"\":\n",
+    "                res.json()\n",
+    "                if \"message\" in res.json():\n",
+    "                    raise Exception(\"HIPB error: {res.json()['message']}\")\n",
+    "\n",
+    "        except requests.RequestException:\n",
+    "            sys.stderr.write(f\"\\nException; sleeping for 10 seconds\\n\")\n",
+    "            time.sleep(10)            \n",
+    "            continue\n",
+    "            \n",
+    "        except JSONDecodeError as e:\n",
+    "            sys.stderr.write(f\"\\nERROR: <{email_address}>\\n\")\n",
+    "            sys.stderr.write(f\"{e}\\n\")\n",
+    "            sys.stderr.write(f\"{res.content}\\n\")\n",
+    "            return [ { \"email_address\": email_address, \"breach\": \"[error]\" } ]\n",
+    "\n",
+    "        if res.status_code == 429:\n",
+    "            sleep_int = int(res.headers[\"Retry-After\"])\n",
+    "            sys.stderr.write(f\"\\nSleeping for {sleep_int + 1} seconds\")\n",
+    "            time.sleep(sleep_int)\n",
+    "            continue\n",
+    "            \n",
+    "        if res.content == b\"\" or res.status_code == 404:\n",
+    "            return [ { \"email_address\": email_address, \"breach\": \"[none]\" } ]\n",
+    "\n",
+    "        else:\n",
+    "            return [ { \"email_address\": email_address, \"breach\": x[\"Name\"] } for x in res.json() ]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The following function creates a sample — or a grouped set of samples — from a given set of comments. Before sampling, the code removes blank email addresses and those with non-standard characters."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def create_sample(df, grouping = [], n = 1000, random_state = RANDOM_STATE):\n",
+    "    clean = (\n",
+    "        df\n",
+    "        .loc[lambda df: df[\"email_address\"].notnull()]\n",
+    "        .loc[lambda df: df[\"email_address_nonstandard\"] == 0]\n",
+    "        .drop_duplicates(subset = [ \"email_address\" ] + grouping)\n",
+    "    )\n",
+    "    \n",
+    "    sampler = lambda df: df.sample(n, random_state = random_state)\n",
+    "    \n",
+    "    if len(grouping):\n",
+    "        return (\n",
+    "            clean\n",
+    "            .groupby(grouping)\n",
+    "            .apply(sampler)\n",
+    "            .reset_index(drop = True)\n",
+    "        )\n",
+    "    else:\n",
+    "        return clean.pipe(sampler)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_breaches(df, save_path, save = True, use_saved = True):\n",
+    "    if use_saved and os.path.exists(save_path):\n",
+    "        return pd.read_csv(save_path, dtype = str)\n",
+    "    else:\n",
+    "        breaches_raw = pd.concat(map(pd.DataFrame, (\n",
+    "            df\n",
+    "            .rename(columns = {\n",
+    "                \"email_address\": \"email_address_uuid\"\n",
+    "            })\n",
+    "            .merge(\n",
+    "                uuid_lookup,\n",
+    "                how = \"left\",\n",
+    "                on = [ \"email_address_uuid\" ]\n",
+    "            )\n",
+    "            [\"email_address\"]\n",
+    "            .progress_apply(fetch_hipb_results)\n",
+    "        ))).drop_duplicates()\n",
+    "        \n",
+    "        breaches = (\n",
+    "            breaches_raw\n",
+    "            .merge(\n",
+    "                uuid_lookup,\n",
+    "                how = \"left\",\n",
+    "                on = [ \"email_address\" ]\n",
+    "            )\n",
+    "            .drop(columns = [ \"email_address\" ])\n",
+    "            .rename(columns = {\n",
+    "                \"email_address_uuid\": \"email_address\",\n",
+    "            })\n",
+    "        )\n",
+    "        \n",
+    "        if save:\n",
+    "            breaches.to_csv(save_path, index = False)\n",
+    "\n",
+    "        return breaches"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The function below calculates the breach rates for groups of sampled comments, for each breach found."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 30,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def calculate_breach_rates(sample, breaches):\n",
+    "    return (\n",
+    "        sample\n",
+    "        [[\n",
+    "            \"email_address\",\n",
+    "            \"uploader\",\n",
+    "        ]]\n",
+    "        \n",
+    "        .merge(\n",
+    "            breaches,\n",
+    "            how = \"left\",\n",
+    "            on = [ \"email_address\" ],\n",
+    "        )\n",
+    "        .assign(breached = 1)\n",
+    "        .set_index([\n",
+    "            \"uploader\",\n",
+    "            \"email_address\",\n",
+    "            \"breach\",\n",
+    "        ])\n",
+    "        [\"breached\"]\n",
+    "        .unstack()\n",
+    "        .fillna(0)\n",
+    "        .astype(int)\n",
+    "        # At this point, we have a matrix of uploader+email x breach\n",
+    "        # where the values are 1 if breached and 0 if not\n",
+    "        \n",
+    "        # Now, we group by uploader and calculate the proportion of\n",
+    "        # emails breached\n",
+    "        .groupby([ \"uploader\" ])\n",
+    "        .mean()\n",
+    "        \n",
+    "        # Then we return the data frame to a \"tidy\" format:\n",
+    "        # uploader|breach|rate\n",
+    "        .stack()\n",
+    "        .sort_values(ascending = False)\n",
+    "        .to_frame(\"rate\")\n",
+    "        .reset_index()\n",
+    "    )"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 17-108 by bulk uploader\n",
+    "\n",
+    "Limited here to the accounts that uploaded comments containing 10,000+ distinct email addresses."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 31,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>date</th>\n",
+       "      <th>comments</th>\n",
+       "      <th>file</th>\n",
+       "      <th>uploader</th>\n",
+       "      <th>email_address_nonstandard</th>\n",
+       "      <th>email_address</th>\n",
+       "      <th>email_domain</th>\n",
+       "      <th>name_and_location</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <td>0</td>\n",
+       "      <td>5/15/17</td>\n",
+       "      <td>In 2015, wealthy leftist billionaires and powe...</td>\n",
+       "      <td>FOI-14090-2017527.csv</td>\n",
+       "      <td>esmisc@mac.com</td>\n",
+       "      <td>0</td>\n",
+       "      <td>939bfae2-62d1-47de-b009-c2abc6b681f5</td>\n",
+       "      <td>yahoo.com</td>\n",
+       "      <td>8930069a-021b-4263-9c3b-a3923af9a9dc</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1</td>\n",
+       "      <td>8/5/17</td>\n",
+       "      <td>Before leaving office, the Obama Administratio...</td>\n",
+       "      <td>CFIF_1_25000_08052017_4 (1).csv</td>\n",
+       "      <td>esmisc@mac.com</td>\n",
+       "      <td>0</td>\n",
+       "      <td>f9a12339-56cb-4540-9adc-fc6238428f49</td>\n",
+       "      <td>gmail.com</td>\n",
+       "      <td>6c65ec31-5135-4500-99c5-bba309b415fb</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2</td>\n",
+       "      <td>8/6/17</td>\n",
+       "      <td>Before leaving office, the Obama Administratio...</td>\n",
+       "      <td>CFIF_1_25000_08062017_2.csv</td>\n",
+       "      <td>esmisc@mac.com</td>\n",
+       "      <td>0</td>\n",
+       "      <td>fcf0991a-0ed7-408b-8e52-4735baccd906</td>\n",
+       "      <td>yahoo.com</td>\n",
+       "      <td>6dfa9546-ad61-404a-bec7-48464be021b4</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3</td>\n",
+       "      <td>7/29/17</td>\n",
+       "      <td>Before leaving office, the Obama Administratio...</td>\n",
+       "      <td>CFIF_1_40000_07292017.csv</td>\n",
+       "      <td>esmisc@mac.com</td>\n",
+       "      <td>0</td>\n",
+       "      <td>ee33e2a5-854f-471b-adb1-1ff62d69bf46</td>\n",
+       "      <td>gmail.com</td>\n",
+       "      <td>6d99eb3f-9242-440d-be2a-c7f7ae3b4e91</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4</td>\n",
+       "      <td>5/9/17</td>\n",
+       "      <td>Obama's Federal Communications Commission (FCC...</td>\n",
+       "      <td>T2017510-2.csv</td>\n",
+       "      <td>esmisc@mac.com</td>\n",
+       "      <td>0</td>\n",
+       "      <td>4d294840-8365-4d34-a5c4-c09f6b8bc01d</td>\n",
+       "      <td>icloud.com</td>\n",
+       "      <td>d132203a-a146-4043-b097-d6606498309f</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "      date                                           comments  \\\n",
+       "0  5/15/17  In 2015, wealthy leftist billionaires and powe...   \n",
+       "1   8/5/17  Before leaving office, the Obama Administratio...   \n",
+       "2   8/6/17  Before leaving office, the Obama Administratio...   \n",
+       "3  7/29/17  Before leaving office, the Obama Administratio...   \n",
+       "4   5/9/17  Obama's Federal Communications Commission (FCC...   \n",
+       "\n",
+       "                              file        uploader  email_address_nonstandard  \\\n",
+       "0            FOI-14090-2017527.csv  esmisc@mac.com                          0   \n",
+       "1  CFIF_1_25000_08052017_4 (1).csv  esmisc@mac.com                          0   \n",
+       "2      CFIF_1_25000_08062017_2.csv  esmisc@mac.com                          0   \n",
+       "3        CFIF_1_40000_07292017.csv  esmisc@mac.com                          0   \n",
+       "4                   T2017510-2.csv  esmisc@mac.com                          0   \n",
+       "\n",
+       "                          email_address email_domain  \\\n",
+       "0  939bfae2-62d1-47de-b009-c2abc6b681f5    yahoo.com   \n",
+       "1  f9a12339-56cb-4540-9adc-fc6238428f49    gmail.com   \n",
+       "2  fcf0991a-0ed7-408b-8e52-4735baccd906    yahoo.com   \n",
+       "3  ee33e2a5-854f-471b-adb1-1ff62d69bf46    gmail.com   \n",
+       "4  4d294840-8365-4d34-a5c4-c09f6b8bc01d   icloud.com   \n",
+       "\n",
+       "                      name_and_location  \n",
+       "0  8930069a-021b-4263-9c3b-a3923af9a9dc  \n",
+       "1  6c65ec31-5135-4500-99c5-bba309b415fb  \n",
+       "2  6dfa9546-ad61-404a-bec7-48464be021b4  \n",
+       "3  6d99eb3f-9242-440d-be2a-c7f7ae3b4e91  \n",
+       "4  d132203a-a146-4043-b097-d6606498309f  "
+      ]
+     },
+     "execution_count": 31,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "sample_17_108_bulk_uploads = (\n",
+    "    bulk_uploads_17_108\n",
+    "    .loc[lambda df: df[\"uploader\"].isin(\n",
+    "        uploader_metrics\n",
+    "        .loc[lambda df: df[\"unique_emails\"] >= 10000]\n",
+    "        .index\n",
+    "    )]\n",
+    "    .pipe(\n",
+    "        create_sample,\n",
+    "        grouping = [ \"uploader\" ],\n",
+    "        n = 1000\n",
+    "    )\n",
+    ")\n",
+    "\n",
+    "sample_17_108_bulk_uploads.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 32,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "esmisc@mac.com                1000\n",
+       "fccfreedom@hmamail.com        1000\n",
+       "karen@momsrising.org          1000\n",
+       "kurt@demandprogress.org       1000\n",
+       "meaghan@mandatemedia.com      1000\n",
+       "mike@fightforthefuture.org    1000\n",
+       "ncatalano@ofa.us              1000\n",
+       "shane@mediabridgellc.com      1000\n",
+       "Name: uploader, dtype: int64"
+      ]
+     },
+     "execution_count": 32,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "sample_17_108_bulk_uploads[\"uploader\"].value_counts().sort_index()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 33,
+   "metadata": {
+    "scrolled": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>breach</th>\n",
+       "      <th>email_address</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <td>0</td>\n",
+       "      <td>8tracks</td>\n",
+       "      <td>939bfae2-62d1-47de-b009-c2abc6b681f5</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1</td>\n",
+       "      <td>Animoto</td>\n",
+       "      <td>939bfae2-62d1-47de-b009-c2abc6b681f5</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2</td>\n",
+       "      <td>MindJolt</td>\n",
+       "      <td>939bfae2-62d1-47de-b009-c2abc6b681f5</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3</td>\n",
+       "      <td>ModernBusinessSolutions</td>\n",
+       "      <td>939bfae2-62d1-47de-b009-c2abc6b681f5</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4</td>\n",
+       "      <td>RiverCityMedia</td>\n",
+       "      <td>939bfae2-62d1-47de-b009-c2abc6b681f5</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                    breach                         email_address\n",
+       "0                  8tracks  939bfae2-62d1-47de-b009-c2abc6b681f5\n",
+       "1                  Animoto  939bfae2-62d1-47de-b009-c2abc6b681f5\n",
+       "2                 MindJolt  939bfae2-62d1-47de-b009-c2abc6b681f5\n",
+       "3  ModernBusinessSolutions  939bfae2-62d1-47de-b009-c2abc6b681f5\n",
+       "4           RiverCityMedia  939bfae2-62d1-47de-b009-c2abc6b681f5"
+      ]
+     },
+     "execution_count": 33,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "breaches_17_108_bulk_uploads = (\n",
+    "    sample_17_108_bulk_uploads\n",
+    "    .pipe(get_breaches, \"../data/breaches-17-108-bulk-uploads-sample.csv\")\n",
+    ")\n",
+    "\n",
+    "breaches_17_108_bulk_uploads.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Most common breach-uploader combinations:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 34,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>uploader</th>\n",
+       "      <th>breach</th>\n",
+       "      <th>rate</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <td>0</td>\n",
+       "      <td>shane@mediabridgellc.com</td>\n",
+       "      <td>ModernBusinessSolutions</td>\n",
+       "      <td>0.942</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1</td>\n",
+       "      <td>shane@mediabridgellc.com</td>\n",
+       "      <td>RiverCityMedia</td>\n",
+       "      <td>0.807</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2</td>\n",
+       "      <td>fccfreedom@hmamail.com</td>\n",
+       "      <td>VerificationsIO</td>\n",
+       "      <td>0.782</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3</td>\n",
+       "      <td>shane@mediabridgellc.com</td>\n",
+       "      <td>VerificationsIO</td>\n",
+       "      <td>0.743</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4</td>\n",
+       "      <td>fccfreedom@hmamail.com</td>\n",
+       "      <td>RiverCityMedia</td>\n",
+       "      <td>0.645</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>5</td>\n",
+       "      <td>esmisc@mac.com</td>\n",
+       "      <td>VerificationsIO</td>\n",
+       "      <td>0.625</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>6</td>\n",
+       "      <td>esmisc@mac.com</td>\n",
+       "      <td>RiverCityMedia</td>\n",
+       "      <td>0.565</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>7</td>\n",
+       "      <td>fccfreedom@hmamail.com</td>\n",
+       "      <td>ModernBusinessSolutions</td>\n",
+       "      <td>0.466</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>8</td>\n",
+       "      <td>ncatalano@ofa.us</td>\n",
+       "      <td>VerificationsIO</td>\n",
+       "      <td>0.463</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>9</td>\n",
+       "      <td>karen@momsrising.org</td>\n",
+       "      <td>VerificationsIO</td>\n",
+       "      <td>0.459</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>10</td>\n",
+       "      <td>mike@fightforthefuture.org</td>\n",
+       "      <td>VerificationsIO</td>\n",
+       "      <td>0.435</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>11</td>\n",
+       "      <td>meaghan@mandatemedia.com</td>\n",
+       "      <td>VerificationsIO</td>\n",
+       "      <td>0.435</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>12</td>\n",
+       "      <td>kurt@demandprogress.org</td>\n",
+       "      <td>VerificationsIO</td>\n",
+       "      <td>0.412</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>13</td>\n",
+       "      <td>karen@momsrising.org</td>\n",
+       "      <td>RiverCityMedia</td>\n",
+       "      <td>0.377</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>14</td>\n",
+       "      <td>meaghan@mandatemedia.com</td>\n",
+       "      <td>RiverCityMedia</td>\n",
+       "      <td>0.364</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>15</td>\n",
+       "      <td>ncatalano@ofa.us</td>\n",
+       "      <td>RiverCityMedia</td>\n",
+       "      <td>0.345</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>16</td>\n",
+       "      <td>esmisc@mac.com</td>\n",
+       "      <td>ModernBusinessSolutions</td>\n",
+       "      <td>0.345</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>17</td>\n",
+       "      <td>mike@fightforthefuture.org</td>\n",
+       "      <td>RiverCityMedia</td>\n",
+       "      <td>0.344</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>18</td>\n",
+       "      <td>ncatalano@ofa.us</td>\n",
+       "      <td>LinkedIn</td>\n",
+       "      <td>0.339</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>19</td>\n",
+       "      <td>kurt@demandprogress.org</td>\n",
+       "      <td>RiverCityMedia</td>\n",
+       "      <td>0.323</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                      uploader                   breach   rate\n",
+       "0     shane@mediabridgellc.com  ModernBusinessSolutions  0.942\n",
+       "1     shane@mediabridgellc.com           RiverCityMedia  0.807\n",
+       "2       fccfreedom@hmamail.com          VerificationsIO  0.782\n",
+       "3     shane@mediabridgellc.com          VerificationsIO  0.743\n",
+       "4       fccfreedom@hmamail.com           RiverCityMedia  0.645\n",
+       "5               esmisc@mac.com          VerificationsIO  0.625\n",
+       "6               esmisc@mac.com           RiverCityMedia  0.565\n",
+       "7       fccfreedom@hmamail.com  ModernBusinessSolutions  0.466\n",
+       "8             ncatalano@ofa.us          VerificationsIO  0.463\n",
+       "9         karen@momsrising.org          VerificationsIO  0.459\n",
+       "10  mike@fightforthefuture.org          VerificationsIO  0.435\n",
+       "11    meaghan@mandatemedia.com          VerificationsIO  0.435\n",
+       "12     kurt@demandprogress.org          VerificationsIO  0.412\n",
+       "13        karen@momsrising.org           RiverCityMedia  0.377\n",
+       "14    meaghan@mandatemedia.com           RiverCityMedia  0.364\n",
+       "15            ncatalano@ofa.us           RiverCityMedia  0.345\n",
+       "16              esmisc@mac.com  ModernBusinessSolutions  0.345\n",
+       "17  mike@fightforthefuture.org           RiverCityMedia  0.344\n",
+       "18            ncatalano@ofa.us                 LinkedIn  0.339\n",
+       "19     kurt@demandprogress.org           RiverCityMedia  0.323"
+      ]
+     },
+     "execution_count": 34,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "(\n",
+    "    calculate_breach_rates(\n",
+    "        sample_17_108_bulk_uploads,\n",
+    "        breaches_17_108_bulk_uploads,\n",
+    "    )\n",
+    "    .head(20)\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Modern Business Solutions breaches only:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 35,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>uploader</th>\n",
+       "      <th>breach</th>\n",
+       "      <th>rate</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <td>0</td>\n",
+       "      <td>shane@mediabridgellc.com</td>\n",
+       "      <td>ModernBusinessSolutions</td>\n",
+       "      <td>0.942</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>7</td>\n",
+       "      <td>fccfreedom@hmamail.com</td>\n",
+       "      <td>ModernBusinessSolutions</td>\n",
+       "      <td>0.466</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>16</td>\n",
+       "      <td>esmisc@mac.com</td>\n",
+       "      <td>ModernBusinessSolutions</td>\n",
+       "      <td>0.345</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>116</td>\n",
+       "      <td>meaghan@mandatemedia.com</td>\n",
+       "      <td>ModernBusinessSolutions</td>\n",
+       "      <td>0.114</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>121</td>\n",
+       "      <td>ncatalano@ofa.us</td>\n",
+       "      <td>ModernBusinessSolutions</td>\n",
+       "      <td>0.106</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>130</td>\n",
+       "      <td>karen@momsrising.org</td>\n",
+       "      <td>ModernBusinessSolutions</td>\n",
+       "      <td>0.099</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>145</td>\n",
+       "      <td>kurt@demandprogress.org</td>\n",
+       "      <td>ModernBusinessSolutions</td>\n",
+       "      <td>0.087</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>150</td>\n",
+       "      <td>mike@fightforthefuture.org</td>\n",
+       "      <td>ModernBusinessSolutions</td>\n",
+       "      <td>0.086</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                       uploader                   breach   rate\n",
+       "0      shane@mediabridgellc.com  ModernBusinessSolutions  0.942\n",
+       "7        fccfreedom@hmamail.com  ModernBusinessSolutions  0.466\n",
+       "16               esmisc@mac.com  ModernBusinessSolutions  0.345\n",
+       "116    meaghan@mandatemedia.com  ModernBusinessSolutions  0.114\n",
+       "121            ncatalano@ofa.us  ModernBusinessSolutions  0.106\n",
+       "130        karen@momsrising.org  ModernBusinessSolutions  0.099\n",
+       "145     kurt@demandprogress.org  ModernBusinessSolutions  0.087\n",
+       "150  mike@fightforthefuture.org  ModernBusinessSolutions  0.086"
+      ]
+     },
+     "execution_count": 35,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "(\n",
+    "    calculate_breach_rates(\n",
+    "        sample_17_108_bulk_uploads,\n",
+    "        breaches_17_108_bulk_uploads,\n",
+    "    )\n",
+    "    .loc[lambda df: df[\"breach\"] == \"ModernBusinessSolutions\"]    \n",
+    "    .head(20)\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Larger 17-108 Media Bridge sample (10,000 addresses), for more precise rates"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 36,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>date</th>\n",
+       "      <th>comments</th>\n",
+       "      <th>file</th>\n",
+       "      <th>uploader</th>\n",
+       "      <th>email_address_nonstandard</th>\n",
+       "      <th>email_address</th>\n",
+       "      <th>email_domain</th>\n",
+       "      <th>name_and_location</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <td>3236533</td>\n",
+       "      <td>5/14/17</td>\n",
+       "      <td>Dear Chairman Pai,  I am concerned about Inter...</td>\n",
+       "      <td>Batch-A4.csv</td>\n",
+       "      <td>shane@mediabridgellc.com</td>\n",
+       "      <td>0</td>\n",
+       "      <td>44a8867c-3332-403f-9b34-560c054bd728</td>\n",
+       "      <td>gmail.com</td>\n",
+       "      <td>058cbe31-9c92-4509-a8d9-50f84a1cf1ae</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2723370</td>\n",
+       "      <td>5/15/17</td>\n",
+       "      <td>Dear Mr. Pai,  Regarding the Obama takeover of...</td>\n",
+       "      <td>Batch-A2.csv</td>\n",
+       "      <td>shane@mediabridgellc.com</td>\n",
+       "      <td>0</td>\n",
+       "      <td>f9ad6e74-115c-4ccd-8a4e-f1e408423942</td>\n",
+       "      <td>icloud.com</td>\n",
+       "      <td>99c87532-c65e-42fa-97bd-0438d3ff504c</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>56337</td>\n",
+       "      <td>5/14/17</td>\n",
+       "      <td>Chairman Pai:  Hi, I'd like to comment on Titl...</td>\n",
+       "      <td>file-i.csv</td>\n",
+       "      <td>shane@mediabridgellc.com</td>\n",
+       "      <td>0</td>\n",
+       "      <td>ad374006-bbed-4c8b-932d-e7dfacce1a29</td>\n",
+       "      <td>aol.com</td>\n",
+       "      <td>6752ca9d-c848-4f3d-a66c-49073afe2458</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>592618</td>\n",
+       "      <td>05/16/2017</td>\n",
+       "      <td>The Title II order created a gaping gap in pri...</td>\n",
+       "      <td>batch-d-4.csv</td>\n",
+       "      <td>shane@mediabridgellc.com</td>\n",
+       "      <td>0</td>\n",
+       "      <td>c16e61f3-bd1f-4e50-95dd-830f5a219543</td>\n",
+       "      <td>gmail.com</td>\n",
+       "      <td>ebccbc49-fa14-404f-bf83-b0ef00d48e78</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1014169</td>\n",
+       "      <td>05/15/2017</td>\n",
+       "      <td>Dear Chairman Pai,  I'm very worried about Net...</td>\n",
+       "      <td>batch-b-5.csv</td>\n",
+       "      <td>shane@mediabridgellc.com</td>\n",
+       "      <td>0</td>\n",
+       "      <td>b252ca16-b5a2-4c61-9034-6e365bec0beb</td>\n",
+       "      <td>gmail.com</td>\n",
+       "      <td>bc102bc2-d454-44a4-974d-d0b1a377f392</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "               date                                           comments  \\\n",
+       "3236533     5/14/17  Dear Chairman Pai,  I am concerned about Inter...   \n",
+       "2723370     5/15/17  Dear Mr. Pai,  Regarding the Obama takeover of...   \n",
+       "56337       5/14/17  Chairman Pai:  Hi, I'd like to comment on Titl...   \n",
+       "592618   05/16/2017  The Title II order created a gaping gap in pri...   \n",
+       "1014169  05/15/2017  Dear Chairman Pai,  I'm very worried about Net...   \n",
+       "\n",
+       "                  file                  uploader  email_address_nonstandard  \\\n",
+       "3236533   Batch-A4.csv  shane@mediabridgellc.com                          0   \n",
+       "2723370   Batch-A2.csv  shane@mediabridgellc.com                          0   \n",
+       "56337       file-i.csv  shane@mediabridgellc.com                          0   \n",
+       "592618   batch-d-4.csv  shane@mediabridgellc.com                          0   \n",
+       "1014169  batch-b-5.csv  shane@mediabridgellc.com                          0   \n",
+       "\n",
+       "                                email_address email_domain  \\\n",
+       "3236533  44a8867c-3332-403f-9b34-560c054bd728    gmail.com   \n",
+       "2723370  f9ad6e74-115c-4ccd-8a4e-f1e408423942   icloud.com   \n",
+       "56337    ad374006-bbed-4c8b-932d-e7dfacce1a29      aol.com   \n",
+       "592618   c16e61f3-bd1f-4e50-95dd-830f5a219543    gmail.com   \n",
+       "1014169  b252ca16-b5a2-4c61-9034-6e365bec0beb    gmail.com   \n",
+       "\n",
+       "                            name_and_location  \n",
+       "3236533  058cbe31-9c92-4509-a8d9-50f84a1cf1ae  \n",
+       "2723370  99c87532-c65e-42fa-97bd-0438d3ff504c  \n",
+       "56337    6752ca9d-c848-4f3d-a66c-49073afe2458  \n",
+       "592618   ebccbc49-fa14-404f-bf83-b0ef00d48e78  \n",
+       "1014169  bc102bc2-d454-44a4-974d-d0b1a377f392  "
+      ]
+     },
+     "execution_count": 36,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "sample_17_108_mb = (\n",
+    "    bulk_uploads_17_108\n",
+    "    .loc[lambda df: df[\"uploader\"] == \"shane@mediabridgellc.com\"]\n",
+    "    .pipe(\n",
+    "        create_sample,\n",
+    "        n = 10000,\n",
+    "        random_state = RANDOM_STATE + 1,  # +1 so that we have an independent sample    \n",
+    "    )\n",
+    ")\n",
+    "\n",
+    "sample_17_108_mb.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 37,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>breach</th>\n",
+       "      <th>email_address</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <td>0</td>\n",
+       "      <td>Edmodo</td>\n",
+       "      <td>44a8867c-3332-403f-9b34-560c054bd728</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1</td>\n",
+       "      <td>ModernBusinessSolutions</td>\n",
+       "      <td>44a8867c-3332-403f-9b34-560c054bd728</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2</td>\n",
+       "      <td>RiverCityMedia</td>\n",
+       "      <td>44a8867c-3332-403f-9b34-560c054bd728</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3</td>\n",
+       "      <td>SpecialKSpamList</td>\n",
+       "      <td>44a8867c-3332-403f-9b34-560c054bd728</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4</td>\n",
+       "      <td>VerificationsIO</td>\n",
+       "      <td>44a8867c-3332-403f-9b34-560c054bd728</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                    breach                         email_address\n",
+       "0                   Edmodo  44a8867c-3332-403f-9b34-560c054bd728\n",
+       "1  ModernBusinessSolutions  44a8867c-3332-403f-9b34-560c054bd728\n",
+       "2           RiverCityMedia  44a8867c-3332-403f-9b34-560c054bd728\n",
+       "3         SpecialKSpamList  44a8867c-3332-403f-9b34-560c054bd728\n",
+       "4          VerificationsIO  44a8867c-3332-403f-9b34-560c054bd728"
+      ]
+     },
+     "execution_count": 37,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "breaches_17_108_mb = (\n",
+    "    sample_17_108_mb\n",
+    "    .pipe(get_breaches, \"../data/breaches-17-108-mb-sample.csv\")\n",
+    ")\n",
+    "\n",
+    "breaches_17_108_mb.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 38,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>uploader</th>\n",
+       "      <th>breach</th>\n",
+       "      <th>rate</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <td>0</td>\n",
+       "      <td>shane@mediabridgellc.com</td>\n",
+       "      <td>ModernBusinessSolutions</td>\n",
+       "      <td>0.9388</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1</td>\n",
+       "      <td>shane@mediabridgellc.com</td>\n",
+       "      <td>RiverCityMedia</td>\n",
+       "      <td>0.8277</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2</td>\n",
+       "      <td>shane@mediabridgellc.com</td>\n",
+       "      <td>VerificationsIO</td>\n",
+       "      <td>0.7651</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3</td>\n",
+       "      <td>shane@mediabridgellc.com</td>\n",
+       "      <td>Collection1</td>\n",
+       "      <td>0.2574</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4</td>\n",
+       "      <td>shane@mediabridgellc.com</td>\n",
+       "      <td>Exactis</td>\n",
+       "      <td>0.2571</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>5</td>\n",
+       "      <td>shane@mediabridgellc.com</td>\n",
+       "      <td>MySpace</td>\n",
+       "      <td>0.1968</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>6</td>\n",
+       "      <td>shane@mediabridgellc.com</td>\n",
+       "      <td>AntiPublic</td>\n",
+       "      <td>0.1956</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>7</td>\n",
+       "      <td>shane@mediabridgellc.com</td>\n",
+       "      <td>SpecialKSpamList</td>\n",
+       "      <td>0.1946</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>8</td>\n",
+       "      <td>shane@mediabridgellc.com</td>\n",
+       "      <td>OnlinerSpambot</td>\n",
+       "      <td>0.1941</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>9</td>\n",
+       "      <td>shane@mediabridgellc.com</td>\n",
+       "      <td>ExploitIn</td>\n",
+       "      <td>0.1826</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                   uploader                   breach    rate\n",
+       "0  shane@mediabridgellc.com  ModernBusinessSolutions  0.9388\n",
+       "1  shane@mediabridgellc.com           RiverCityMedia  0.8277\n",
+       "2  shane@mediabridgellc.com          VerificationsIO  0.7651\n",
+       "3  shane@mediabridgellc.com              Collection1  0.2574\n",
+       "4  shane@mediabridgellc.com                  Exactis  0.2571\n",
+       "5  shane@mediabridgellc.com                  MySpace  0.1968\n",
+       "6  shane@mediabridgellc.com               AntiPublic  0.1956\n",
+       "7  shane@mediabridgellc.com         SpecialKSpamList  0.1946\n",
+       "8  shane@mediabridgellc.com           OnlinerSpambot  0.1941\n",
+       "9  shane@mediabridgellc.com                ExploitIn  0.1826"
+      ]
+     },
+     "execution_count": 38,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "(\n",
+    "    calculate_breach_rates(\n",
+    "        sample_17_108_mb,\n",
+    "        breaches_17_108_mb,\n",
+    "    )\n",
+    "    .head(10)\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Comparing MBS breach status to Docket 16-42 overlap"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 39,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>isin_mbs</th>\n",
+       "      <th>isin_16</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <td>3236533</td>\n",
+       "      <td>True</td>\n",
+       "      <td>False</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2723370</td>\n",
+       "      <td>True</td>\n",
+       "      <td>False</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>56337</td>\n",
+       "      <td>True</td>\n",
+       "      <td>False</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>592618</td>\n",
+       "      <td>True</td>\n",
+       "      <td>False</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1014169</td>\n",
+       "      <td>True</td>\n",
+       "      <td>False</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "         isin_mbs  isin_16\n",
+       "3236533      True    False\n",
+       "2723370      True    False\n",
+       "56337        True    False\n",
+       "592618       True    False\n",
+       "1014169      True    False"
+      ]
+     },
+     "execution_count": 39,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "sample_17_108_mb_comparison = (\n",
+    "    sample_17_108_mb    \n",
+    "    .assign(\n",
+    "        isin_mbs = lambda df: (\n",
+    "            df\n",
+    "            [\"email_address\"].isin(\n",
+    "                breaches_17_108_mb\n",
+    "                .loc[lambda df: df[\"breach\"] == \"ModernBusinessSolutions\"]\n",
+    "                [\"email_address\"]\n",
+    "            )\n",
+    "        ),\n",
+    "        isin_16 = lambda df: (\n",
+    "            df\n",
+    "            [\"name_and_location\"]\n",
+    "            .isin(\n",
+    "                comments_16_42_classified\n",
+    "                .loc[lambda df: df[\"group\"] == \"American Commitment\"]\n",
+    "                [\"name_and_location\"]\n",
+    "            )\n",
+    "        )\n",
+    "    )\n",
+    "    [[\n",
+    "        \"isin_mbs\",\n",
+    "        \"isin_16\",\n",
+    "    ]]\n",
+    ")\n",
+    "\n",
+    "sample_17_108_mb_comparison.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Matrix of the 10,000-comment sample, by whether the email address exists in the Modern Business Solutions breach and whether the exact contact information shows up in the Docket 16-42 comments that used American Commitment's language:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 40,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th>isin_16</th>\n",
+       "      <th>False</th>\n",
+       "      <th>True</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>isin_mbs</th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <td>False</td>\n",
+       "      <td>35</td>\n",
+       "      <td>577</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>True</td>\n",
+       "      <td>9287</td>\n",
+       "      <td>101</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "isin_16   False  True \n",
+       "isin_mbs              \n",
+       "False        35    577\n",
+       "True       9287    101"
+      ]
+     },
+     "execution_count": 40,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "(\n",
+    "    sample_17_108_mb_comparison\n",
+    "    .groupby([\n",
+    "        \"isin_mbs\",\n",
+    "        \"isin_16\",\n",
+    "    ])\n",
+    "    .size()\n",
+    "    .unstack()\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Among comments whose email addresses do *not* appear in MBS, this is the proportion that use exactly the same contact information as in the Docket 16-42 comments using American Commitment's language:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 41,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0.9428"
+      ]
+     },
+     "execution_count": 41,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "(\n",
+    "    sample_17_108_mb_comparison\n",
+    "    .loc[lambda df: df[\"isin_mbs\"] == False]\n",
+    "    [\"isin_16\"]\n",
+    "    .mean()\n",
+    "    .round(4)\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Among comments that use exactly the same contact information as in the Docket 16-42 comments using American Commitment's language, this is the proportion whose email addresses appear in MBS:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 42,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0.149"
+      ]
+     },
+     "execution_count": 42,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "(\n",
+    "    sample_17_108_mb_comparison\n",
+    "    .loc[lambda df: df[\"isin_16\"] == True]\n",
+    "    [\"isin_mbs\"]\n",
+    "    .mean()\n",
+    "    .round(4)\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "This is the proportion of comments that *either* are attributed to email addresses that appear in the Modern Business Solutions breach *or* use exactly the same contact information as in the Docket 16-42 comments using American Commitment's language:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 43,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0.9965"
+      ]
+     },
+     "execution_count": 43,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "(\n",
+    "    sample_17_108_mb_comparison\n",
+    "    [[\n",
+    "        \"isin_mbs\",\n",
+    "        \"isin_16\",\n",
+    "    ]]\n",
+    "    .sum(axis = 1)\n",
+    "    .pipe(lambda x: x > 0)\n",
+    "    .mean()\n",
+    "    .round(4)\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "---\n",
+    "\n",
+    "---\n",
+    "\n",
+    "---"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.4"
+  },
+  "widgets": {
+   "application/vnd.jupyter.widget-state+json": {
+    "state": {
+     "04f19153d45345bea122f0226fd113c0": {
+      "model_module": "@jupyter-widgets/controls",
+      "model_module_version": "1.5.0",
+      "model_name": "HBoxModel",
+      "state": {
+       "_dom_classes": [],
+       "_model_module": "@jupyter-widgets/controls",
+       "_model_module_version": "1.5.0",
+       "_model_name": "HBoxModel",
+       "_view_count": null,
+       "_view_module": "@jupyter-widgets/controls",
+       "_view_module_version": "1.5.0",
+       "_view_name": "HBoxView",
+       "box_style": "",
+       "children": [
+        "IPY_MODEL_838c964bbd7f4c8d9f2c6c74101b7910",
+        "IPY_MODEL_559e9ca4781549da93712e2012813f5e"
+       ],
+       "layout": "IPY_MODEL_337a0b4574d94a47ab32d58f8a7a5c61"
+      }
+     },
+     "0b009166dfde4b37950925f2a90cb56f": {
+      "model_module": "@jupyter-widgets/controls",
+      "model_module_version": "1.5.0",
+      "model_name": "IntProgressModel",
+      "state": {
+       "_dom_classes": [],
+       "_model_module": "@jupyter-widgets/controls",
+       "_model_module_version": "1.5.0",
+       "_model_name": "IntProgressModel",
+       "_view_count": null,
+       "_view_module": "@jupyter-widgets/controls",
+       "_view_module_version": "1.5.0",
+       "_view_name": "ProgressView",
+       "bar_style": "success",
+       "description": "",
+       "description_tooltip": null,
+       "layout": "IPY_MODEL_57f0be729dfb476c82ef0deb5203147f",
+       "max": 10000,
+       "min": 0,
+       "orientation": "horizontal",
+       "style": "IPY_MODEL_0d53cbe8cfc742329b594e489cce0177",
+       "value": 10000
+      }
+     },
+     "0d53cbe8cfc742329b594e489cce0177": {
+      "model_module": "@jupyter-widgets/controls",
+      "model_module_version": "1.5.0",
+      "model_name": "ProgressStyleModel",
+      "state": {
+       "_model_module": "@jupyter-widgets/controls",
+       "_model_module_version": "1.5.0",
+       "_model_name": "ProgressStyleModel",
+       "_view_count": null,
+       "_view_module": "@jupyter-widgets/base",
+       "_view_module_version": "1.2.0",
+       "_view_name": "StyleView",
+       "bar_color": null,
+       "description_width": ""
+      }
+     },
+     "0d59382dc9a741a0877f290ffae90364": {
+      "model_module": "@jupyter-widgets/controls",
+      "model_module_version": "1.5.0",
+      "model_name": "ProgressStyleModel",
+      "state": {
+       "_model_module": "@jupyter-widgets/controls",
+       "_model_module_version": "1.5.0",
+       "_model_name": "ProgressStyleModel",
+       "_view_count": null,
+       "_view_module": "@jupyter-widgets/base",
+       "_view_module_version": "1.2.0",
+       "_view_name": "StyleView",
+       "bar_color": null,
+       "description_width": ""
+      }
+     },
+     "23b58dd86771493a96c5a267be77f946": {
+      "model_module": "@jupyter-widgets/base",
+      "model_module_version": "1.2.0",
+      "model_name": "LayoutModel",
+      "state": {
+       "_model_module": "@jupyter-widgets/base",
+       "_model_module_version": "1.2.0",
+       "_model_name": "LayoutModel",
+       "_view_count": null,
+       "_view_module": "@jupyter-widgets/base",
+       "_view_module_version": "1.2.0",
+       "_view_name": "LayoutView",
+       "align_content": null,
+       "align_items": null,
+       "align_self": null,
+       "border": null,
+       "bottom": null,
+       "display": null,
+       "flex": null,
+       "flex_flow": null,
+       "grid_area": null,
+       "grid_auto_columns": null,
+       "grid_auto_flow": null,
+       "grid_auto_rows": null,
+       "grid_column": null,
+       "grid_gap": null,
+       "grid_row": null,
+       "grid_template_areas": null,
+       "grid_template_columns": null,
+       "grid_template_rows": null,
+       "height": null,
+       "justify_content": null,
+       "justify_items": null,
+       "left": null,
+       "margin": null,
+       "max_height": null,
+       "max_width": null,
+       "min_height": null,
+       "min_width": null,
+       "object_fit": null,
+       "object_position": null,
+       "order": null,
+       "overflow": null,
+       "overflow_x": null,
+       "overflow_y": null,
+       "padding": null,
+       "right": null,
+       "top": null,
+       "visibility": null,
+       "width": null
+      }
+     },
+     "285330ed3c324f5d912879140bfa7e4e": {
+      "model_module": "@jupyter-widgets/controls",
+      "model_module_version": "1.5.0",
+      "model_name": "HBoxModel",
+      "state": {
+       "_dom_classes": [],
+       "_model_module": "@jupyter-widgets/controls",
+       "_model_module_version": "1.5.0",
+       "_model_name": "HBoxModel",
+       "_view_count": null,
+       "_view_module": "@jupyter-widgets/controls",
+       "_view_module_version": "1.5.0",
+       "_view_name": "HBoxView",
+       "box_style": "",
+       "children": [
+        "IPY_MODEL_8468b9602d9f45eb8fac9eafaf2cc596",
+        "IPY_MODEL_bdf6f69aeeb145dd8ce850b2624b1ca7"
+       ],
+       "layout": "IPY_MODEL_6af89b4e9c64425593860aecdde16787"
+      }
+     },
+     "2a49898891a14de69657f79c88ec06a0": {
+      "model_module": "@jupyter-widgets/controls",
+      "model_module_version": "1.5.0",
+      "model_name": "HBoxModel",
+      "state": {
+       "_dom_classes": [],
+       "_model_module": "@jupyter-widgets/controls",
+       "_model_module_version": "1.5.0",
+       "_model_name": "HBoxModel",
+       "_view_count": null,
+       "_view_module": "@jupyter-widgets/controls",
+       "_view_module_version": "1.5.0",
+       "_view_name": "HBoxView",
+       "box_style": "",
+       "children": [
+        "IPY_MODEL_31d73c99639d4336a842a3711be1de7b",
+        "IPY_MODEL_bc97f74081834de09453bb2fbc424e53"
+       ],
+       "layout": "IPY_MODEL_2e816c2f00c241599adbc09f90286f2f"
+      }
+     },
+     "2cf0ec36c28146b7b7911d8115fd9c08": {
+      "model_module": "@jupyter-widgets/base",
+      "model_module_version": "1.2.0",
+      "model_name": "LayoutModel",
+      "state": {
+       "_model_module": "@jupyter-widgets/base",
+       "_model_module_version": "1.2.0",
+       "_model_name": "LayoutModel",
+       "_view_count": null,
+       "_view_module": "@jupyter-widgets/base",
+       "_view_module_version": "1.2.0",
+       "_view_name": "LayoutView",
+       "align_content": null,
+       "align_items": null,
+       "align_self": null,
+       "border": null,
+       "bottom": null,
+       "display": null,
+       "flex": null,
+       "flex_flow": null,
+       "grid_area": null,
+       "grid_auto_columns": null,
+       "grid_auto_flow": null,
+       "grid_auto_rows": null,
+       "grid_column": null,
+       "grid_gap": null,
+       "grid_row": null,
+       "grid_template_areas": null,
+       "grid_template_columns": null,
+       "grid_template_rows": null,
+       "height": null,
+       "justify_content": null,
+       "justify_items": null,
+       "left": null,
+       "margin": null,
+       "max_height": null,
+       "max_width": null,
+       "min_height": null,
+       "min_width": null,
+       "object_fit": null,
+       "object_position": null,
+       "order": null,
+       "overflow": null,
+       "overflow_x": null,
+       "overflow_y": null,
+       "padding": null,
+       "right": null,
+       "top": null,
+       "visibility": null,
+       "width": null
+      }
+     },
+     "2e816c2f00c241599adbc09f90286f2f": {
+      "model_module": "@jupyter-widgets/base",
+      "model_module_version": "1.2.0",
+      "model_name": "LayoutModel",
+      "state": {
+       "_model_module": "@jupyter-widgets/base",
+       "_model_module_version": "1.2.0",
+       "_model_name": "LayoutModel",
+       "_view_count": null,
+       "_view_module": "@jupyter-widgets/base",
+       "_view_module_version": "1.2.0",
+       "_view_name": "LayoutView",
+       "align_content": null,
+       "align_items": null,
+       "align_self": null,
+       "border": null,
+       "bottom": null,
+       "display": null,
+       "flex": null,
+       "flex_flow": null,
+       "grid_area": null,
+       "grid_auto_columns": null,
+       "grid_auto_flow": null,
+       "grid_auto_rows": null,
+       "grid_column": null,
+       "grid_gap": null,
+       "grid_row": null,
+       "grid_template_areas": null,
+       "grid_template_columns": null,
+       "grid_template_rows": null,
+       "height": null,
+       "justify_content": null,
+       "justify_items": null,
+       "left": null,
+       "margin": null,
+       "max_height": null,
+       "max_width": null,
+       "min_height": null,
+       "min_width": null,
+       "object_fit": null,
+       "object_position": null,
+       "order": null,
+       "overflow": null,
+       "overflow_x": null,
+       "overflow_y": null,
+       "padding": null,
+       "right": null,
+       "top": null,
+       "visibility": null,
+       "width": null
+      }
+     },
+     "31d73c99639d4336a842a3711be1de7b": {
+      "model_module": "@jupyter-widgets/controls",
+      "model_module_version": "1.5.0",
+      "model_name": "IntProgressModel",
+      "state": {
+       "_dom_classes": [],
+       "_model_module": "@jupyter-widgets/controls",
+       "_model_module_version": "1.5.0",
+       "_model_name": "IntProgressModel",
+       "_view_count": null,
+       "_view_module": "@jupyter-widgets/controls",
+       "_view_module_version": "1.5.0",
+       "_view_name": "ProgressView",
+       "bar_style": "success",
+       "description": "",
+       "description_tooltip": null,
+       "layout": "IPY_MODEL_afa3bc7dc3ee45758b0d9edbae514f5a",
+       "max": 8000,
+       "min": 0,
+       "orientation": "horizontal",
+       "style": "IPY_MODEL_96183dd411c941fe95ee9fdf27b53e34",
+       "value": 8000
+      }
+     },
+     "337a0b4574d94a47ab32d58f8a7a5c61": {
+      "model_module": "@jupyter-widgets/base",
+      "model_module_version": "1.2.0",
+      "model_name": "LayoutModel",
+      "state": {
+       "_model_module": "@jupyter-widgets/base",
+       "_model_module_version": "1.2.0",
+       "_model_name": "LayoutModel",
+       "_view_count": null,
+       "_view_module": "@jupyter-widgets/base",
+       "_view_module_version": "1.2.0",
+       "_view_name": "LayoutView",
+       "align_content": null,
+       "align_items": null,
+       "align_self": null,
+       "border": null,
+       "bottom": null,
+       "display": null,
+       "flex": null,
+       "flex_flow": null,
+       "grid_area": null,
+       "grid_auto_columns": null,
+       "grid_auto_flow": null,
+       "grid_auto_rows": null,
+       "grid_column": null,
+       "grid_gap": null,
+       "grid_row": null,
+       "grid_template_areas": null,
+       "grid_template_columns": null,
+       "grid_template_rows": null,
+       "height": null,
+       "justify_content": null,
+       "justify_items": null,
+       "left": null,
+       "margin": null,
+       "max_height": null,
+       "max_width": null,
+       "min_height": null,
+       "min_width": null,
+       "object_fit": null,
+       "object_position": null,
+       "order": null,
+       "overflow": null,
+       "overflow_x": null,
+       "overflow_y": null,
+       "padding": null,
+       "right": null,
+       "top": null,
+       "visibility": null,
+       "width": null
+      }
+     },
+     "3f112798212e438c835992489e901c0c": {
+      "model_module": "@jupyter-widgets/controls",
+      "model_module_version": "1.5.0",
+      "model_name": "HTMLModel",
+      "state": {
+       "_dom_classes": [],
+       "_model_module": "@jupyter-widgets/controls",
+       "_model_module_version": "1.5.0",
+       "_model_name": "HTMLModel",
+       "_view_count": null,
+       "_view_module": "@jupyter-widgets/controls",
+       "_view_module_version": "1.5.0",
+       "_view_name": "HTMLView",
+       "description": "",
+       "description_tooltip": null,
+       "layout": "IPY_MODEL_23b58dd86771493a96c5a267be77f946",
+       "placeholder": "​",
+       "style": "IPY_MODEL_44767ed751bf416e972ed642a3a5244c",
+       "value": "100% 10000/10000 [5:22:12&lt;00:00,  1.93s/it]"
+      }
+     },
+     "44767ed751bf416e972ed642a3a5244c": {
+      "model_module": "@jupyter-widgets/controls",
+      "model_module_version": "1.5.0",
+      "model_name": "DescriptionStyleModel",
+      "state": {
+       "_model_module": "@jupyter-widgets/controls",
+       "_model_module_version": "1.5.0",
+       "_model_name": "DescriptionStyleModel",
+       "_view_count": null,
+       "_view_module": "@jupyter-widgets/base",
+       "_view_module_version": "1.2.0",
+       "_view_name": "StyleView",
+       "description_width": ""
+      }
+     },
+     "559e9ca4781549da93712e2012813f5e": {
+      "model_module": "@jupyter-widgets/controls",
+      "model_module_version": "1.5.0",
+      "model_name": "HTMLModel",
+      "state": {
+       "_dom_classes": [],
+       "_model_module": "@jupyter-widgets/controls",
+       "_model_module_version": "1.5.0",
+       "_model_name": "HTMLModel",
+       "_view_count": null,
+       "_view_module": "@jupyter-widgets/controls",
+       "_view_module_version": "1.5.0",
+       "_view_name": "HTMLView",
+       "description": "",
+       "description_tooltip": null,
+       "layout": "IPY_MODEL_87ae1a7a3c9b48f79f7a1cb1652f613c",
+       "placeholder": "​",
+       "style": "IPY_MODEL_c2a3af8a55c14d5fac09fd5d3670b9d6",
+       "value": "100% 2000/2000 [00:11&lt;00:00, 169.13it/s]"
+      }
+     },
+     "57f0be729dfb476c82ef0deb5203147f": {
+      "model_module": "@jupyter-widgets/base",
+      "model_module_version": "1.2.0",
+      "model_name": "LayoutModel",
+      "state": {
+       "_model_module": "@jupyter-widgets/base",
+       "_model_module_version": "1.2.0",
+       "_model_name": "LayoutModel",
+       "_view_count": null,
+       "_view_module": "@jupyter-widgets/base",
+       "_view_module_version": "1.2.0",
+       "_view_name": "LayoutView",
+       "align_content": null,
+       "align_items": null,
+       "align_self": null,
+       "border": null,
+       "bottom": null,
+       "display": null,
+       "flex": null,
+       "flex_flow": null,
+       "grid_area": null,
+       "grid_auto_columns": null,
+       "grid_auto_flow": null,
+       "grid_auto_rows": null,
+       "grid_column": null,
+       "grid_gap": null,
+       "grid_row": null,
+       "grid_template_areas": null,
+       "grid_template_columns": null,
+       "grid_template_rows": null,
+       "height": null,
+       "justify_content": null,
+       "justify_items": null,
+       "left": null,
+       "margin": null,
+       "max_height": null,
+       "max_width": null,
+       "min_height": null,
+       "min_width": null,
+       "object_fit": null,
+       "object_position": null,
+       "order": null,
+       "overflow": null,
+       "overflow_x": null,
+       "overflow_y": null,
+       "padding": null,
+       "right": null,
+       "top": null,
+       "visibility": null,
+       "width": null
+      }
+     },
+     "6af89b4e9c64425593860aecdde16787": {
+      "model_module": "@jupyter-widgets/base",
+      "model_module_version": "1.2.0",
+      "model_name": "LayoutModel",
+      "state": {
+       "_model_module": "@jupyter-widgets/base",
+       "_model_module_version": "1.2.0",
+       "_model_name": "LayoutModel",
+       "_view_count": null,
+       "_view_module": "@jupyter-widgets/base",
+       "_view_module_version": "1.2.0",
+       "_view_name": "LayoutView",
+       "align_content": null,
+       "align_items": null,
+       "align_self": null,
+       "border": null,
+       "bottom": null,
+       "display": null,
+       "flex": null,
+       "flex_flow": null,
+       "grid_area": null,
+       "grid_auto_columns": null,
+       "grid_auto_flow": null,
+       "grid_auto_rows": null,
+       "grid_column": null,
+       "grid_gap": null,
+       "grid_row": null,
+       "grid_template_areas": null,
+       "grid_template_columns": null,
+       "grid_template_rows": null,
+       "height": null,
+       "justify_content": null,
+       "justify_items": null,
+       "left": null,
+       "margin": null,
+       "max_height": null,
+       "max_width": null,
+       "min_height": null,
+       "min_width": null,
+       "object_fit": null,
+       "object_position": null,
+       "order": null,
+       "overflow": null,
+       "overflow_x": null,
+       "overflow_y": null,
+       "padding": null,
+       "right": null,
+       "top": null,
+       "visibility": null,
+       "width": null
+      }
+     },
+     "77429d4856194b3083b80bf4479caea2": {
+      "model_module": "@jupyter-widgets/base",
+      "model_module_version": "1.2.0",
+      "model_name": "LayoutModel",
+      "state": {
+       "_model_module": "@jupyter-widgets/base",
+       "_model_module_version": "1.2.0",
+       "_model_name": "LayoutModel",
+       "_view_count": null,
+       "_view_module": "@jupyter-widgets/base",
+       "_view_module_version": "1.2.0",
+       "_view_name": "LayoutView",
+       "align_content": null,
+       "align_items": null,
+       "align_self": null,
+       "border": null,
+       "bottom": null,
+       "display": null,
+       "flex": null,
+       "flex_flow": null,
+       "grid_area": null,
+       "grid_auto_columns": null,
+       "grid_auto_flow": null,
+       "grid_auto_rows": null,
+       "grid_column": null,
+       "grid_gap": null,
+       "grid_row": null,
+       "grid_template_areas": null,
+       "grid_template_columns": null,
+       "grid_template_rows": null,
+       "height": null,
+       "justify_content": null,
+       "justify_items": null,
+       "left": null,
+       "margin": null,
+       "max_height": null,
+       "max_width": null,
+       "min_height": null,
+       "min_width": null,
+       "object_fit": null,
+       "object_position": null,
+       "order": null,
+       "overflow": null,
+       "overflow_x": null,
+       "overflow_y": null,
+       "padding": null,
+       "right": null,
+       "top": null,
+       "visibility": null,
+       "width": null
+      }
+     },
+     "7c6c63382e934b509d42b2f9c6f14540": {
+      "model_module": "@jupyter-widgets/controls",
+      "model_module_version": "1.5.0",
+      "model_name": "DescriptionStyleModel",
+      "state": {
+       "_model_module": "@jupyter-widgets/controls",
+       "_model_module_version": "1.5.0",
+       "_model_name": "DescriptionStyleModel",
+       "_view_count": null,
+       "_view_module": "@jupyter-widgets/base",
+       "_view_module_version": "1.2.0",
+       "_view_name": "StyleView",
+       "description_width": ""
+      }
+     },
+     "805573d19962489a9ae73b224640c32d": {
+      "model_module": "@jupyter-widgets/base",
+      "model_module_version": "1.2.0",
+      "model_name": "LayoutModel",
+      "state": {
+       "_model_module": "@jupyter-widgets/base",
+       "_model_module_version": "1.2.0",
+       "_model_name": "LayoutModel",
+       "_view_count": null,
+       "_view_module": "@jupyter-widgets/base",
+       "_view_module_version": "1.2.0",
+       "_view_name": "LayoutView",
+       "align_content": null,
+       "align_items": null,
+       "align_self": null,
+       "border": null,
+       "bottom": null,
+       "display": null,
+       "flex": null,
+       "flex_flow": null,
+       "grid_area": null,
+       "grid_auto_columns": null,
+       "grid_auto_flow": null,
+       "grid_auto_rows": null,
+       "grid_column": null,
+       "grid_gap": null,
+       "grid_row": null,
+       "grid_template_areas": null,
+       "grid_template_columns": null,
+       "grid_template_rows": null,
+       "height": null,
+       "justify_content": null,
+       "justify_items": null,
+       "left": null,
+       "margin": null,
+       "max_height": null,
+       "max_width": null,
+       "min_height": null,
+       "min_width": null,
+       "object_fit": null,
+       "object_position": null,
+       "order": null,
+       "overflow": null,
+       "overflow_x": null,
+       "overflow_y": null,
+       "padding": null,
+       "right": null,
+       "top": null,
+       "visibility": null,
+       "width": null
+      }
+     },
+     "838c964bbd7f4c8d9f2c6c74101b7910": {
+      "model_module": "@jupyter-widgets/controls",
+      "model_module_version": "1.5.0",
+      "model_name": "IntProgressModel",
+      "state": {
+       "_dom_classes": [],
+       "_model_module": "@jupyter-widgets/controls",
+       "_model_module_version": "1.5.0",
+       "_model_name": "IntProgressModel",
+       "_view_count": null,
+       "_view_module": "@jupyter-widgets/controls",
+       "_view_module_version": "1.5.0",
+       "_view_name": "ProgressView",
+       "bar_style": "success",
+       "description": "",
+       "description_tooltip": null,
+       "layout": "IPY_MODEL_edc354a8cb6b4c93ab80301bbbebffa6",
+       "max": 2000,
+       "min": 0,
+       "orientation": "horizontal",
+       "style": "IPY_MODEL_bf299a0f514048aa9b25df89637fa3bb",
+       "value": 2000
+      }
+     },
+     "8468b9602d9f45eb8fac9eafaf2cc596": {
+      "model_module": "@jupyter-widgets/controls",
+      "model_module_version": "1.5.0",
+      "model_name": "IntProgressModel",
+      "state": {
+       "_dom_classes": [],
+       "_model_module": "@jupyter-widgets/controls",
+       "_model_module_version": "1.5.0",
+       "_model_name": "IntProgressModel",
+       "_view_count": null,
+       "_view_module": "@jupyter-widgets/controls",
+       "_view_module_version": "1.5.0",
+       "_view_name": "ProgressView",
+       "bar_style": "success",
+       "description": "",
+       "description_tooltip": null,
+       "layout": "IPY_MODEL_9163fc6bd9c3420bb8bc911c78eb317c",
+       "max": 10000,
+       "min": 0,
+       "orientation": "horizontal",
+       "style": "IPY_MODEL_e8c99f52be4b41e2b9b0421d46941707",
+       "value": 10000
+      }
+     },
+     "87ae1a7a3c9b48f79f7a1cb1652f613c": {
+      "model_module": "@jupyter-widgets/base",
+      "model_module_version": "1.2.0",
+      "model_name": "LayoutModel",
+      "state": {
+       "_model_module": "@jupyter-widgets/base",
+       "_model_module_version": "1.2.0",
+       "_model_name": "LayoutModel",
+       "_view_count": null,
+       "_view_module": "@jupyter-widgets/base",
+       "_view_module_version": "1.2.0",
+       "_view_name": "LayoutView",
+       "align_content": null,
+       "align_items": null,
+       "align_self": null,
+       "border": null,
+       "bottom": null,
+       "display": null,
+       "flex": null,
+       "flex_flow": null,
+       "grid_area": null,
+       "grid_auto_columns": null,
+       "grid_auto_flow": null,
+       "grid_auto_rows": null,
+       "grid_column": null,
+       "grid_gap": null,
+       "grid_row": null,
+       "grid_template_areas": null,
+       "grid_template_columns": null,
+       "grid_template_rows": null,
+       "height": null,
+       "justify_content": null,
+       "justify_items": null,
+       "left": null,
+       "margin": null,
+       "max_height": null,
+       "max_width": null,
+       "min_height": null,
+       "min_width": null,
+       "object_fit": null,
+       "object_position": null,
+       "order": null,
+       "overflow": null,
+       "overflow_x": null,
+       "overflow_y": null,
+       "padding": null,
+       "right": null,
+       "top": null,
+       "visibility": null,
+       "width": null
+      }
+     },
+     "8cb5cf5fa780495ca62e9dd284a39539": {
+      "model_module": "@jupyter-widgets/base",
+      "model_module_version": "1.2.0",
+      "model_name": "LayoutModel",
+      "state": {
+       "_model_module": "@jupyter-widgets/base",
+       "_model_module_version": "1.2.0",
+       "_model_name": "LayoutModel",
+       "_view_count": null,
+       "_view_module": "@jupyter-widgets/base",
+       "_view_module_version": "1.2.0",
+       "_view_name": "LayoutView",
+       "align_content": null,
+       "align_items": null,
+       "align_self": null,
+       "border": null,
+       "bottom": null,
+       "display": null,
+       "flex": null,
+       "flex_flow": null,
+       "grid_area": null,
+       "grid_auto_columns": null,
+       "grid_auto_flow": null,
+       "grid_auto_rows": null,
+       "grid_column": null,
+       "grid_gap": null,
+       "grid_row": null,
+       "grid_template_areas": null,
+       "grid_template_columns": null,
+       "grid_template_rows": null,
+       "height": null,
+       "justify_content": null,
+       "justify_items": null,
+       "left": null,
+       "margin": null,
+       "max_height": null,
+       "max_width": null,
+       "min_height": null,
+       "min_width": null,
+       "object_fit": null,
+       "object_position": null,
+       "order": null,
+       "overflow": null,
+       "overflow_x": null,
+       "overflow_y": null,
+       "padding": null,
+       "right": null,
+       "top": null,
+       "visibility": null,
+       "width": null
+      }
+     },
+     "9163fc6bd9c3420bb8bc911c78eb317c": {
+      "model_module": "@jupyter-widgets/base",
+      "model_module_version": "1.2.0",
+      "model_name": "LayoutModel",
+      "state": {
+       "_model_module": "@jupyter-widgets/base",
+       "_model_module_version": "1.2.0",
+       "_model_name": "LayoutModel",
+       "_view_count": null,
+       "_view_module": "@jupyter-widgets/base",
+       "_view_module_version": "1.2.0",
+       "_view_name": "LayoutView",
+       "align_content": null,
+       "align_items": null,
+       "align_self": null,
+       "border": null,
+       "bottom": null,
+       "display": null,
+       "flex": null,
+       "flex_flow": null,
+       "grid_area": null,
+       "grid_auto_columns": null,
+       "grid_auto_flow": null,
+       "grid_auto_rows": null,
+       "grid_column": null,
+       "grid_gap": null,
+       "grid_row": null,
+       "grid_template_areas": null,
+       "grid_template_columns": null,
+       "grid_template_rows": null,
+       "height": null,
+       "justify_content": null,
+       "justify_items": null,
+       "left": null,
+       "margin": null,
+       "max_height": null,
+       "max_width": null,
+       "min_height": null,
+       "min_width": null,
+       "object_fit": null,
+       "object_position": null,
+       "order": null,
+       "overflow": null,
+       "overflow_x": null,
+       "overflow_y": null,
+       "padding": null,
+       "right": null,
+       "top": null,
+       "visibility": null,
+       "width": null
+      }
+     },
+     "96183dd411c941fe95ee9fdf27b53e34": {
+      "model_module": "@jupyter-widgets/controls",
+      "model_module_version": "1.5.0",
+      "model_name": "ProgressStyleModel",
+      "state": {
+       "_model_module": "@jupyter-widgets/controls",
+       "_model_module_version": "1.5.0",
+       "_model_name": "ProgressStyleModel",
+       "_view_count": null,
+       "_view_module": "@jupyter-widgets/base",
+       "_view_module_version": "1.2.0",
+       "_view_name": "StyleView",
+       "bar_color": null,
+       "description_width": ""
+      }
+     },
+     "af021a898708423b88f8418f69fad55e": {
+      "model_module": "@jupyter-widgets/controls",
+      "model_module_version": "1.5.0",
+      "model_name": "HBoxModel",
+      "state": {
+       "_dom_classes": [],
+       "_model_module": "@jupyter-widgets/controls",
+       "_model_module_version": "1.5.0",
+       "_model_name": "HBoxModel",
+       "_view_count": null,
+       "_view_module": "@jupyter-widgets/controls",
+       "_view_module_version": "1.5.0",
+       "_view_name": "HBoxView",
+       "box_style": "",
+       "children": [
+        "IPY_MODEL_0b009166dfde4b37950925f2a90cb56f",
+        "IPY_MODEL_3f112798212e438c835992489e901c0c"
+       ],
+       "layout": "IPY_MODEL_fd06e30d7c514c66a23889f9cee4c7f6"
+      }
+     },
+     "afa3bc7dc3ee45758b0d9edbae514f5a": {
+      "model_module": "@jupyter-widgets/base",
+      "model_module_version": "1.2.0",
+      "model_name": "LayoutModel",
+      "state": {
+       "_model_module": "@jupyter-widgets/base",
+       "_model_module_version": "1.2.0",
+       "_model_name": "LayoutModel",
+       "_view_count": null,
+       "_view_module": "@jupyter-widgets/base",
+       "_view_module_version": "1.2.0",
+       "_view_name": "LayoutView",
+       "align_content": null,
+       "align_items": null,
+       "align_self": null,
+       "border": null,
+       "bottom": null,
+       "display": null,
+       "flex": null,
+       "flex_flow": null,
+       "grid_area": null,
+       "grid_auto_columns": null,
+       "grid_auto_flow": null,
+       "grid_auto_rows": null,
+       "grid_column": null,
+       "grid_gap": null,
+       "grid_row": null,
+       "grid_template_areas": null,
+       "grid_template_columns": null,
+       "grid_template_rows": null,
+       "height": null,
+       "justify_content": null,
+       "justify_items": null,
+       "left": null,
+       "margin": null,
+       "max_height": null,
+       "max_width": null,
+       "min_height": null,
+       "min_width": null,
+       "object_fit": null,
+       "object_position": null,
+       "order": null,
+       "overflow": null,
+       "overflow_x": null,
+       "overflow_y": null,
+       "padding": null,
+       "right": null,
+       "top": null,
+       "visibility": null,
+       "width": null
+      }
+     },
+     "b39529fe3149414a8643320754f56f5f": {
+      "model_module": "@jupyter-widgets/controls",
+      "model_module_version": "1.5.0",
+      "model_name": "HBoxModel",
+      "state": {
+       "_dom_classes": [],
+       "_model_module": "@jupyter-widgets/controls",
+       "_model_module_version": "1.5.0",
+       "_model_name": "HBoxModel",
+       "_view_count": null,
+       "_view_module": "@jupyter-widgets/controls",
+       "_view_module_version": "1.5.0",
+       "_view_name": "HBoxView",
+       "box_style": "",
+       "children": [
+        "IPY_MODEL_b7bd33b176c04f8fb4dc339a4d7ba001",
+        "IPY_MODEL_ea365edc947e4ffaafe4ade2c060d3a8"
+       ],
+       "layout": "IPY_MODEL_d44549eeeb85491fb80605c67ad820a4"
+      }
+     },
+     "b7bd33b176c04f8fb4dc339a4d7ba001": {
+      "model_module": "@jupyter-widgets/controls",
+      "model_module_version": "1.5.0",
+      "model_name": "IntProgressModel",
+      "state": {
+       "_dom_classes": [],
+       "_model_module": "@jupyter-widgets/controls",
+       "_model_module_version": "1.5.0",
+       "_model_name": "IntProgressModel",
+       "_view_count": null,
+       "_view_module": "@jupyter-widgets/controls",
+       "_view_module_version": "1.5.0",
+       "_view_name": "ProgressView",
+       "bar_style": "success",
+       "description": "",
+       "description_tooltip": null,
+       "layout": "IPY_MODEL_77429d4856194b3083b80bf4479caea2",
+       "max": 10000,
+       "min": 0,
+       "orientation": "horizontal",
+       "style": "IPY_MODEL_0d59382dc9a741a0877f290ffae90364",
+       "value": 10000
+      }
+     },
+     "bc97f74081834de09453bb2fbc424e53": {
+      "model_module": "@jupyter-widgets/controls",
+      "model_module_version": "1.5.0",
+      "model_name": "HTMLModel",
+      "state": {
+       "_dom_classes": [],
+       "_model_module": "@jupyter-widgets/controls",
+       "_model_module_version": "1.5.0",
+       "_model_name": "HTMLModel",
+       "_view_count": null,
+       "_view_module": "@jupyter-widgets/controls",
+       "_view_module_version": "1.5.0",
+       "_view_name": "HTMLView",
+       "description": "",
+       "description_tooltip": null,
+       "layout": "IPY_MODEL_2cf0ec36c28146b7b7911d8115fd9c08",
+       "placeholder": "​",
+       "style": "IPY_MODEL_7c6c63382e934b509d42b2f9c6f14540",
+       "value": "100% 8000/8000 [00:48&lt;00:00, 163.55it/s]"
+      }
+     },
+     "bdf6f69aeeb145dd8ce850b2624b1ca7": {
+      "model_module": "@jupyter-widgets/controls",
+      "model_module_version": "1.5.0",
+      "model_name": "HTMLModel",
+      "state": {
+       "_dom_classes": [],
+       "_model_module": "@jupyter-widgets/controls",
+       "_model_module_version": "1.5.0",
+       "_model_name": "HTMLModel",
+       "_view_count": null,
+       "_view_module": "@jupyter-widgets/controls",
+       "_view_module_version": "1.5.0",
+       "_view_name": "HTMLView",
+       "description": "",
+       "description_tooltip": null,
+       "layout": "IPY_MODEL_805573d19962489a9ae73b224640c32d",
+       "placeholder": "​",
+       "style": "IPY_MODEL_f4dc764a214645daad6e3a8bc0ca5db3",
+       "value": "100% 10000/10000 [00:58&lt;00:00, 169.89it/s]"
+      }
+     },
+     "bf299a0f514048aa9b25df89637fa3bb": {
+      "model_module": "@jupyter-widgets/controls",
+      "model_module_version": "1.5.0",
+      "model_name": "ProgressStyleModel",
+      "state": {
+       "_model_module": "@jupyter-widgets/controls",
+       "_model_module_version": "1.5.0",
+       "_model_name": "ProgressStyleModel",
+       "_view_count": null,
+       "_view_module": "@jupyter-widgets/base",
+       "_view_module_version": "1.2.0",
+       "_view_name": "StyleView",
+       "bar_color": null,
+       "description_width": ""
+      }
+     },
+     "c2a3af8a55c14d5fac09fd5d3670b9d6": {
+      "model_module": "@jupyter-widgets/controls",
+      "model_module_version": "1.5.0",
+      "model_name": "DescriptionStyleModel",
+      "state": {
+       "_model_module": "@jupyter-widgets/controls",
+       "_model_module_version": "1.5.0",
+       "_model_name": "DescriptionStyleModel",
+       "_view_count": null,
+       "_view_module": "@jupyter-widgets/base",
+       "_view_module_version": "1.2.0",
+       "_view_name": "StyleView",
+       "description_width": ""
+      }
+     },
+     "d44549eeeb85491fb80605c67ad820a4": {
+      "model_module": "@jupyter-widgets/base",
+      "model_module_version": "1.2.0",
+      "model_name": "LayoutModel",
+      "state": {
+       "_model_module": "@jupyter-widgets/base",
+       "_model_module_version": "1.2.0",
+       "_model_name": "LayoutModel",
+       "_view_count": null,
+       "_view_module": "@jupyter-widgets/base",
+       "_view_module_version": "1.2.0",
+       "_view_name": "LayoutView",
+       "align_content": null,
+       "align_items": null,
+       "align_self": null,
+       "border": null,
+       "bottom": null,
+       "display": null,
+       "flex": null,
+       "flex_flow": null,
+       "grid_area": null,
+       "grid_auto_columns": null,
+       "grid_auto_flow": null,
+       "grid_auto_rows": null,
+       "grid_column": null,
+       "grid_gap": null,
+       "grid_row": null,
+       "grid_template_areas": null,
+       "grid_template_columns": null,
+       "grid_template_rows": null,
+       "height": null,
+       "justify_content": null,
+       "justify_items": null,
+       "left": null,
+       "margin": null,
+       "max_height": null,
+       "max_width": null,
+       "min_height": null,
+       "min_width": null,
+       "object_fit": null,
+       "object_position": null,
+       "order": null,
+       "overflow": null,
+       "overflow_x": null,
+       "overflow_y": null,
+       "padding": null,
+       "right": null,
+       "top": null,
+       "visibility": null,
+       "width": null
+      }
+     },
+     "e8c99f52be4b41e2b9b0421d46941707": {
+      "model_module": "@jupyter-widgets/controls",
+      "model_module_version": "1.5.0",
+      "model_name": "ProgressStyleModel",
+      "state": {
+       "_model_module": "@jupyter-widgets/controls",
+       "_model_module_version": "1.5.0",
+       "_model_name": "ProgressStyleModel",
+       "_view_count": null,
+       "_view_module": "@jupyter-widgets/base",
+       "_view_module_version": "1.2.0",
+       "_view_name": "StyleView",
+       "bar_color": null,
+       "description_width": ""
+      }
+     },
+     "ea365edc947e4ffaafe4ade2c060d3a8": {
+      "model_module": "@jupyter-widgets/controls",
+      "model_module_version": "1.5.0",
+      "model_name": "HTMLModel",
+      "state": {
+       "_dom_classes": [],
+       "_model_module": "@jupyter-widgets/controls",
+       "_model_module_version": "1.5.0",
+       "_model_name": "HTMLModel",
+       "_view_count": null,
+       "_view_module": "@jupyter-widgets/controls",
+       "_view_module_version": "1.5.0",
+       "_view_name": "HTMLView",
+       "description": "",
+       "description_tooltip": null,
+       "layout": "IPY_MODEL_8cb5cf5fa780495ca62e9dd284a39539",
+       "placeholder": "​",
+       "style": "IPY_MODEL_eda854ec06cc44fea21cab1c9b6e0e16",
+       "value": "100% 10000/10000 [00:59&lt;00:00, 168.73it/s]"
+      }
+     },
+     "eda854ec06cc44fea21cab1c9b6e0e16": {
+      "model_module": "@jupyter-widgets/controls",
+      "model_module_version": "1.5.0",
+      "model_name": "DescriptionStyleModel",
+      "state": {
+       "_model_module": "@jupyter-widgets/controls",
+       "_model_module_version": "1.5.0",
+       "_model_name": "DescriptionStyleModel",
+       "_view_count": null,
+       "_view_module": "@jupyter-widgets/base",
+       "_view_module_version": "1.2.0",
+       "_view_name": "StyleView",
+       "description_width": ""
+      }
+     },
+     "edc354a8cb6b4c93ab80301bbbebffa6": {
+      "model_module": "@jupyter-widgets/base",
+      "model_module_version": "1.2.0",
+      "model_name": "LayoutModel",
+      "state": {
+       "_model_module": "@jupyter-widgets/base",
+       "_model_module_version": "1.2.0",
+       "_model_name": "LayoutModel",
+       "_view_count": null,
+       "_view_module": "@jupyter-widgets/base",
+       "_view_module_version": "1.2.0",
+       "_view_name": "LayoutView",
+       "align_content": null,
+       "align_items": null,
+       "align_self": null,
+       "border": null,
+       "bottom": null,
+       "display": null,
+       "flex": null,
+       "flex_flow": null,
+       "grid_area": null,
+       "grid_auto_columns": null,
+       "grid_auto_flow": null,
+       "grid_auto_rows": null,
+       "grid_column": null,
+       "grid_gap": null,
+       "grid_row": null,
+       "grid_template_areas": null,
+       "grid_template_columns": null,
+       "grid_template_rows": null,
+       "height": null,
+       "justify_content": null,
+       "justify_items": null,
+       "left": null,
+       "margin": null,
+       "max_height": null,
+       "max_width": null,
+       "min_height": null,
+       "min_width": null,
+       "object_fit": null,
+       "object_position": null,
+       "order": null,
+       "overflow": null,
+       "overflow_x": null,
+       "overflow_y": null,
+       "padding": null,
+       "right": null,
+       "top": null,
+       "visibility": null,
+       "width": null
+      }
+     },
+     "f4dc764a214645daad6e3a8bc0ca5db3": {
+      "model_module": "@jupyter-widgets/controls",
+      "model_module_version": "1.5.0",
+      "model_name": "DescriptionStyleModel",
+      "state": {
+       "_model_module": "@jupyter-widgets/controls",
+       "_model_module_version": "1.5.0",
+       "_model_name": "DescriptionStyleModel",
+       "_view_count": null,
+       "_view_module": "@jupyter-widgets/base",
+       "_view_module_version": "1.2.0",
+       "_view_name": "StyleView",
+       "description_width": ""
+      }
+     },
+     "fd06e30d7c514c66a23889f9cee4c7f6": {
+      "model_module": "@jupyter-widgets/base",
+      "model_module_version": "1.2.0",
+      "model_name": "LayoutModel",
+      "state": {
+       "_model_module": "@jupyter-widgets/base",
+       "_model_module_version": "1.2.0",
+       "_model_name": "LayoutModel",
+       "_view_count": null,
+       "_view_module": "@jupyter-widgets/base",
+       "_view_module_version": "1.2.0",
+       "_view_name": "LayoutView",
+       "align_content": null,
+       "align_items": null,
+       "align_self": null,
+       "border": null,
+       "bottom": null,
+       "display": null,
+       "flex": null,
+       "flex_flow": null,
+       "grid_area": null,
+       "grid_auto_columns": null,
+       "grid_auto_flow": null,
+       "grid_auto_rows": null,
+       "grid_column": null,
+       "grid_gap": null,
+       "grid_row": null,
+       "grid_template_areas": null,
+       "grid_template_columns": null,
+       "grid_template_rows": null,
+       "height": null,
+       "justify_content": null,
+       "justify_items": null,
+       "left": null,
+       "margin": null,
+       "max_height": null,
+       "max_width": null,
+       "min_height": null,
+       "min_width": null,
+       "object_fit": null,
+       "object_position": null,
+       "order": null,
+       "overflow": null,
+       "overflow_x": null,
+       "overflow_y": null,
+       "padding": null,
+       "right": null,
+       "top": null,
+       "visibility": null,
+       "width": null
+      }
+     }
+    },
+    "version_major": 2,
+    "version_minor": 0
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/notebooks/analyze-mb-comment-structure.ipynb b/notebooks/analyze-mb-comment-structure.ipynb
new file mode 100644
index 0000000..99aaf69
--- /dev/null
+++ b/notebooks/analyze-mb-comment-structure.ipynb
@@ -0,0 +1,2267 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Analyzing the structure of Media Bridge–submitted comments\n",
+    "\n",
+    "This notebook analyzes the comments uploaded by Media Bridge to FCC Docket 17-108, with a focus on understanding the structure behind the algorithmically generated ones."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Load the comments"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import re\n",
+    "import json\n",
+    "import math\n",
+    "from functools import reduce"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Media Bridge uploaded 1.9 million comments in total:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "1856553"
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "mb_comments = (\n",
+    "    pd.read_csv(\n",
+    "        \"../data/bulk-uploads-17-108-with-uuids.csv\",\n",
+    "        usecols = [ \"uploader\", \"comments\", \"email_address\" ],\n",
+    "        dtype = str,\n",
+    "    )\n",
+    "    .loc[lambda df: df[\"uploader\"] == \"shane@mediabridgellc.com\"]\n",
+    "    .assign(\n",
+    "        comments = lambda df: df[\"comments\"].str.replace(u\"\\xa0\", \" \")\n",
+    "    )\n",
+    ")\n",
+    "\n",
+    "len(mb_comments)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Some, however, are duplicates. There are 1.5 million unique comments, where uniqueness is defined as the combination of the comment text and the email address associated with the comment:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "1501759"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "mb_deduped = (\n",
+    "    mb_comments\n",
+    "    .drop_duplicates(subset = [ \"comments\", \"email_address\" ])\n",
+    ")\n",
+    "\n",
+    "len(mb_deduped)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Separate randomized vs. non-randomized comments\n",
+    "\n",
+    "About 472,000 of the comments have no internal randomization; they come from one of five pre-written variations. (One of those five has two sub-variations that differ only in formatting; as a result, there are six strings listed below.)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "non_randomized = [\n",
+    "    \"The Title II order created a gaping gap in privacy protections by taking the best cop, the FTC, off the beat. That is reason enough to support Chairman Pai's proposal to restore Internet freedom. Restore privacy by repealing Net Neutrality.\",\n",
+    "    \"Title II is a Depression-era regulatory framework designed for a telephone monopoly that no longer exists. It was wrong to apply it to the Internet and the FCC should repeal it and go back to the free-market approach that worked so well.\",\n",
+    "    \"The free-market Internet was an incredible engine of economic growth, innovation, and job creation since the 1990s and has already been substantially slowed by the 2015 Net Neutrality rules. The slowdown in investment is destroying jobs and risks a big future tax hike to make up for lost private investment. Save American jobs by repealing Net Neutrality.\",\n",
+    "    \"The FCC's Net Neutrality rules were written in the Obama White House by political staff and Tech Industry special interests who overruled the FCC's own experts. The FCC's own chief economist Tim Brennan called the rules \\\"an economics-free zone.\\\" They should be repealed.\",\n",
+    "    \"Obama's Net Neutrality order was the corrupt result of a corrupt process controlled by Silicon Valley special interests. It gives some of the biggest companies in the world a free ride at the expense of consumers and should be immediately repealed!\",\n",
+    "    ' \"Obama\\'s Net Neutrality order was the corrupt result of a corrupt process controlled by Silicon Valley special interests. It gives some of the biggest companies in the world a free ride at the expense of consumers and should be immediately repealed!\"',\n",
+    "]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "471677"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "mb_deduped_nonrandom = (\n",
+    "    mb_deduped\n",
+    "    .loc[lambda df: df[\"comments\"].isin(non_randomized)]\n",
+    ")\n",
+    "\n",
+    "len(mb_deduped_nonrandom)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>count</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <td>Title II is a Depression-era regulatory framework designed for a telephone monopoly that no longer exists. It was wrong to apply it to the Internet and the FCC should repeal it and go back to the free-market approach that worked so well.</td>\n",
+       "      <td>127501</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>The Title II order created a gaping gap in privacy protections by taking the best cop, the FTC, off the beat. That is reason enough to support Chairman Pai's proposal to restore Internet freedom. Restore privacy by repealing Net Neutrality.</td>\n",
+       "      <td>92884</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>The free-market Internet was an incredible engine of economic growth, innovation, and job creation since the 1990s and has already been substantially slowed by the 2015 Net Neutrality rules. The slowdown in investment is destroying jobs and risks a big future tax hike to make up for lost private investment. Save American jobs by repealing Net Neutrality.</td>\n",
+       "      <td>83072</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>Obama's Net Neutrality order was the corrupt result of a corrupt process controlled by Silicon Valley special interests. It gives some of the biggest companies in the world a free ride at the expense of consumers and should be immediately repealed!</td>\n",
+       "      <td>74809</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>The FCC's Net Neutrality rules were written in the Obama White House by political staff and Tech Industry special interests who overruled the FCC's own experts. The FCC's own chief economist Tim Brennan called the rules \"an economics-free zone.\" They should be repealed.</td>\n",
+       "      <td>62635</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>\"Obama's Net Neutrality order was the corrupt result of a corrupt process controlled by Silicon Valley special interests. It gives some of the biggest companies in the world a free ride at the expense of consumers and should be immediately repealed!\"</td>\n",
+       "      <td>30776</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                                     count\n",
+       "Title II is a Depression-era regulatory framewo...  127501\n",
+       "The Title II order created a gaping gap in priv...   92884\n",
+       "The free-market Internet was an incredible engi...   83072\n",
+       "Obama's Net Neutrality order was the corrupt re...   74809\n",
+       "The FCC's Net Neutrality rules were written in ...   62635\n",
+       " \"Obama's Net Neutrality order was the corrupt ...   30776"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "(\n",
+    "    mb_deduped_nonrandom\n",
+    "    [\"comments\"]\n",
+    "    .value_counts()\n",
+    "    .to_frame(\"count\")\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The remaining 1 million comments are, at least on their surface, unique: No two are exactly the same."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "1030082"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "mb_deduped_random = (\n",
+    "    mb_deduped\n",
+    "    .loc[lambda df: ~df[\"comments\"].isin(non_randomized)]\n",
+    ")\n",
+    "\n",
+    "len(mb_deduped_random)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# If two or more comments were the same, this cell would throw an error\n",
+    "assert mb_deduped_random[\"comments\"].value_counts().max() == 1"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Examples:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Dear Chairman Pai,  I would like to comment on Internet regulation. I strongly recommend Chairman Pai to repeal Obama's scheme to regulate the web. Americans, as opposed to Washington bureaucrats, should purchase the products they prefer. Obama's scheme to regulate the web is a betrayal of the open Internet. It stopped a free-market system that functioned supremely well for decades with broad bipartisan backing.\n",
+      "\n",
+      "To the Federal Communications Commission:  I'm concerned about network neutrality regulations. I'd like to request the government to undo The previous administration's order to control the web. Individual citizens, not the FCC, should enjoy whatever products they desire. The previous administration's order to control the web is a exploitation  of net neutrality. It broke a market-based framework that functioned remarkably smoothly for many years with nearly universal backing.\n",
+      "\n",
+      "Chairman Pai:  My comments re: regulations on the Internet. I'd like to suggest Ajit Pai to rescind Obama's scheme to take over the Internet. Internet users, rather than the FCC, should be free to purchase the products they choose. Obama's scheme to take over the Internet is a corruption of the open Internet. It stopped a free-market system that functioned very, very smoothly for decades with both parties' approval.\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(\"\\n\\n\".join(\n",
+    "    mb_deduped_random\n",
+    "    [\"comments\"]\n",
+    "    .sample(3, random_state = 0)\n",
+    "))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Reverse-engineer the structure of the randomized comments\n",
+    "\n",
+    "The following code represents BuzzFeed News' best estimate of how the randomized comments were generated.\n",
+    "\n",
+    "Each sub-list contains the possible variations, which appear to be selected (with equal weighting) at random. Sub-lists with only one item are \"fixed\"; they don't change from comment to comment.\n",
+    "\n",
+    "One exception is a repeated phrase at the beginning of the fourth sentence of each comment; it repeats whatever happens to have been randomly selected in a particular part of the second sentence. More details on that below."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "segments = [\n",
+    "    [\n",
+    "        \"To whom it may concern:  \",\n",
+    "        \"To the Federal Communications Commission:  \",\n",
+    "        \"FCC:  \",\n",
+    "        \"To the FCC:  \",\n",
+    "        \"Dear Commissioners:  \",\n",
+    "        \"Dear Mr. Pai,  \",\n",
+    "        \"Dear Chairman Pai,  \",\n",
+    "        \"Dear FCC,  \",\n",
+    "        \"Mr Pai:  \",\n",
+    "        \"FCC commissioners,  \",\n",
+    "        \"Chairman Pai:  \",\n",
+    "        \"\",\n",
+    "    ],\n",
+    "\n",
+    "    [\n",
+    "        \"I'm concerned about\",\n",
+    "        \"I am concerned about\",\n",
+    "        \"I have concerns about\",\n",
+    "        \"I'm very concerned about\",\n",
+    "        \"I'd like to share my thoughts on\",\n",
+    "        \"Hi, I'd like to comment on\",\n",
+    "        \"I would like to comment on\",\n",
+    "        \"I want to give my opinion on\",\n",
+    "        \"I have thoughts on\",\n",
+    "        \"I'm contacting you about\",\n",
+    "        \"I'm very worried about\",\n",
+    "        \"My comments re:\",\n",
+    "        \"In reference to\",\n",
+    "        \"I am a voter worried about\",\n",
+    "        \"I'm a voter worried about\",\n",
+    "        \"Regarding\",\n",
+    "        \"With respect to\",\n",
+    "        \"In the matter of\",\n",
+    "    ],\n",
+    "\n",
+    "    [ \" \" ],\n",
+    "    \n",
+    "    [\n",
+    "        \"the FCC's so-called Open Internet order\",\n",
+    "        \"Internet regulation and net neutrality\",\n",
+    "        \"the Obama takeover of the Internet\",        \n",
+    "        \"the FCC regulations on the Internet\",\n",
+    "        \"network neutrality regulations\",\n",
+    "        \"the FCC's Open Internet order\",\n",
+    "        \"the FCC rules on the Internet\",\n",
+    "        \"net neutrality and Title II\",\n",
+    "        \"Net Neutrality and Title II\",\n",
+    "        \"regulations on the Internet\",\n",
+    "        \"restoring Internet freedom\",\n",
+    "        \"net neutrality regulations\",\n",
+    "        \"Title 2 and net neutrality\",\n",
+    "        \"the future of the Internet\",\n",
+    "        \"the Open Internet order\",\n",
+    "        \"internet regulations\",\n",
+    "        \"net neutrality rules\",\n",
+    "        \"Internet regulation\",\n",
+    "        \"Network Neutrality\",\n",
+    "        \"an open Internet\",\n",
+    "        \"Internet freedom\",\n",
+    "        \"Internet Freedom\",\n",
+    "        \"Net neutrality\",\n",
+    "        \"net neutrality\",\n",
+    "        \"NET NEUTRALITY\",\n",
+    "        \"Title II rules\",\n",
+    "    ],\n",
+    "    \n",
+    "    [ \". I\" ],\n",
+    "\n",
+    "    [\n",
+    "        \"'d like to\",\n",
+    "        \" would like to\",\n",
+    "        \" want to\",\n",
+    "        \" strongly\",\n",
+    "        \"\",\n",
+    "    ],\n",
+    "    \n",
+    "    [\n",
+    "        \" \"\n",
+    "    ],\n",
+    "    \n",
+    "    [\n",
+    "        \"implore\",\n",
+    "        \"ask\",\n",
+    "        \"request\",\n",
+    "        \"urge\",\n",
+    "        \"encourage\",\n",
+    "        \"recommend\",\n",
+    "        \"suggest\",\n",
+    "        \"demand\",\n",
+    "        \"advocate\",\n",
+    "    ],\n",
+    "    \n",
+    "    [ \" \" ],\n",
+    "\n",
+    "    [\n",
+    "        \"you\",\n",
+    "        \"the FCC\",\n",
+    "        \"the Federal Communications Commission\",\n",
+    "        \"the commissioners\",\n",
+    "        \"the commission\",\n",
+    "        \"Chairman Pai\",\n",
+    "        \"Ajit Pai\",\n",
+    "        \"the government\"\n",
+    "    ],\n",
+    "    \n",
+    "    [ \" to \" ],\n",
+    "    \n",
+    "    [\n",
+    "        \"undo\",\n",
+    "        \"reverse\",\n",
+    "        \"repeal\",\n",
+    "        \"overturn\",\n",
+    "        \"rescind\",\n",
+    "    ],\n",
+    "\n",
+    "    [ \" \" ],\n",
+    "    \n",
+    "    [\n",
+    "        \"The previous administration's\",\n",
+    "        \"The Obama/Wheeler\",\n",
+    "        \"President Obama's\",\n",
+    "        \"Barack Obama's\",\n",
+    "        \"Tom Wheeler's\",\n",
+    "        \"Obama's\",\n",
+    "    ],\n",
+    "\n",
+    "    [ \" \" ],\n",
+    "    \n",
+    "    [\n",
+    "        \"decision\",\n",
+    "        \"scheme\",\n",
+    "        \"policy\",\n",
+    "        \"order\",\n",
+    "        \"power grab\",\n",
+    "        \"plan\",\n",
+    "    ],\n",
+    "    \n",
+    "    [ \" to \" ],\n",
+    "    \n",
+    "    [\n",
+    "        \"regulate\",\n",
+    "        \"control\",\n",
+    "        \"take over\",\n",
+    "    ],\n",
+    "\n",
+    "    [ \" \" ],\n",
+    "\n",
+    "    \n",
+    "    [\n",
+    "        \"broadband\",\n",
+    "        \"the web\",\n",
+    "        \"Internet access\",\n",
+    "        \"the Internet\",\n",
+    "    ],\n",
+    "    \n",
+    "    [ \". \" ],\n",
+    "    \n",
+    "    [\n",
+    "        \"Internet users\",\n",
+    "        \"Individual citizens\",\n",
+    "        \"People like me\",\n",
+    "        \"Citizens\",\n",
+    "        \"Individual Americans\",\n",
+    "        \"Americans\",\n",
+    "        \"Individuals\",\n",
+    "    ],\n",
+    "    \n",
+    "    [ \", \" ],\n",
+    "    \n",
+    "    [\n",
+    "        \"rather than\",\n",
+    "        \"as opposed to\",\n",
+    "        \"not\",\n",
+    "    ],\n",
+    "    \n",
+    "    [ \" \" ],\n",
+    "    \n",
+    "    [\n",
+    "        \"Washington bureaucrats\",\n",
+    "        \"Washington\",\n",
+    "        \"big government\",\n",
+    "        \"so-called experts\",\n",
+    "        \"unelected bureaucrats\",\n",
+    "        \"the FCC Enforcement Bureau\",\n",
+    "        \"the FCC\",\n",
+    "    ],\n",
+    "    \n",
+    "    [ \", \" ],\n",
+    "    \n",
+    "    [\n",
+    "        \"should be able to\",\n",
+    "        \"should be empowered to\",\n",
+    "        \"should be free to\",\n",
+    "        \"ought to\",\n",
+    "        \"deserve to\",\n",
+    "        \"should\",\n",
+    "    ],\n",
+    "    \n",
+    "    [\n",
+    "        \" \",\n",
+    "    ],\n",
+    "    \n",
+    "    [\n",
+    "        \"use\",\n",
+    "        \"enjoy\",\n",
+    "        \"purchase\",\n",
+    "        \"buy\",\n",
+    "        \"select\",\n",
+    "    ],\n",
+    "    \n",
+    "    [ \" \" ],\n",
+    "    \n",
+    "    [\n",
+    "        \"the\",\n",
+    "        \"whichever\",\n",
+    "        \"whatever\",\n",
+    "        \"which\",\n",
+    "    ],\n",
+    "    \n",
+    "    [ \" \" ],\n",
+    "        \n",
+    "    [\n",
+    "        \"products\",\n",
+    "        \"applications\",\n",
+    "        \"services\",\n",
+    "    ],\n",
+    "    \n",
+    "    [ \" \" ],\n",
+    "\n",
+    "    [\n",
+    "        \"they\",\n",
+    "        \"we\",\n",
+    "    ],\n",
+    "    \n",
+    "    [ \" \" ],\n",
+    "    \n",
+    "    [\n",
+    "        \"want\",\n",
+    "        \"desire\",\n",
+    "        \"prefer\",\n",
+    "        \"choose\",\n",
+    "    ],\n",
+    "    \n",
+    "    [ \". \" ],\n",
+    "    \n",
+    "    [\n",
+    "        \"The previous administration's\",\n",
+    "        \"The Obama/Wheeler\",\n",
+    "        \"President Obama's\",\n",
+    "        \"Barack Obama's\",\n",
+    "        \"Tom Wheeler's\",\n",
+    "        \"Obama's\",\n",
+    "    ],\n",
+    "\n",
+    "    [ \" \" ],\n",
+    "    \n",
+    "    [\n",
+    "        \"decision\",\n",
+    "        \"scheme\",\n",
+    "        \"policy\",\n",
+    "        \"order\",\n",
+    "        \"power grab\",\n",
+    "        \"plan\",\n",
+    "    ],\n",
+    "    \n",
+    "    [ \" to \" ],\n",
+    "    \n",
+    "    [\n",
+    "        \"regulate\",\n",
+    "        \"control\",\n",
+    "        \"take over\",\n",
+    "    ],\n",
+    "    \n",
+    "    [ \" \" ],\n",
+    "    \n",
+    "    [\n",
+    "        \"broadband\",\n",
+    "        \"the web\",\n",
+    "        \"Internet access\",\n",
+    "        \"the Internet\",\n",
+    "    ],\n",
+    "    \n",
+    "    [ \" is a \" ],\n",
+    "    \n",
+    "    [\n",
+    "        \"exploitation \",\n",
+    "        \"distortion\",\n",
+    "        \"perversion\",\n",
+    "        \"corruption\",\n",
+    "        \"betrayal\",\n",
+    "    ],\n",
+    "    \n",
+    "    [ \" of \" ],\n",
+    "    \n",
+    "    [\n",
+    "        \"net neutrality\",\n",
+    "        \"the open Internet\",\n",
+    "    ],\n",
+    "    \n",
+    "    [ \". It \" ],\n",
+    "    \n",
+    "    [\n",
+    "        \"disrupted\",\n",
+    "        \"undid\",\n",
+    "        \"reversed\",\n",
+    "        \"ended\",\n",
+    "        \"broke\",\n",
+    "        \"stopped\",\n",
+    "    ],\n",
+    "    \n",
+    "    [ \" a \" ],\n",
+    "    \n",
+    "    [\n",
+    "        \"light-touch\",\n",
+    "        \"pro-consumer\",\n",
+    "        \"hands-off\",\n",
+    "        \"free-market\",\n",
+    "        \"market-based\",\n",
+    "    ],\n",
+    "    \n",
+    "    [ \" \" ],\n",
+    "    \n",
+    "    [\n",
+    "        \"policy\",\n",
+    "        \"system\",\n",
+    "        \"approach\",\n",
+    "        \"framework\",\n",
+    "    ],\n",
+    "    \n",
+    "    [ \" that \" ],\n",
+    "    \n",
+    "    [\n",
+    "        \"functioned\",\n",
+    "        \"performed\",\n",
+    "        \"worked\",\n",
+    "    ],\n",
+    "    \n",
+    "    [ \" \" ],\n",
+    "    \n",
+    "    [\n",
+    "        \"supremely\",\n",
+    "        \"very, very\",\n",
+    "        \"very\",\n",
+    "        \"remarkably\",\n",
+    "        \"fabulously\",\n",
+    "        \"exceptionally\",\n",
+    "    ],\n",
+    "    \n",
+    "    [ \" \" ],\n",
+    "    \n",
+    "    [\n",
+    "        \"well\",\n",
+    "        \"successfully\",\n",
+    "        \"smoothly\",\n",
+    "    ],\n",
+    "    \n",
+    "    [ \" for \" ],\n",
+    "    \n",
+    "    [\n",
+    "        \"many years\",\n",
+    "        \"decades\",\n",
+    "        \"a long time\",\n",
+    "        \"two decades\",\n",
+    "    ],\n",
+    "    \n",
+    "    [ \" with \" ],\n",
+    "    \n",
+    "    [\n",
+    "        \"nearly universal\",\n",
+    "        \"broad bipartisan\",\n",
+    "        \"bipartisan\",\n",
+    "        \"both parties'\",\n",
+    "        \"Republican and Democrat\",\n",
+    "    ],\n",
+    "\n",
+    "    [ \" \" ],\n",
+    "    \n",
+    "    [\n",
+    "        \"support\",\n",
+    "        \"consensus\",\n",
+    "        \"approval\",\n",
+    "        \"backing\",\n",
+    "    ],\n",
+    "    \n",
+    "    [ \".\" ]\n",
+    "    \n",
+    "]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Check that pattern fully matches comments\n",
+    "\n",
+    "Here, we compile the comment segments into a single regular expression, which we use to check whether comments match the reverse-engineered model."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def segments_to_pattern(segments):\n",
+    "    return re.compile(r\"^\" + r\"\".join(\n",
+    "    r\"(\" + r\"|\".join(re.escape(option) for option in seg) + r\")\"\n",
+    "        for seg in segments) + r\"$\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pattern = segments_to_pattern(segments)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "All comments match (otherwise, the result would be greater than zero):"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0"
+      ]
+     },
+     "execution_count": 13,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "sum(re.match(pattern, x) is None for x in mb_deduped_random[\"comments\"].values)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Check that there are no superfluous permutations\n",
+    "\n",
+    "Although the model above succeeds in matching all comments, so would a model that contained, for example, the entire English language. So here we check whether any individual part of the pattern is superfluous, by incrementally removing each one, and seeing whether the comments still match the pattern. (Here we use a random sample of comments, to speed up the process.)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sample_comments = (\n",
+    "    mb_deduped_random\n",
+    "    [\"comments\"]\n",
+    "    .sample(1000, random_state = 0)\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# A lack of output for this cell is a good thing;\n",
+    "# it means no part of the model is superfluous\n",
+    "\n",
+    "for i, segment in enumerate(segments):\n",
+    "    # For each sub-part of each segment ...\n",
+    "    for j, option in enumerate(segment):\n",
+    "        \n",
+    "        # Replace the sub-part with \"###\", and then test\n",
+    "        # whether the pattern-matching fails. It should fail;\n",
+    "        # if it does not, then the sub-part is superfluous.\n",
+    "        segments_copy = list([ list(o) for o in segments ])\n",
+    "        segments_copy[i][j] = \"###\"\n",
+    "        new_pattern = segments_to_pattern(segments_copy)\n",
+    "        \n",
+    "        num_nonmatching_comments = sum((re.match(new_pattern, x) is None)\n",
+    "            for x in sample_comments.values)\n",
+    "        \n",
+    "        # If all of the comments still match after the \"###\" \n",
+    "        # substitution, then the replaced sub-part isn't necessary\n",
+    "        # to the model.\n",
+    "        if num_nonmatching_comments == 0:\n",
+    "            print(i, j, option)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Check that segments are randomized independently\n",
+    "\n",
+    "In some text-generation models, the value of one segment may influence the possible values (or weights for those values) of subsequent segments. Here, we check whether that appears to be true for the actual model that generated these comments.\n",
+    "\n",
+    "First, we extract the bits of text that each comment has used for each section, skipping the \"fixed\" segments. (Here again we use a random sample of comments, to speed things up.)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[2, 4, 6]"
+      ]
+     },
+     "execution_count": 16,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "FIXED_SEGMENT_INDEX = [ i for i, x in enumerate(segments) if len(x) == 1 ]\n",
+    "FIXED_SEGMENT_INDEX[:3]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def extract_permutations(comment):\n",
+    "    permutations = [ (i, g) for i, g in enumerate(re.match(pattern, comment).groups())\n",
+    "        if i not in FIXED_SEGMENT_INDEX ]\n",
+    "    \n",
+    "    return pd.DataFrame(\n",
+    "        permutations,\n",
+    "        columns = [ \"seg_i\", \"option\" ],        \n",
+    "    )"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Example, for the first comment in the sample:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>seg_i</th>\n",
+       "      <th>option</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>Dear Chairman Pai,</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "      <td>I would like to comment on</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2</td>\n",
+       "      <td>3</td>\n",
+       "      <td>Internet regulation</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3</td>\n",
+       "      <td>5</td>\n",
+       "      <td>strongly</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4</td>\n",
+       "      <td>7</td>\n",
+       "      <td>recommend</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>5</td>\n",
+       "      <td>9</td>\n",
+       "      <td>Chairman Pai</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>6</td>\n",
+       "      <td>11</td>\n",
+       "      <td>repeal</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>7</td>\n",
+       "      <td>13</td>\n",
+       "      <td>Obama's</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>8</td>\n",
+       "      <td>15</td>\n",
+       "      <td>scheme</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>9</td>\n",
+       "      <td>17</td>\n",
+       "      <td>regulate</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>10</td>\n",
+       "      <td>19</td>\n",
+       "      <td>the web</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>11</td>\n",
+       "      <td>21</td>\n",
+       "      <td>Americans</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>12</td>\n",
+       "      <td>23</td>\n",
+       "      <td>as opposed to</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>13</td>\n",
+       "      <td>25</td>\n",
+       "      <td>Washington bureaucrats</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>14</td>\n",
+       "      <td>27</td>\n",
+       "      <td>should</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>15</td>\n",
+       "      <td>29</td>\n",
+       "      <td>purchase</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>16</td>\n",
+       "      <td>31</td>\n",
+       "      <td>the</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>17</td>\n",
+       "      <td>33</td>\n",
+       "      <td>products</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>18</td>\n",
+       "      <td>35</td>\n",
+       "      <td>they</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>19</td>\n",
+       "      <td>37</td>\n",
+       "      <td>prefer</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>20</td>\n",
+       "      <td>39</td>\n",
+       "      <td>Obama's</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>21</td>\n",
+       "      <td>41</td>\n",
+       "      <td>scheme</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>22</td>\n",
+       "      <td>43</td>\n",
+       "      <td>regulate</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>23</td>\n",
+       "      <td>45</td>\n",
+       "      <td>the web</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>24</td>\n",
+       "      <td>47</td>\n",
+       "      <td>betrayal</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>25</td>\n",
+       "      <td>49</td>\n",
+       "      <td>the open Internet</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>26</td>\n",
+       "      <td>51</td>\n",
+       "      <td>stopped</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>27</td>\n",
+       "      <td>53</td>\n",
+       "      <td>free-market</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>28</td>\n",
+       "      <td>55</td>\n",
+       "      <td>system</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>29</td>\n",
+       "      <td>57</td>\n",
+       "      <td>functioned</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>30</td>\n",
+       "      <td>59</td>\n",
+       "      <td>supremely</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>31</td>\n",
+       "      <td>61</td>\n",
+       "      <td>well</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>32</td>\n",
+       "      <td>63</td>\n",
+       "      <td>decades</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>33</td>\n",
+       "      <td>65</td>\n",
+       "      <td>broad bipartisan</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>34</td>\n",
+       "      <td>67</td>\n",
+       "      <td>backing</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "    seg_i                      option\n",
+       "0       0        Dear Chairman Pai,  \n",
+       "1       1  I would like to comment on\n",
+       "2       3         Internet regulation\n",
+       "3       5                    strongly\n",
+       "4       7                   recommend\n",
+       "5       9                Chairman Pai\n",
+       "6      11                      repeal\n",
+       "7      13                     Obama's\n",
+       "8      15                      scheme\n",
+       "9      17                    regulate\n",
+       "10     19                     the web\n",
+       "11     21                   Americans\n",
+       "12     23               as opposed to\n",
+       "13     25      Washington bureaucrats\n",
+       "14     27                      should\n",
+       "15     29                    purchase\n",
+       "16     31                         the\n",
+       "17     33                    products\n",
+       "18     35                        they\n",
+       "19     37                      prefer\n",
+       "20     39                     Obama's\n",
+       "21     41                      scheme\n",
+       "22     43                    regulate\n",
+       "23     45                     the web\n",
+       "24     47                    betrayal\n",
+       "25     49           the open Internet\n",
+       "26     51                     stopped\n",
+       "27     53                 free-market\n",
+       "28     55                      system\n",
+       "29     57                  functioned\n",
+       "30     59                   supremely\n",
+       "31     61                        well\n",
+       "32     63                     decades\n",
+       "33     65            broad bipartisan\n",
+       "34     67                     backing"
+      ]
+     },
+     "execution_count": 18,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "extract_permutations(sample_comments.iloc[0])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Here, we create a DataFrame of all extracted segments:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th>seg_i</th>\n",
+       "      <th>0</th>\n",
+       "      <th>1</th>\n",
+       "      <th>3</th>\n",
+       "      <th>5</th>\n",
+       "      <th>7</th>\n",
+       "      <th>9</th>\n",
+       "      <th>11</th>\n",
+       "      <th>13</th>\n",
+       "      <th>15</th>\n",
+       "      <th>17</th>\n",
+       "      <th>...</th>\n",
+       "      <th>49</th>\n",
+       "      <th>51</th>\n",
+       "      <th>53</th>\n",
+       "      <th>55</th>\n",
+       "      <th>57</th>\n",
+       "      <th>59</th>\n",
+       "      <th>61</th>\n",
+       "      <th>63</th>\n",
+       "      <th>65</th>\n",
+       "      <th>67</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>comment_i</th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <td>0</td>\n",
+       "      <td>Dear Chairman Pai,</td>\n",
+       "      <td>I would like to comment on</td>\n",
+       "      <td>Internet regulation</td>\n",
+       "      <td>strongly</td>\n",
+       "      <td>recommend</td>\n",
+       "      <td>Chairman Pai</td>\n",
+       "      <td>repeal</td>\n",
+       "      <td>Obama's</td>\n",
+       "      <td>scheme</td>\n",
+       "      <td>regulate</td>\n",
+       "      <td>...</td>\n",
+       "      <td>the open Internet</td>\n",
+       "      <td>stopped</td>\n",
+       "      <td>free-market</td>\n",
+       "      <td>system</td>\n",
+       "      <td>functioned</td>\n",
+       "      <td>supremely</td>\n",
+       "      <td>well</td>\n",
+       "      <td>decades</td>\n",
+       "      <td>broad bipartisan</td>\n",
+       "      <td>backing</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1</td>\n",
+       "      <td>To the Federal Communications Commission:</td>\n",
+       "      <td>I'm concerned about</td>\n",
+       "      <td>network neutrality regulations</td>\n",
+       "      <td>'d like to</td>\n",
+       "      <td>request</td>\n",
+       "      <td>the government</td>\n",
+       "      <td>undo</td>\n",
+       "      <td>The previous administration's</td>\n",
+       "      <td>order</td>\n",
+       "      <td>control</td>\n",
+       "      <td>...</td>\n",
+       "      <td>net neutrality</td>\n",
+       "      <td>broke</td>\n",
+       "      <td>market-based</td>\n",
+       "      <td>framework</td>\n",
+       "      <td>functioned</td>\n",
+       "      <td>remarkably</td>\n",
+       "      <td>smoothly</td>\n",
+       "      <td>many years</td>\n",
+       "      <td>nearly universal</td>\n",
+       "      <td>backing</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2</td>\n",
+       "      <td>Chairman Pai:</td>\n",
+       "      <td>My comments re:</td>\n",
+       "      <td>regulations on the Internet</td>\n",
+       "      <td>'d like to</td>\n",
+       "      <td>suggest</td>\n",
+       "      <td>Ajit Pai</td>\n",
+       "      <td>rescind</td>\n",
+       "      <td>Obama's</td>\n",
+       "      <td>scheme</td>\n",
+       "      <td>take over</td>\n",
+       "      <td>...</td>\n",
+       "      <td>the open Internet</td>\n",
+       "      <td>stopped</td>\n",
+       "      <td>free-market</td>\n",
+       "      <td>system</td>\n",
+       "      <td>functioned</td>\n",
+       "      <td>very, very</td>\n",
+       "      <td>smoothly</td>\n",
+       "      <td>decades</td>\n",
+       "      <td>both parties'</td>\n",
+       "      <td>approval</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3</td>\n",
+       "      <td>Dear Mr. Pai,</td>\n",
+       "      <td>Hi, I'd like to comment on</td>\n",
+       "      <td>the FCC rules on the Internet</td>\n",
+       "      <td></td>\n",
+       "      <td>ask</td>\n",
+       "      <td>Ajit Pai</td>\n",
+       "      <td>reverse</td>\n",
+       "      <td>The Obama/Wheeler</td>\n",
+       "      <td>scheme</td>\n",
+       "      <td>regulate</td>\n",
+       "      <td>...</td>\n",
+       "      <td>the open Internet</td>\n",
+       "      <td>reversed</td>\n",
+       "      <td>hands-off</td>\n",
+       "      <td>policy</td>\n",
+       "      <td>functioned</td>\n",
+       "      <td>remarkably</td>\n",
+       "      <td>smoothly</td>\n",
+       "      <td>many years</td>\n",
+       "      <td>Republican and Democrat</td>\n",
+       "      <td>consensus</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4</td>\n",
+       "      <td>Mr Pai:</td>\n",
+       "      <td>I'm contacting you about</td>\n",
+       "      <td>the FCC's Open Internet order</td>\n",
+       "      <td></td>\n",
+       "      <td>request</td>\n",
+       "      <td>the FCC</td>\n",
+       "      <td>repeal</td>\n",
+       "      <td>The Obama/Wheeler</td>\n",
+       "      <td>plan</td>\n",
+       "      <td>take over</td>\n",
+       "      <td>...</td>\n",
+       "      <td>the open Internet</td>\n",
+       "      <td>reversed</td>\n",
+       "      <td>light-touch</td>\n",
+       "      <td>system</td>\n",
+       "      <td>performed</td>\n",
+       "      <td>very, very</td>\n",
+       "      <td>smoothly</td>\n",
+       "      <td>many years</td>\n",
+       "      <td>Republican and Democrat</td>\n",
+       "      <td>backing</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>5 rows × 35 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "seg_i                                               0   \\\n",
+       "comment_i                                                \n",
+       "0                                 Dear Chairman Pai,     \n",
+       "1          To the Federal Communications Commission:     \n",
+       "2                                      Chairman Pai:     \n",
+       "3                                      Dear Mr. Pai,     \n",
+       "4                                            Mr Pai:     \n",
+       "\n",
+       "seg_i                              1                               3   \\\n",
+       "comment_i                                                               \n",
+       "0          I would like to comment on             Internet regulation   \n",
+       "1                 I'm concerned about  network neutrality regulations   \n",
+       "2                     My comments re:     regulations on the Internet   \n",
+       "3          Hi, I'd like to comment on   the FCC rules on the Internet   \n",
+       "4            I'm contacting you about   the FCC's Open Internet order   \n",
+       "\n",
+       "seg_i              5          7               9        11  \\\n",
+       "comment_i                                                   \n",
+       "0            strongly  recommend    Chairman Pai   repeal   \n",
+       "1          'd like to    request  the government     undo   \n",
+       "2          'd like to    suggest        Ajit Pai  rescind   \n",
+       "3                            ask        Ajit Pai  reverse   \n",
+       "4                        request         the FCC   repeal   \n",
+       "\n",
+       "seg_i                                 13      15         17  ...  \\\n",
+       "comment_i                                                    ...   \n",
+       "0                                Obama's  scheme   regulate  ...   \n",
+       "1          The previous administration's   order    control  ...   \n",
+       "2                                Obama's  scheme  take over  ...   \n",
+       "3                      The Obama/Wheeler  scheme   regulate  ...   \n",
+       "4                      The Obama/Wheeler    plan  take over  ...   \n",
+       "\n",
+       "seg_i                     49        51            53         55          57  \\\n",
+       "comment_i                                                                     \n",
+       "0          the open Internet   stopped   free-market     system  functioned   \n",
+       "1             net neutrality     broke  market-based  framework  functioned   \n",
+       "2          the open Internet   stopped   free-market     system  functioned   \n",
+       "3          the open Internet  reversed     hands-off     policy  functioned   \n",
+       "4          the open Internet  reversed   light-touch     system   performed   \n",
+       "\n",
+       "seg_i              59        61          63                       65  \\\n",
+       "comment_i                                                              \n",
+       "0           supremely      well     decades         broad bipartisan   \n",
+       "1          remarkably  smoothly  many years         nearly universal   \n",
+       "2          very, very  smoothly     decades            both parties'   \n",
+       "3          remarkably  smoothly  many years  Republican and Democrat   \n",
+       "4          very, very  smoothly  many years  Republican and Democrat   \n",
+       "\n",
+       "seg_i             67  \n",
+       "comment_i             \n",
+       "0            backing  \n",
+       "1            backing  \n",
+       "2           approval  \n",
+       "3          consensus  \n",
+       "4            backing  \n",
+       "\n",
+       "[5 rows x 35 columns]"
+      ]
+     },
+     "execution_count": 19,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "extracted = (\n",
+    "    pd.concat([ extract_permutations(x).assign(comment_i = i)\n",
+    "        for i, x in enumerate(sample_comments) ])\n",
+    "    .set_index([\n",
+    "        \"comment_i\",\n",
+    "        \"seg_i\",\n",
+    "    ])\n",
+    "    [\"option\"]\n",
+    "    .unstack()\n",
+    ")\n",
+    "\n",
+    "extracted.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "To test for the independence of randomization, we turn each segment option into a dummy variable and calculate the correlation between every pair of options that belong to different segments:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>seg_a</th>\n",
+       "      <th>seg_b</th>\n",
+       "      <th>corr</th>\n",
+       "      <th>seg_int_a</th>\n",
+       "      <th>seg_int_b</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <td>2508</td>\n",
+       "      <td>0_</td>\n",
+       "      <td>1_Hi, I'd like to comment on</td>\n",
+       "      <td>0.035640</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2509</td>\n",
+       "      <td>0_Chairman Pai:</td>\n",
+       "      <td>1_Hi, I'd like to comment on</td>\n",
+       "      <td>-0.043523</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2510</td>\n",
+       "      <td>0_Dear Chairman Pai,</td>\n",
+       "      <td>1_Hi, I'd like to comment on</td>\n",
+       "      <td>-0.030419</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2511</td>\n",
+       "      <td>0_Dear Commissioners:</td>\n",
+       "      <td>1_Hi, I'd like to comment on</td>\n",
+       "      <td>-0.040624</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2512</td>\n",
+       "      <td>0_Dear FCC,</td>\n",
+       "      <td>1_Hi, I'd like to comment on</td>\n",
+       "      <td>-0.005508</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                        seg_a                         seg_b      corr  \\\n",
+       "2508                       0_  1_Hi, I'd like to comment on  0.035640   \n",
+       "2509        0_Chairman Pai:    1_Hi, I'd like to comment on -0.043523   \n",
+       "2510   0_Dear Chairman Pai,    1_Hi, I'd like to comment on -0.030419   \n",
+       "2511  0_Dear Commissioners:    1_Hi, I'd like to comment on -0.040624   \n",
+       "2512            0_Dear FCC,    1_Hi, I'd like to comment on -0.005508   \n",
+       "\n",
+       "      seg_int_a  seg_int_b  \n",
+       "2508          0          1  \n",
+       "2509          0          1  \n",
+       "2510          0          1  \n",
+       "2511          0          1  \n",
+       "2512          0          1  "
+      ]
+     },
+     "execution_count": 20,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "segment_correlations = (\n",
+    "    # Turn each permutation into a dummy variable\n",
+    "    extracted\n",
+    "    .pipe(pd.get_dummies)\n",
+    "    \n",
+    "    # Calculate the correlations between them\n",
+    "    .corr()\n",
+    "    .reset_index()\n",
+    "    .rename(columns = { \"index\": \"seg_a\" })\n",
+    "    \n",
+    "    # Melt the correlation matrix into a long/tidy DataFrame\n",
+    "    .melt(\n",
+    "        id_vars = [ \"seg_a\" ],\n",
+    "        var_name = \"seg_b\",\n",
+    "        value_name = \"corr\",\n",
+    "    )\n",
+    "    .assign(\n",
+    "        seg_int_a = lambda df: df[\"seg_a\"].str.extract(r\"^(\\d+)\", expand = False).astype(int),\n",
+    "        seg_int_b = lambda df: df[\"seg_b\"].str.extract(r\"^(\\d+)\", expand = False).astype(int),\n",
+    "    )\n",
+    "    \n",
+    "    # Take only the first correlation (A•B instead of both A•B and B•A)\n",
+    "    # and ignore self-correlations\n",
+    "    .loc[lambda df: df[\"seg_a\"] < df[\"seg_b\"]]\n",
+    "    \n",
+    "    # Ignore correlations within the same segment, since they are mutually exclusive\n",
+    "    .loc[lambda df: df[\"seg_int_a\"] != df[\"seg_int_b\"]]\n",
+    ")\n",
+    "\n",
+    "segment_correlations.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The output below demonstrates that there are only a handful of pairs with a correlation above 0.15; they are all perfect correlations, meaning that the first segment choice guarantees the second. In this case, whatever is chosen for segments `13-19` is repeated for segments `39-45`. (Segments 14, 16, etc. are all fixed segments, and don't vary at all.)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>seg_a</th>\n",
+       "      <th>seg_b</th>\n",
+       "      <th>corr</th>\n",
+       "      <th>seg_int_a</th>\n",
+       "      <th>seg_int_b</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <td>29970</td>\n",
+       "      <td>13_Barack Obama's</td>\n",
+       "      <td>39_Barack Obama's</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>13</td>\n",
+       "      <td>39</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>30180</td>\n",
+       "      <td>13_Obama's</td>\n",
+       "      <td>39_Obama's</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>13</td>\n",
+       "      <td>39</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>30390</td>\n",
+       "      <td>13_President Obama's</td>\n",
+       "      <td>39_President Obama's</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>13</td>\n",
+       "      <td>39</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>30600</td>\n",
+       "      <td>13_The Obama/Wheeler</td>\n",
+       "      <td>39_The Obama/Wheeler</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>13</td>\n",
+       "      <td>39</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>30810</td>\n",
+       "      <td>13_The previous administration's</td>\n",
+       "      <td>39_The previous administration's</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>13</td>\n",
+       "      <td>39</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>31020</td>\n",
+       "      <td>13_Tom Wheeler's</td>\n",
+       "      <td>39_Tom Wheeler's</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>13</td>\n",
+       "      <td>39</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>31230</td>\n",
+       "      <td>15_decision</td>\n",
+       "      <td>41_decision</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>15</td>\n",
+       "      <td>41</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>31440</td>\n",
+       "      <td>15_order</td>\n",
+       "      <td>41_order</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>15</td>\n",
+       "      <td>41</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>31650</td>\n",
+       "      <td>15_plan</td>\n",
+       "      <td>41_plan</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>15</td>\n",
+       "      <td>41</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>31860</td>\n",
+       "      <td>15_policy</td>\n",
+       "      <td>41_policy</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>15</td>\n",
+       "      <td>41</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>32070</td>\n",
+       "      <td>15_power grab</td>\n",
+       "      <td>41_power grab</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>15</td>\n",
+       "      <td>41</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>32280</td>\n",
+       "      <td>15_scheme</td>\n",
+       "      <td>41_scheme</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>15</td>\n",
+       "      <td>41</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>32490</td>\n",
+       "      <td>17_control</td>\n",
+       "      <td>43_control</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>17</td>\n",
+       "      <td>43</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>32700</td>\n",
+       "      <td>17_regulate</td>\n",
+       "      <td>43_regulate</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>17</td>\n",
+       "      <td>43</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>32910</td>\n",
+       "      <td>17_take over</td>\n",
+       "      <td>43_take over</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>17</td>\n",
+       "      <td>43</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>33120</td>\n",
+       "      <td>19_Internet access</td>\n",
+       "      <td>45_Internet access</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>19</td>\n",
+       "      <td>45</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>33330</td>\n",
+       "      <td>19_broadband</td>\n",
+       "      <td>45_broadband</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>19</td>\n",
+       "      <td>45</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>33540</td>\n",
+       "      <td>19_the Internet</td>\n",
+       "      <td>45_the Internet</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>19</td>\n",
+       "      <td>45</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>33750</td>\n",
+       "      <td>19_the web</td>\n",
+       "      <td>45_the web</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>19</td>\n",
+       "      <td>45</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                  seg_a                             seg_b  \\\n",
+       "29970                 13_Barack Obama's                 39_Barack Obama's   \n",
+       "30180                        13_Obama's                        39_Obama's   \n",
+       "30390              13_President Obama's              39_President Obama's   \n",
+       "30600              13_The Obama/Wheeler              39_The Obama/Wheeler   \n",
+       "30810  13_The previous administration's  39_The previous administration's   \n",
+       "31020                  13_Tom Wheeler's                  39_Tom Wheeler's   \n",
+       "31230                       15_decision                       41_decision   \n",
+       "31440                          15_order                          41_order   \n",
+       "31650                           15_plan                           41_plan   \n",
+       "31860                         15_policy                         41_policy   \n",
+       "32070                     15_power grab                     41_power grab   \n",
+       "32280                         15_scheme                         41_scheme   \n",
+       "32490                        17_control                        43_control   \n",
+       "32700                       17_regulate                       43_regulate   \n",
+       "32910                      17_take over                      43_take over   \n",
+       "33120                19_Internet access                45_Internet access   \n",
+       "33330                      19_broadband                      45_broadband   \n",
+       "33540                   19_the Internet                   45_the Internet   \n",
+       "33750                        19_the web                        45_the web   \n",
+       "\n",
+       "       corr  seg_int_a  seg_int_b  \n",
+       "29970   1.0         13         39  \n",
+       "30180   1.0         13         39  \n",
+       "30390   1.0         13         39  \n",
+       "30600   1.0         13         39  \n",
+       "30810   1.0         13         39  \n",
+       "31020   1.0         13         39  \n",
+       "31230   1.0         15         41  \n",
+       "31440   1.0         15         41  \n",
+       "31650   1.0         15         41  \n",
+       "31860   1.0         15         41  \n",
+       "32070   1.0         15         41  \n",
+       "32280   1.0         15         41  \n",
+       "32490   1.0         17         43  \n",
+       "32700   1.0         17         43  \n",
+       "32910   1.0         17         43  \n",
+       "33120   1.0         19         45  \n",
+       "33330   1.0         19         45  \n",
+       "33540   1.0         19         45  \n",
+       "33750   1.0         19         45  "
+      ]
+     },
+     "execution_count": 21,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "(\n",
+    "    segment_correlations\n",
+    "    .loc[lambda df: df[\"corr\"] > 0.15]\n",
+    "    .sort_values(\"seg_a\")\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The output below demonstrates that there are no segment pairs with a correlation below -0.15, other than the possibilities inherently excluded by the perfect correlations above."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>seg_a</th>\n",
+       "      <th>seg_b</th>\n",
+       "      <th>corr</th>\n",
+       "      <th>seg_int_a</th>\n",
+       "      <th>seg_int_b</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <td>31015</td>\n",
+       "      <td>13_Barack Obama's</td>\n",
+       "      <td>39_Tom Wheeler's</td>\n",
+       "      <td>-0.203366</td>\n",
+       "      <td>13</td>\n",
+       "      <td>39</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>30806</td>\n",
+       "      <td>13_Barack Obama's</td>\n",
+       "      <td>39_The previous administration's</td>\n",
+       "      <td>-0.191720</td>\n",
+       "      <td>13</td>\n",
+       "      <td>39</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>30179</td>\n",
+       "      <td>13_Barack Obama's</td>\n",
+       "      <td>39_Obama's</td>\n",
+       "      <td>-0.204086</td>\n",
+       "      <td>13</td>\n",
+       "      <td>39</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>30597</td>\n",
+       "      <td>13_Barack Obama's</td>\n",
+       "      <td>39_The Obama/Wheeler</td>\n",
+       "      <td>-0.204086</td>\n",
+       "      <td>13</td>\n",
+       "      <td>39</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>30388</td>\n",
+       "      <td>13_Barack Obama's</td>\n",
+       "      <td>39_President Obama's</td>\n",
+       "      <td>-0.200477</td>\n",
+       "      <td>13</td>\n",
+       "      <td>39</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>33331</td>\n",
+       "      <td>19_the Internet</td>\n",
+       "      <td>45_broadband</td>\n",
+       "      <td>-0.327781</td>\n",
+       "      <td>19</td>\n",
+       "      <td>45</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>33749</td>\n",
+       "      <td>19_the Internet</td>\n",
+       "      <td>45_the web</td>\n",
+       "      <td>-0.337228</td>\n",
+       "      <td>19</td>\n",
+       "      <td>45</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>33123</td>\n",
+       "      <td>19_the web</td>\n",
+       "      <td>45_Internet access</td>\n",
+       "      <td>-0.338160</td>\n",
+       "      <td>19</td>\n",
+       "      <td>45</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>33332</td>\n",
+       "      <td>19_the web</td>\n",
+       "      <td>45_broadband</td>\n",
+       "      <td>-0.355864</td>\n",
+       "      <td>19</td>\n",
+       "      <td>45</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>33541</td>\n",
+       "      <td>19_the web</td>\n",
+       "      <td>45_the Internet</td>\n",
+       "      <td>-0.337228</td>\n",
+       "      <td>19</td>\n",
+       "      <td>45</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>78 rows × 5 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                   seg_a                             seg_b      corr  \\\n",
+       "31015  13_Barack Obama's                  39_Tom Wheeler's -0.203366   \n",
+       "30806  13_Barack Obama's  39_The previous administration's -0.191720   \n",
+       "30179  13_Barack Obama's                        39_Obama's -0.204086   \n",
+       "30597  13_Barack Obama's              39_The Obama/Wheeler -0.204086   \n",
+       "30388  13_Barack Obama's              39_President Obama's -0.200477   \n",
+       "...                  ...                               ...       ...   \n",
+       "33331    19_the Internet                      45_broadband -0.327781   \n",
+       "33749    19_the Internet                        45_the web -0.337228   \n",
+       "33123         19_the web                45_Internet access -0.338160   \n",
+       "33332         19_the web                      45_broadband -0.355864   \n",
+       "33541         19_the web                   45_the Internet -0.337228   \n",
+       "\n",
+       "       seg_int_a  seg_int_b  \n",
+       "31015         13         39  \n",
+       "30806         13         39  \n",
+       "30179         13         39  \n",
+       "30597         13         39  \n",
+       "30388         13         39  \n",
+       "...          ...        ...  \n",
+       "33331         19         45  \n",
+       "33749         19         45  \n",
+       "33123         19         45  \n",
+       "33332         19         45  \n",
+       "33541         19         45  \n",
+       "\n",
+       "[78 rows x 5 columns]"
+      ]
+     },
+     "execution_count": 22,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "(\n",
+    "    segment_correlations\n",
+    "    .loc[lambda df: df[\"corr\"] < -0.15]\n",
+    "    .sort_values(\"seg_a\")\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>seg_a</th>\n",
+       "      <th>seg_b</th>\n",
+       "      <th>corr</th>\n",
+       "      <th>seg_int_a</th>\n",
+       "      <th>seg_int_b</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "Empty DataFrame\n",
+       "Columns: [seg_a, seg_b, corr, seg_int_a, seg_int_b]\n",
+       "Index: []"
+      ]
+     },
+     "execution_count": 23,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "(\n",
+    "    segment_correlations\n",
+    "    .loc[lambda df: df[\"corr\"] < -0.15]\n",
+    "    .loc[lambda df: ~df[\"seg_int_a\"].isin([ 13, 15, 17, 19 ])]\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Show the repeated segments"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Segments `13-19`:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[\n",
+      "  [\n",
+      "    \"The previous administration's\",\n",
+      "    \"The Obama/Wheeler\",\n",
+      "    \"President Obama's\",\n",
+      "    \"Barack Obama's\",\n",
+      "    \"Tom Wheeler's\",\n",
+      "    \"Obama's\"\n",
+      "  ],\n",
+      "  [\n",
+      "    \" \"\n",
+      "  ],\n",
+      "  [\n",
+      "    \"decision\",\n",
+      "    \"scheme\",\n",
+      "    \"policy\",\n",
+      "    \"order\",\n",
+      "    \"power grab\",\n",
+      "    \"plan\"\n",
+      "  ],\n",
+      "  [\n",
+      "    \" to \"\n",
+      "  ],\n",
+      "  [\n",
+      "    \"regulate\",\n",
+      "    \"control\",\n",
+      "    \"take over\"\n",
+      "  ],\n",
+      "  [\n",
+      "    \" \"\n",
+      "  ],\n",
+      "  [\n",
+      "    \"broadband\",\n",
+      "    \"the web\",\n",
+      "    \"Internet access\",\n",
+      "    \"the Internet\"\n",
+      "  ]\n",
+      "]\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(json.dumps(segments[13:20], indent = 2))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Segments `39-45`:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[\n",
+      "  [\n",
+      "    \"The previous administration's\",\n",
+      "    \"The Obama/Wheeler\",\n",
+      "    \"President Obama's\",\n",
+      "    \"Barack Obama's\",\n",
+      "    \"Tom Wheeler's\",\n",
+      "    \"Obama's\"\n",
+      "  ],\n",
+      "  [\n",
+      "    \" \"\n",
+      "  ],\n",
+      "  [\n",
+      "    \"decision\",\n",
+      "    \"scheme\",\n",
+      "    \"policy\",\n",
+      "    \"order\",\n",
+      "    \"power grab\",\n",
+      "    \"plan\"\n",
+      "  ],\n",
+      "  [\n",
+      "    \" to \"\n",
+      "  ],\n",
+      "  [\n",
+      "    \"regulate\",\n",
+      "    \"control\",\n",
+      "    \"take over\"\n",
+      "  ],\n",
+      "  [\n",
+      "    \" \"\n",
+      "  ],\n",
+      "  [\n",
+      "    \"broadband\",\n",
+      "    \"the web\",\n",
+      "    \"Internet access\",\n",
+      "    \"the Internet\"\n",
+      "  ]\n",
+      "]\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(json.dumps(segments[39:46], indent = 2))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Calculate possible permutations\n",
+    "\n",
+    "Below, we calculate the total number of possible permutations, taking care to exclude the perfectly correlated segments (which we do by simply removing them from the calculation)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def calculate_permutations(segments):\n",
+    "    count = reduce(lambda x, y: x * y, map(len, segments))\n",
+    "    print(f\"Total permutations: {count:,d}\")\n",
+    "    \n",
+    "    log = math.log10(count)\n",
+    "    print(f\"Log10: {log:.2f}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def remove_segments(segments, indices):\n",
+    "    return [ s for i, s in enumerate(segments) if i not in indices ]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Total permutations: 9,584,250,725,597,184,000,000\n",
+      "Log10: 21.98\n"
+     ]
+    }
+   ],
+   "source": [
+    "calculate_permutations(remove_segments(segments, [ 39, 41, 43, 45 ]))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "---\n",
+    "\n",
+    "---\n",
+    "\n",
+    "---"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}