diff --git a/.github/workflows/tox-pytest.yml b/.github/workflows/tox-pytest.yml
index 730fd08..ac346f4 100644
--- a/.github/workflows/tox-pytest.yml
+++ b/.github/workflows/tox-pytest.yml
@@ -11,7 +11,7 @@ jobs:
id-token: write
strategy:
matrix:
- python-version: ["3.10", "3.11"]
+ python-version: ["3.12"]
fail-fast: false
defaults:
run:
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 2aaf16a..7516290 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -35,7 +35,6 @@ repos:
rev: 24.10.0
hooks:
- id: black
- language_version: python3.11
- repo: https://github.com/pre-commit/mirrors-prettier
rev: v4.0.0-alpha.8
diff --git a/environment.yml b/environment.yml
index 4a985d5..89eadc8 100644
--- a/environment.yml
+++ b/environment.yml
@@ -5,7 +5,7 @@ channels:
dependencies:
# Packages required for setting up the environment
- pip>=21,<24
- - python>=3.10,<3.12
+ - python>=3.10,<3.13 # exclusive upper bound: "<=3.12" would reject 3.12.1 and later patch releases
- setuptools>=66,<69
# Packages specified in setup.py that need or benefit from binary conda packages
@@ -19,11 +19,14 @@ dependencies:
# Jupyter packages:
- jupyterlab>=3.2,<4
- - nbconvert>=6,<7 # Used to clear notebook outputs in pre-commit hooks
+ - nbconvert>=7 # Used to clear notebook outputs in pre-commit hooks
# These are not normal Python packages available on PyPI
- nodejs # Useful for Jupyter and prettier pre-commit hook
+ - dask>=2024
+ - gdal
+
# Use pip to install the package defined by this repo for development:
- pip:
- --editable ./[dev,docs,tests,types]
diff --git a/notebooks/16-kl-splink-ex21-filer-link.ipynb b/notebooks/16-kl-splink-ex21-filer-link.ipynb
new file mode 100644
index 0000000..efef952
--- /dev/null
+++ b/notebooks/16-kl-splink-ex21-filer-link.ipynb
@@ -0,0 +1,5678 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "c535d97b-5dfa-4298-87f5-55c56c4c82ed",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "%load_ext autoreload\n",
+ "%autoreload 3"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 260,
+ "id": "e1222c94-36cd-4bae-95fb-089e5411e490",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[autoreload of mozilla_sec_eia.models.sec10k.utils.cloud failed: Traceback (most recent call last):\n",
+ " File \"/Users/katielamb/mambaforge/envs/mozilla_sec_eia/lib/python3.12/site-packages/IPython/extensions/autoreload.py\", line 274, in check\n",
+ " superreload(m, reload, self.old_objects, self.shell)\n",
+ " File \"/Users/katielamb/mambaforge/envs/mozilla_sec_eia/lib/python3.12/site-packages/IPython/extensions/autoreload.py\", line 500, in superreload\n",
+ " update_generic(old_obj, new_obj)\n",
+ " File \"/Users/katielamb/mambaforge/envs/mozilla_sec_eia/lib/python3.12/site-packages/IPython/extensions/autoreload.py\", line 397, in update_generic\n",
+ " update(a, b)\n",
+ " File \"/Users/katielamb/mambaforge/envs/mozilla_sec_eia/lib/python3.12/site-packages/IPython/extensions/autoreload.py\", line 335, in update_class\n",
+ " if (old_obj == new_obj) is True:\n",
+ " ^^^^^^^^^^^^^^^^^^\n",
+ " File \"/Users/katielamb/mambaforge/envs/mozilla_sec_eia/lib/python3.12/_collections_abc.py\", line 834, in __eq__\n",
+ " return dict(self.items()) == dict(other.items())\n",
+ " ^^^^^^^^^^^^^^^^^^\n",
+ " File \"/Users/katielamb/mambaforge/envs/mozilla_sec_eia/lib/python3.12/_collections_abc.py\", line 893, in __iter__\n",
+ " for key in self._mapping:\n",
+ " File \"/Users/katielamb/mambaforge/envs/mozilla_sec_eia/lib/python3.12/site-packages/pydantic/_internal/_mock_val_ser.py\", line 46, in __iter__\n",
+ " return self._get_built().__iter__()\n",
+ " ^^^^^^^^^^^^^^^^^\n",
+ " File \"/Users/katielamb/mambaforge/envs/mozilla_sec_eia/lib/python3.12/site-packages/pydantic/_internal/_mock_val_ser.py\", line 57, in _get_built\n",
+ " raise PydanticUserError(self._error_message, code=self._code)\n",
+ "pydantic.errors.PydanticUserError: Pydantic models should inherit from BaseModel, BaseModel cannot be instantiated directly\n",
+ "\n",
+ "For further information visit https://errors.pydantic.dev/2.9/u/base-model-instantiated\n",
+ "]\n"
+ ]
+ }
+ ],
+ "source": [
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "from upath import UPath\n",
+ "\n",
+ "from mozilla_sec_eia.models.sec10k.utils.cloud import GCSArchive, convert_ex21_id_to_filename\n",
+ "from mozilla_sec_eia.models.sec_eia_record_linkage.preprocessing import prepare_sec10k_basic_info_df, prepare_ex21_df, add_sec_company_id_to_subsidiaries"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "16cd6122-4cb9-42aa-8be1-84c997a34e96",
+ "metadata": {},
+ "source": [
+ "# Read in Inputs"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "c29d0b75-759f-445c-adac-b2a6baf1fd0e",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "# for now try just training on 2023\n",
+ "raw_sec_df = pd.concat([pd.read_parquet(\"gs://sec10k-outputs/v2/basic_10k_company_info/2023q1.parquet\"),\n",
+ " pd.read_parquet(\"gs://sec10k-outputs/v2/basic_10k_company_info/2023q2.parquet\"),\n",
+ " pd.read_parquet(\"gs://sec10k-outputs/v2/basic_10k_company_info/2023q3.parquet\"),\n",
+ " pd.read_parquet(\"gs://sec10k-outputs/v2/basic_10k_company_info/2023q4.parquet\"),\n",
+ " ]\n",
+ " )"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "dbf3b15c-3a5a-4b74-a929-71aec18750a1",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "raw_sec_df = raw_sec_df.reset_index().pivot_table(values=\"value\", index=\"filename\", columns=\"key\", aggfunc=\"first\")\n",
+ "raw_sec_df.columns.name = None"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "a8ec4fad-c92f-4cfc-a3d2-409a72a2df1e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "ex21_path = UPath(\"gs://sec10k-outputs/v2/ex21_company_ownership_info\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "8e7a642d-7718-4101-b851-f1f4ee07180e",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "raw_ex21_df = pd.DataFrame()\n",
+ "for file in ex21_path.iterdir():\n",
+ " if file.name.split(\".\")[-1] == \"parquet\":\n",
+ " report_year = file.name[:4]\n",
+ " # for now just train with 2023\n",
+ " if report_year != \"2023\":\n",
+ " continue\n",
+ " year_quarter_df = pd.read_parquet(ex21_path / file.name)\n",
+ " year_quarter_df.loc[:, \"report_year\"] = report_year\n",
+ " year_quarter_df.loc[:, \"report_year\"] = pd.to_datetime(year_quarter_df[\"report_year\"], format=\"%Y\").dt.year\n",
+ " raw_ex21_df = pd.concat([raw_ex21_df, year_quarter_df])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f3d5db08-3c42-4715-9f0d-4d02674b828a",
+ "metadata": {},
+ "source": [
+ "# Preprocessing"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "39706c77-90db-4f49-8011-47a9777a88b6",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sec_df = prepare_sec10k_basic_info_df(raw_sec_df)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 157,
+ "id": "98d4f59e-d61f-4a24-84bc-6caa0d761e07",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/Users/katielamb/CatalystCoop/mozilla-sec-eia/src/mozilla_sec_eia/models/sec_eia_record_linkage/preprocessing.py:233: FutureWarning: Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n",
+ " )\n"
+ ]
+ }
+ ],
+ "source": [
+ "ex21_df = prepare_ex21_df(raw_ex21_df)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 69,
+ "id": "34a86ec8-5b6c-4147-8f94-021fa271174c",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "True"
+ ]
+ },
+ "execution_count": 69,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "ex21_df.record_id.is_unique"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 70,
+ "id": "505b0c45-1748-4517-8cac-d2acf2fa9037",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "True"
+ ]
+ },
+ "execution_count": 70,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "sec_df.record_id.is_unique"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "11caf325-8530-430d-a3d2-a54043447021",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "# sec_df has filename as unique ID\n",
+ "sec_df.filename.is_unique"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "ceed053b-f6ae-4aad-8b12-b2083ba8e236",
+ "metadata": {},
+ "source": [
+ "Note: documents with a paragraph layout are not removed here, but perhaps they should be."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "6de284e1-2b76-418d-ac5e-9a84bd275c51",
+ "metadata": {},
+ "source": [
+ "# Try to just match on cleaned name and location"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 170,
+ "id": "2c9a384d-a9e1-4e4a-829f-e92f1a007c90",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sec_match_df = sec_df.drop_duplicates(subset=[\"central_index_key\", \"company_name\", \"loc_of_incorporation\"])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 179,
+ "id": "4bab406d-b1e0-495b-beee-90ae6b0c036b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "merged_df = sec_match_df.merge(ex21_df, how=\"inner\", on=\"company_name\", suffixes=(\"_sec\", \"_ex21\"))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 185,
+ "id": "b8732fda-9f0a-412c-b7ba-8f307ee7b213",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0 florida\n",
+ "1 delaware\n",
+ "2 missouri\n",
+ "3 delaware\n",
+ "4 NaN\n",
+ " ... \n",
+ "515 delaware\n",
+ "516 delaware\n",
+ "517 delaware\n",
+ "518 delaware\n",
+ "519 delaware\n",
+ "Name: loc_of_incorporation_sec, Length: 520, dtype: object"
+ ]
+ },
+ "execution_count": 185,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "merged_df[\"loc_of_incorporation_sec\"]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 209,
+ "id": "3427d77c-3c3f-4a05-99db-7f96d3f0f193",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "merged_df.loc[:, \"loc_tokens_sec\"] = merged_df[\"loc_of_incorporation_sec\"].fillna(\"\").str.lower().str.split()\n",
+ "merged_df.loc[:, \"loc_tokens_ex21\"] = merged_df[\"loc_of_incorporation_ex21\"].fillna(\"\").str.lower().str.split()\n",
+ "merged_df[\"loc_overlap\"] = merged_df.apply(\n",
+ " lambda row: len(set(row[\"loc_tokens_sec\"]) & set(row[\"loc_tokens_ex21\"])), axis=1\n",
+ ")\n",
+ "\n",
+ "# Select the row with the highest word overlap for each CIK and company name\n",
+ "closest_match = merged_df.loc[merged_df.groupby([\"central_index_key\", \"company_name\"])['loc_overlap'].idxmax()].reset_index(drop=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 210,
+ "id": "92cc6570-f34c-4782-9bbf-0cdeaf2ce044",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "False 480\n",
+ "Name: count, dtype: int64"
+ ]
+ },
+ "execution_count": 210,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# there should be 0 duplicates, i.e. no True entries in the counts below\n",
+ "closest_match.duplicated(subset=[\"company_name\", \"loc_of_incorporation_ex21\"]).value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 200,
+ "id": "d0c650d0-303d-43a4-9ae3-35c4fb6d481b",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "central_index_key\n",
+ "False 480\n",
+ "Name: count, dtype: int64"
+ ]
+ },
+ "execution_count": 200,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# it's okay if there's duplication here, but not ideal\n",
+ "# multiple subsidiaries can point to the same CIK\n",
+ "closest_match.central_index_key.duplicated().value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 201,
+ "id": "2b3a2c1f-7df4-4515-8727-a339303ebd4e",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " record_id_sec | \n",
+ " filename | \n",
+ " phone_number | \n",
+ " central_index_key | \n",
+ " city | \n",
+ " company_name_raw_sec | \n",
+ " date_of_name_change | \n",
+ " film_number | \n",
+ " fiscal_year_end | \n",
+ " form_type | \n",
+ " former_conformed_name | \n",
+ " irs_number | \n",
+ " organization_name | \n",
+ " sec_act | \n",
+ " sec_file_number | \n",
+ " standard_industrial_classification | \n",
+ " state | \n",
+ " state_of_incorporation | \n",
+ " street_address | \n",
+ " street_address_2 | \n",
+ " zip_code | \n",
+ " report_date | \n",
+ " report_year_sec | \n",
+ " loc_of_incorporation_sec | \n",
+ " company_name | \n",
+ " company_name_no_legal_sec | \n",
+ " company_name_mphone_sec | \n",
+ " record_id_ex21 | \n",
+ " id | \n",
+ " company_name_raw_ex21 | \n",
+ " loc_of_incorporation_ex21 | \n",
+ " own_per | \n",
+ " report_year_ex21 | \n",
+ " company_name_no_legal_ex21 | \n",
+ " company_name_mphone_ex21 | \n",
+ " loc_tokens_sec | \n",
+ " loc_tokens_ex21 | \n",
+ " loc_overlap | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 7990 | \n",
+ " edgar/data/910638/0000910638-23-000009.txt | \n",
+ " 8033263900 | \n",
+ " 0000910638 | \n",
+ " rock hill | \n",
+ " 3d systems corp | \n",
+ " 19930816 | \n",
+ " 23738595 | \n",
+ " 1231 | \n",
+ " 10-k | \n",
+ " 3 d systems corp | \n",
+ " 954431352 | \n",
+ " NaN | \n",
+ " 1934 act | \n",
+ " 001-34220 | \n",
+ " services-prepackaged software [7372] | \n",
+ " sc | \n",
+ " de | \n",
+ " 333 three d systems circle | \n",
+ " NaN | \n",
+ " 29730 | \n",
+ " 2023-03-16 | \n",
+ " 2023 | \n",
+ " delaware | \n",
+ " 3d systems corporation | \n",
+ " 3d systems | \n",
+ " T SSTMS | \n",
+ " 150739 | \n",
+ " 910638-0000910638-23-000009 | \n",
+ " 3d systems corporation | \n",
+ " delaware | \n",
+ " NaN | \n",
+ " 2023 | \n",
+ " 3d systems | \n",
+ " T SSTMS | \n",
+ " [delaware] | \n",
+ " [delaware] | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 7526 | \n",
+ " edgar/data/824142/0000824142-23-000019.txt | \n",
+ " 9185832266 | \n",
+ " 0000824142 | \n",
+ " tulsa | \n",
+ " aaon, inc. | \n",
+ " 19920703 | \n",
+ " 23675207 | \n",
+ " 1231 | \n",
+ " 10-k | \n",
+ " aaon inc | \n",
+ " 870448736 | \n",
+ " NaN | \n",
+ " 1934 act | \n",
+ " 000-18953 | \n",
+ " air cond & warm air heating equip & comm & ind... | \n",
+ " ok | \n",
+ " nv | \n",
+ " 2425 south yukon ave. | \n",
+ " NaN | \n",
+ " 74107 | \n",
+ " 2023-02-27 | \n",
+ " 2023 | \n",
+ " nevada | \n",
+ " aaon incorporated | \n",
+ " aaon | \n",
+ " N | \n",
+ " 142821 | \n",
+ " 824142-0000824142-23-000019 | \n",
+ " aaon, inc | \n",
+ " oklahoma | \n",
+ " NaN | \n",
+ " 2023 | \n",
+ " aaon | \n",
+ " N | \n",
+ " [nevada] | \n",
+ " [oklahoma] | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " record_id_sec filename phone_number central_index_key city company_name_raw_sec date_of_name_change film_number fiscal_year_end form_type former_conformed_name irs_number organization_name sec_act sec_file_number standard_industrial_classification state state_of_incorporation street_address street_address_2 zip_code report_date report_year_sec loc_of_incorporation_sec company_name company_name_no_legal_sec company_name_mphone_sec record_id_ex21 id company_name_raw_ex21 loc_of_incorporation_ex21 own_per report_year_ex21 company_name_no_legal_ex21 company_name_mphone_ex21 loc_tokens_sec loc_tokens_ex21 loc_overlap\n",
+ "0 7990 edgar/data/910638/0000910638-23-000009.txt 8033263900 0000910638 rock hill 3d systems corp 19930816 23738595 1231 10-k 3 d systems corp 954431352 NaN 1934 act 001-34220 services-prepackaged software [7372] sc de 333 three d systems circle NaN 29730 2023-03-16 2023 delaware 3d systems corporation 3d systems T SSTMS 150739 910638-0000910638-23-000009 3d systems corporation delaware NaN 2023 3d systems T SSTMS [delaware] [delaware] 1\n",
+ "1 7526 edgar/data/824142/0000824142-23-000019.txt 9185832266 0000824142 tulsa aaon, inc. 19920703 23675207 1231 10-k aaon inc 870448736 NaN 1934 act 000-18953 air cond & warm air heating equip & comm & ind... ok nv 2425 south yukon ave. NaN 74107 2023-02-27 2023 nevada aaon incorporated aaon N 142821 824142-0000824142-23-000019 aaon, inc oklahoma NaN 2023 aaon N [nevada] [oklahoma] 0"
+ ]
+ },
+ "execution_count": 201,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "closest_match.head(2)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 241,
+ "id": "78dfc42c-3921-444e-8342-d34fc2fd1a7a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "ex21_with_cik = ex21_df.merge(\n",
+ " closest_match[[\"company_name\", \"central_index_key\", \"loc_of_incorporation_ex21\"]].rename(columns={\"loc_of_incorporation_ex21\": \"loc_of_incorporation\"}),\n",
+ " how=\"left\",\n",
+ " on=[\"company_name\", \"loc_of_incorporation\"],\n",
+ ").rename(columns={\"central_index_key\": \"subsidiary_cik\"})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 242,
+ "id": "1f4bca08-3a65-484d-ac6b-cb7d4584b4e7",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "ex21_with_cik = ex21_with_cik.merge(closest_match[[\"company_name\", \"central_index_key\"]],\n",
+ " how=\"left\",\n",
+ " on=\"company_name\"\n",
+ " ).rename(columns={\"central_index_key\": \"company_name_merge_cik\"})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 243,
+ "id": "5462d9bb-23dd-45fb-b5bf-35396caba399",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "subsidiary_cik\n",
+ "True 191387\n",
+ "False 480\n",
+ "Name: count, dtype: int64"
+ ]
+ },
+ "execution_count": 243,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "ex21_with_cik.subsidiary_cik.isnull().value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 244,
+ "id": "a38c45ad-56f3-49ad-bd62-fb91c4d89940",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# if a subsidiary doesn't have a CIK and has a null location\n",
+ "# but its name was assigned a CIK (with a different location)\n",
+ "# then assign that CIK to the subsidiary\n",
+ "ex21_with_cik[\"subsidiary_cik\"] = ex21_with_cik[\"subsidiary_cik\"].where(\n",
+ " ~(ex21_with_cik.subsidiary_cik.isnull()) | ~(ex21_with_cik.loc_of_incorporation.isnull()), \n",
+ " ex21_with_cik[\"company_name_merge_cik\"]\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 245,
+ "id": "4cca9da1-8371-4b45-b88d-8c2911209707",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "subsidiary_cik\n",
+ "True 191386\n",
+ "False 481\n",
+ "Name: count, dtype: int64"
+ ]
+ },
+ "execution_count": 245,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "ex21_with_cik.subsidiary_cik.isnull().value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 252,
+ "id": "e5b57a88-ffaa-4834-bea4-c5b4779bd551",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "archive = GCSArchive()\n",
+ "md = archive.get_metadata()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 261,
+ "id": "a33be6e3-056f-4e4a-acd4-9a6dc6f98c90",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "ex21_with_cik.loc[:, \"filename\"] = convert_ex21_id_to_filename(ex21_with_cik)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 263,
+ "id": "d0dec8af-d730-4a06-af5e-f390fa228ac8",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "ex21_with_cik = ex21_with_cik.merge(md[\"cik\"], how=\"left\", left_on=\"filename\", right_index=True).rename(columns={\"cik\": \"parent_cik\"})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 264,
+ "id": "228a1d4b-bc19-49eb-b557-4f26d1febbd9",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "ex21_with_cik = add_sec_company_id_to_subsidiaries(ex21_with_cik)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 265,
+ "id": "c1b88c44-81d7-4d9d-a2a3-be1b030348bd",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " record_id | \n",
+ " id | \n",
+ " company_name_raw | \n",
+ " loc_of_incorporation | \n",
+ " own_per | \n",
+ " report_year | \n",
+ " company_name | \n",
+ " company_name_no_legal | \n",
+ " company_name_mphone | \n",
+ " subsidiary_cik | \n",
+ " company_name_merge_cik | \n",
+ " filename | \n",
+ " parent_cik | \n",
+ " sec_company_id | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 164482 | \n",
+ " 164482 | \n",
+ " 1000045-0000950170-23-030037 | \n",
+ " nicholas data services, inc | \n",
+ " florida | \n",
+ " 100.0 | \n",
+ " 2023 | \n",
+ " nicholas data services incorporated | \n",
+ " nicholas data services | \n",
+ " NXLS TT SRFSS | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " edgar/data/1000045/0000950170-23-030037.txt | \n",
+ " 1000045 | \n",
+ " 1000045_1 | \n",
+ "
\n",
+ " \n",
+ " 164481 | \n",
+ " 164481 | \n",
+ " 1000045-0000950170-23-030037 | \n",
+ " nicholas financial, inc | \n",
+ " florida | \n",
+ " 100.0 | \n",
+ " 2023 | \n",
+ " nicholas financial incorporated | \n",
+ " nicholas financial | \n",
+ " NXLS FNNXL | \n",
+ " 0001000045 | \n",
+ " 0001000045 | \n",
+ " edgar/data/1000045/0000950170-23-030037.txt | \n",
+ " 1000045 | \n",
+ " 0001000045 | \n",
+ "
\n",
+ " \n",
+ " 89 | \n",
+ " 89 | \n",
+ " 1000209-0000950170-23-007273 | \n",
+ " medallion bank | \n",
+ " utah | \n",
+ " NaN | \n",
+ " 2023 | \n",
+ " medallion bank | \n",
+ " medallion bank | \n",
+ " MTLN BNK | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " edgar/data/1000209/0000950170-23-007273.txt | \n",
+ " 1000209 | \n",
+ " 1000209_1 | \n",
+ "
\n",
+ " \n",
+ " 88 | \n",
+ " 88 | \n",
+ " 1000209-0000950170-23-007273 | \n",
+ " freshstart venture capital corp | \n",
+ " new york | \n",
+ " NaN | \n",
+ " 2023 | \n",
+ " freshstart venture capital corporation | \n",
+ " freshstart venture capital | \n",
+ " FRXSTRT FNTR KPTL | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " edgar/data/1000209/0000950170-23-007273.txt | \n",
+ " 1000209 | \n",
+ " 1000209_2 | \n",
+ "
\n",
+ " \n",
+ " 87 | \n",
+ " 87 | \n",
+ " 1000209-0000950170-23-007273 | \n",
+ " medallion capital, inc | \n",
+ " minnesota | \n",
+ " NaN | \n",
+ " 2023 | \n",
+ " medallion capital incorporated | \n",
+ " medallion capital | \n",
+ " MTLN KPTL | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " edgar/data/1000209/0000950170-23-007273.txt | \n",
+ " 1000209 | \n",
+ " 1000209_3 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 161957 | \n",
+ " 161957 | \n",
+ " 9984-0000009984-23-000060 | \n",
+ " barnes molding solutions korea limited | \n",
+ " korea | \n",
+ " NaN | \n",
+ " 2023 | \n",
+ " barnes molding solutions korea limited | \n",
+ " barnes molding solutions korea | \n",
+ " BRNS MLTNK SLXNS KR | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " edgar/data/9984/0000009984-23-000060.txt | \n",
+ " 9984 | \n",
+ " 9984_99 | \n",
+ "
\n",
+ " \n",
+ " 161956 | \n",
+ " 161956 | \n",
+ " 9984-0000009984-23-000060 | \n",
+ " barnes molding solutions (jiangsu) co., ltd | \n",
+ " china | \n",
+ " NaN | \n",
+ " 2023 | \n",
+ " barnes molding solutions company limited | \n",
+ " barnes molding solutions | \n",
+ " BRNS MLTNK SLXNS | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " edgar/data/9984/0000009984-23-000060.txt | \n",
+ " 9984 | \n",
+ " 9984_100 | \n",
+ "
\n",
+ " \n",
+ " 161955 | \n",
+ " 161955 | \n",
+ " 9984-0000009984-23-000060 | \n",
+ " barnes korea ltd | \n",
+ " korea | \n",
+ " NaN | \n",
+ " 2023 | \n",
+ " barnes korea limited | \n",
+ " barnes korea | \n",
+ " BRNS KR | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " edgar/data/9984/0000009984-23-000060.txt | \n",
+ " 9984 | \n",
+ " 9984_101 | \n",
+ "
\n",
+ " \n",
+ " 161965 | \n",
+ " 161965 | \n",
+ " 9984-0000009984-23-000060 | \n",
+ " gimatic automation india pvt ltd | \n",
+ " india | \n",
+ " NaN | \n",
+ " 2023 | \n",
+ " gimatic automation india pvt limited | \n",
+ " gimatic automation india pvt | \n",
+ " JMTK ATMXN INT PFT | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " edgar/data/9984/0000009984-23-000060.txt | \n",
+ " 9984 | \n",
+ " 9984_102 | \n",
+ "
\n",
+ " \n",
+ " 162018 | \n",
+ " 162018 | \n",
+ " 9984-0000009984-23-000060 | \n",
+ " synventive molding solutions ltda | \n",
+ " brazil | \n",
+ " NaN | \n",
+ " 2023 | \n",
+ " synventive molding solutions ltda | \n",
+ " synventive molding solutions ltda | \n",
+ " SNFNTF MLTNK SLXNS LTT | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " edgar/data/9984/0000009984-23-000060.txt | \n",
+ " 9984 | \n",
+ " 9984_103 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
191867 rows × 14 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " record_id id company_name_raw loc_of_incorporation own_per report_year company_name company_name_no_legal company_name_mphone subsidiary_cik company_name_merge_cik filename parent_cik sec_company_id\n",
+ "164482 164482 1000045-0000950170-23-030037 nicholas data services, inc florida 100.0 2023 nicholas data services incorporated nicholas data services NXLS TT SRFSS NaN NaN edgar/data/1000045/0000950170-23-030037.txt 1000045 1000045_1\n",
+ "164481 164481 1000045-0000950170-23-030037 nicholas financial, inc florida 100.0 2023 nicholas financial incorporated nicholas financial NXLS FNNXL 0001000045 0001000045 edgar/data/1000045/0000950170-23-030037.txt 1000045 0001000045\n",
+ "89 89 1000209-0000950170-23-007273 medallion bank utah NaN 2023 medallion bank medallion bank MTLN BNK NaN NaN edgar/data/1000209/0000950170-23-007273.txt 1000209 1000209_1\n",
+ "88 88 1000209-0000950170-23-007273 freshstart venture capital corp new york NaN 2023 freshstart venture capital corporation freshstart venture capital FRXSTRT FNTR KPTL NaN NaN edgar/data/1000209/0000950170-23-007273.txt 1000209 1000209_2\n",
+ "87 87 1000209-0000950170-23-007273 medallion capital, inc minnesota NaN 2023 medallion capital incorporated medallion capital MTLN KPTL NaN NaN edgar/data/1000209/0000950170-23-007273.txt 1000209 1000209_3\n",
+ "... ... ... ... ... ... ... ... ... ... ... ... ... ... ...\n",
+ "161957 161957 9984-0000009984-23-000060 barnes molding solutions korea limited korea NaN 2023 barnes molding solutions korea limited barnes molding solutions korea BRNS MLTNK SLXNS KR NaN NaN edgar/data/9984/0000009984-23-000060.txt 9984 9984_99\n",
+ "161956 161956 9984-0000009984-23-000060 barnes molding solutions (jiangsu) co., ltd china NaN 2023 barnes molding solutions company limited barnes molding solutions BRNS MLTNK SLXNS NaN NaN edgar/data/9984/0000009984-23-000060.txt 9984 9984_100\n",
+ "161955 161955 9984-0000009984-23-000060 barnes korea ltd korea NaN 2023 barnes korea limited barnes korea BRNS KR NaN NaN edgar/data/9984/0000009984-23-000060.txt 9984 9984_101\n",
+ "161965 161965 9984-0000009984-23-000060 gimatic automation india pvt ltd india NaN 2023 gimatic automation india pvt limited gimatic automation india pvt JMTK ATMXN INT PFT NaN NaN edgar/data/9984/0000009984-23-000060.txt 9984 9984_102\n",
+ "162018 162018 9984-0000009984-23-000060 synventive molding solutions ltda brazil NaN 2023 synventive molding solutions ltda synventive molding solutions ltda SNFNTF MLTNK SLXNS LTT NaN NaN edgar/data/9984/0000009984-23-000060.txt 9984 9984_103\n",
+ "\n",
+ "[191867 rows x 14 columns]"
+ ]
+ },
+ "execution_count": 265,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "ex21_with_cik"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 266,
+ "id": "192d3cac-b156-4e5c-8148-0cbdc3e8900d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "ex21_with_cik.to_parquet(\"ex21_2023.parquet\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "1bb694c9-cfbd-4e2f-b69c-9996a588d2d2",
+ "metadata": {
+ "tags": []
+ },
+ "source": [
+ "# Match Ex. 21 Subsidiaries to a SEC filer"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "01d3a5e1-ad17-4266-b2ef-358f246749db",
+ "metadata": {
+ "tags": []
+ },
+ "source": [
+ "## Preprocessing"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 85,
+ "id": "4df63893-8a18-4b00-9b16-d036108bd567",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " state | \n",
+ " state_of_incorporation | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 1 | \n",
+ " ny | \n",
+ " de | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " ny | \n",
+ " de | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " ca | \n",
+ " md | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " ga | \n",
+ " de | \n",
+ "
\n",
+ " \n",
+ " 7 | \n",
+ " nj | \n",
+ " de | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 8265 | \n",
+ " ny | \n",
+ " de | \n",
+ "
\n",
+ " \n",
+ " 8266 | \n",
+ " tx | \n",
+ " de | \n",
+ "
\n",
+ " \n",
+ " 8267 | \n",
+ " ny | \n",
+ " oh | \n",
+ "
\n",
+ " \n",
+ " 8268 | \n",
+ " tx | \n",
+ " de | \n",
+ "
\n",
+ " \n",
+ " 8269 | \n",
+ " ct | \n",
+ " de | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5051 rows × 2 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " state state_of_incorporation\n",
+ "1 ny de\n",
+ "2 ny de\n",
+ "5 ca md\n",
+ "6 ga de\n",
+ "7 nj de\n",
+ "... ... ...\n",
+ "8265 ny de\n",
+ "8266 tx de\n",
+ "8267 ny oh\n",
+ "8268 tx de\n",
+ "8269 ct de\n",
+ "\n",
+ "[5051 rows x 2 columns]"
+ ]
+ },
+ "execution_count": 85,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "sec_df[(sec_df[\"state\"] != sec_df[\"state_of_incorporation\"]) & (~sec_df[\"state_of_incorporation\"].isnull())][[\"state\", \"state_of_incorporation\"]]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "id": "24890018-8efb-445f-ad91-ca316edccbe8",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sec_match_df = sec_df.copy()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "id": "83f859df-1764-4e97-addc-0064bdcb31b7",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "loc_of_incorporation\n",
+ "False 6382\n",
+ "True 749\n",
+ "Name: count, dtype: int64"
+ ]
+ },
+ "execution_count": 25,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "sec_match_df[\"loc_of_incorporation\"].isnull().value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "id": "e9d0828f-0ad8-41ea-a449-ddd274a888d0",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "ex21_match_df = ex21_df.copy()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "ef3f01c7-c21e-4755-ac99-4ea01f359c43",
+ "metadata": {},
+ "source": [
+ "Remove clearly \"invalid\" strings and fill nulls"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 71,
+ "id": "4ca07927-185d-4bc6-978a-e8788a8f77b3",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "company_name\n",
+ "rush truck center 120\n",
+ "encompass health rehabilitation hospital 79\n",
+ "rush peterbilt truck center 57\n",
+ "branch 52\n",
+ "sci funeral services llc iowa limited liability company 33\n",
+ "partnership limited partnership 32\n",
+ "alderwoods group llc de limited liability company 27\n",
+ "encompass health rehabilitation hospital of 26\n",
+ "u haul co of 26\n",
+ "at and t 25\n",
+ "corporation 21\n",
+ "amh portfolio management 20\n",
+ "rush bus center 20\n",
+ "limited partnership limited partnership 18\n",
+ "therapy limited partnership 15\n",
+ "rush isuzu trucks 15\n",
+ "colgate palmolive limited 14\n",
+ "johnson and johnson limited 11\n",
+ "ecolab limited 11\n",
+ "rush truck centres 11\n",
+ "Name: count, dtype: int64"
+ ]
+ },
+ "execution_count": 71,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "ex21_match_df.company_name.value_counts().head(20)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 72,
+ "id": "8a4839e5-a2e5-4098-826a-4d340cdde638",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "ex21_match_df = ex21_match_df[[\"record_id\", \"report_year\", \"company_name\", \"loc_of_incorporation\", \"company_name_mphone\"]]\n",
+ "sec_match_df = sec_match_df[[\"record_id\", \"report_year\", \"company_name\", \"loc_of_incorporation\", \"company_name_mphone\"]]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 73,
+ "id": "baab7dfc-4efb-4c08-b090-32dd47025e15",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/var/folders/c0/5zrbrqhx17d5jm6t03bw2nkw0000gn/T/ipykernel_26291/3959766958.py:2: SettingWithCopyWarning: \n",
+ "A value is trying to be set on a copy of a slice from a DataFrame.\n",
+ "Try using .loc[row_indexer,col_indexer] = value instead\n",
+ "\n",
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+ " sec_match_df.loc[:, \"company_name_mphone_list\"] = sec_match_df[\"company_name_mphone\"].str.split(\" \")\n",
+ "/var/folders/c0/5zrbrqhx17d5jm6t03bw2nkw0000gn/T/ipykernel_26291/3959766958.py:3: SettingWithCopyWarning: \n",
+ "A value is trying to be set on a copy of a slice from a DataFrame.\n",
+ "Try using .loc[row_indexer,col_indexer] = value instead\n",
+ "\n",
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+ " ex21_match_df.loc[:, \"company_name_mphone_list\"] = ex21_match_df[\"company_name_mphone\"].str.split(\" \")\n"
+ ]
+ }
+ ],
+ "source": [
+ "# TEMP: split the space-separated metaphone names into token lists\n",
+ "sec_match_df.loc[:, \"company_name_mphone_list\"] = sec_match_df[\"company_name_mphone\"].str.split(\" \")\n",
+ "ex21_match_df.loc[:, \"company_name_mphone_list\"] = ex21_match_df[\"company_name_mphone\"].str.split(\" \")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 74,
+ "id": "a1a6634e-e554-4a94-8a57-c2755048db22",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sec_match_df.loc[:, \"loc_list\"] = sec_match_df[\"loc_of_incorporation\"].str.replace(\",\", '').str.split(\" \")\n",
+ "ex21_match_df.loc[:, \"loc_list\"] = ex21_match_df[\"loc_of_incorporation\"].str.replace(\",\", '').str.split(\" \")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "c294372b-159c-4c90-a031-61c34532b965",
+ "metadata": {},
+ "source": [
+ "## Exploratory Analysis"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 75,
+ "id": "c9dbc620-ed49-4a8e-9d02-6b6f2e0a14cf",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "from splink.exploratory import completeness_chart, profile_columns\n",
+ "from splink import DuckDBAPI\n",
+ "\n",
+ "db_api = DuckDBAPI()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 76,
+ "id": "422ca098-e4e7-4284-8b04-74e976e36023",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "match_cols = [\"report_year\", \"company_name\", \"loc_of_incorporation\"]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 77,
+ "id": "232b5718-c1ed-4e63-8384-b4acf33210d3",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
+ ],
+ "text/plain": [
+ "alt.LayerChart(...)"
+ ]
+ },
+ "execution_count": 77,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# NOTE: this chart sometimes reports loc_of_incorporation as 100% complete, which is unexpected; cause not yet known\n",
+ "completeness_chart([ex21_match_df[match_cols], sec_match_df[match_cols]], db_api=db_api)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "6b6b20bc-cd22-42cc-b24d-8d581a311ca8",
+ "metadata": {},
+ "source": [
+ "There is strong skew in the location of incorporation field with around 40-50% of the values being Delaware in both datasets. We therefore want to use `term_frequency_adjustments` in our linkage model."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 78,
+ "id": "a5c26016-2c59-4335-bd39-8b2e7ea91840",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
+ ],
+ "text/plain": [
+ "alt.VConcatChart(...)"
+ ]
+ },
+ "execution_count": 78,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "profile_columns(ex21_match_df[match_cols], db_api=DuckDBAPI(), top_n=10, bottom_n=5)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 79,
+ "id": "2a57f717-140f-434d-8998-983b8bf38ac5",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
+ ],
+ "text/plain": [
+ "alt.VConcatChart(...)"
+ ]
+ },
+ "execution_count": 79,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "profile_columns(sec_match_df[match_cols], db_api=DuckDBAPI(), top_n=10, bottom_n=5)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "1f258250-97c1-4f19-b535-cb91ff9e0ea9",
+ "metadata": {
+ "tags": []
+ },
+ "source": [
+ "## Blocking\n",
+ "\n",
+ "The subsidiary-to-filer match might be feasible without blocking, but we probably still want a blocking rule.\n",
+ "\n",
+ "TODO: Can we block on the nearest 5 report years instead of requiring an exact match on report year?"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 36,
+ "id": "fb6d143b-5201-4b31-849c-97db80781ade",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "from splink import block_on\n",
+ "from splink.blocking_analysis import count_comparisons_from_blocking_rule, n_largest_blocks"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 80,
+ "id": "22766c9f-7371-483f-82b0-015549a84357",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "br = \"substr(l.company_name_mphone,1,4) = substr(r.company_name_mphone,1,4)\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 81,
+ "id": "60937a9c-dff6-4d68-808f-81b8228fc9f6",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'number_of_comparisons_generated_pre_filter_conditions': 531298,\n",
+ " 'number_of_comparisons_to_be_scored_post_filter_conditions': 531298,\n",
+ " 'filter_conditions_identified': '',\n",
+ " 'equi_join_conditions_identified': 'SUBSTRING(l.company_name_mphone, 1, 4) = SUBSTRING(r.company_name_mphone, 1, 4)',\n",
+ " 'link_type_join_condition': 'where l.\"source_dataset\" || \\'-__-\\' || l.\"record_id\" < r.\"source_dataset\" || \\'-__-\\' || r.\"record_id\" and l.\"source_dataset\" != r.\"source_dataset\"'}"
+ ]
+ },
+ "execution_count": 81,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# br0 = block_on(\"report_year\", \"report_year\")\n",
+ "# br1 = \"jaccard(l.company_name, r.company_name) < .1\"\n",
+ "# br2 = block_on(\"company_name\", \"company_name\")\n",
+ "\n",
+ "counts = count_comparisons_from_blocking_rule(\n",
+ " table_or_tables=[sec_match_df, ex21_match_df],\n",
+ " blocking_rule=br,\n",
+ " link_type=\"link_only\",\n",
+ " unique_id_column_name='record_id',\n",
+ " db_api=db_api,\n",
+ ")\n",
+ "\n",
+ "counts"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 82,
+ "id": "67717313-2c17-4b6b-b984-8f7bc955c678",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " key_0 | \n",
+ " count_l | \n",
+ " count_r | \n",
+ " block_count | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " AMRK | \n",
+ " 56 | \n",
+ " 625 | \n",
+ " 35000 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " FRST | \n",
+ " 56 | \n",
+ " 555 | \n",
+ " 31080 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " INTR | \n",
+ " 30 | \n",
+ " 659 | \n",
+ " 19770 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " key_0 count_l count_r block_count\n",
+ "0 AMRK 56 625 35000\n",
+ "1 FRST 56 555 31080\n",
+ "2 INTR 30 659 19770"
+ ]
+ },
+ "execution_count": 82,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "result = n_largest_blocks(\n",
+ " table_or_tables=[sec_match_df, ex21_match_df],\n",
+ " blocking_rule=br,\n",
+ " link_type=\"link_only\",\n",
+ " db_api=db_api,\n",
+ " n_largest=3\n",
+ ")\n",
+ "\n",
+ "result.as_pandas_dataframe()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 84,
+ "id": "6fe6fb99-f5fd-4538-a8bc-c9dd41f4ff9c",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
+ ],
+ "text/plain": [
+ "alt.Chart(...)"
+ ]
+ },
+ "execution_count": 84,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from splink.blocking_analysis import (\n",
+ " cumulative_comparisons_to_be_scored_from_blocking_rules_chart,\n",
+ ")\n",
+ "\n",
+ "blocking_rules_for_analysis = [\n",
+ " br\n",
+ "]\n",
+ "\n",
+ "\n",
+ "cumulative_comparisons_to_be_scored_from_blocking_rules_chart(\n",
+ " table_or_tables=[sec_match_df, ex21_match_df],\n",
+ " blocking_rules=blocking_rules_for_analysis,\n",
+ " db_api=db_api,\n",
+ " unique_id_column_name='record_id',\n",
+ " link_type=\"link_only\",\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b553f3fb-0661-46ab-b43c-f5fcba608a09",
+ "metadata": {},
+ "source": [
+ "## Create Model\n",
+ "\n",
+ "Open question: should we deduplicate the Ex. 21 data first and then link it to the SEC filers?"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 44,
+ "id": "1f12d114-22fd-4f12-a0be-6a62500e80d5",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "import splink.comparison_library as cl\n",
+ "from splink import Linker, SettingsCreator"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 120,
+ "id": "e9cf27ac-6f65-4c73-9e11-9445a8977531",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Comparison 'ExactMatch' of \"company_name\".\n",
+ "Similarity is assessed using the following ComparisonLevels:\n",
+ " - 'company_name is NULL' with SQL rule: \"company_name_l\" IS NULL OR \"company_name_r\" IS NULL\n",
+ " - 'Exact match on company_name' with SQL rule: \"company_name_l\" = \"company_name_r\"\n",
+ " - 'All other comparisons' with SQL rule: ELSE\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "company_name_comparison = cl.ExactMatch(\n",
+ " \"company_name\",\n",
+ ")\n",
+ "print(company_name_comparison.get_comparison(\"duckdb\").human_readable_description)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 85,
+ "id": "a0d056b4-b7b5-4f01-ad60-3ffc2bec54eb",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Comparison 'LevenshteinAtThresholds' of \"company_name\".\n",
+ "Similarity is assessed using the following ComparisonLevels:\n",
+ " - 'company_name is NULL' with SQL rule: \"company_name_l\" IS NULL OR \"company_name_r\" IS NULL\n",
+ " - 'Exact match on company_name' with SQL rule: \"company_name_l\" = \"company_name_r\"\n",
+ " - 'Levenshtein distance of company_name <= 1' with SQL rule: levenshtein(\"company_name_l\", \"company_name_r\") <= 1\n",
+ " - 'All other comparisons' with SQL rule: ELSE\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "company_name_comparison = cl.LevenshteinAtThresholds(\n",
+ " \"company_name\",\n",
+ " distance_threshold_or_thresholds=[1]\n",
+ ")\n",
+ "print(company_name_comparison.get_comparison(\"duckdb\").human_readable_description)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 81,
+ "id": "bf199c98-5239-4a1e-8856-19d74e42b7db",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Comparison 'ArrayIntersectAtSizes' of \"company_name_mphone_list\".\n",
+ "Similarity is assessed using the following ComparisonLevels:\n",
+ " - 'company_name_mphone_list is NULL' with SQL rule: \"company_name_mphone_list_l\" IS NULL OR \"company_name_mphone_list_r\" IS NULL\n",
+ " - 'Array intersection size >= 3' with SQL rule: array_length(list_intersect(\"company_name_mphone_list_l\", \"company_name_mphone_list_r\")) >= 3\n",
+ " - 'Array intersection size >= 2' with SQL rule: array_length(list_intersect(\"company_name_mphone_list_l\", \"company_name_mphone_list_r\")) >= 2\n",
+ " - 'Array intersection size >= 1' with SQL rule: array_length(list_intersect(\"company_name_mphone_list_l\", \"company_name_mphone_list_r\")) >= 1\n",
+ " - 'All other comparisons' with SQL rule: ELSE\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "company_name_comparison = cl.ArrayIntersectAtSizes(\n",
+ " \"company_name_mphone_list\",\n",
+ " size_threshold_or_thresholds=[3,2,1]\n",
+ ")\n",
+ "print(company_name_comparison.get_comparison(\"duckdb\").human_readable_description)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 122,
+ "id": "7d2697d3-efdb-4be4-8911-18b457f5bab4",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Comparison 'JaroWinklerAtThresholds' of \"loc_of_incorporation\".\n",
+ "Similarity is assessed using the following ComparisonLevels:\n",
+ " - 'loc_of_incorporation is NULL' with SQL rule: \"loc_of_incorporation_l\" IS NULL OR \"loc_of_incorporation_r\" IS NULL\n",
+ " - 'Exact match on loc_of_incorporation' with SQL rule: \"loc_of_incorporation_l\" = \"loc_of_incorporation_r\"\n",
+ " - 'Jaro-Winkler distance of loc_of_incorporation >= 0.9' with SQL rule: jaro_winkler_similarity(\"loc_of_incorporation_l\", \"loc_of_incorporation_r\") >= 0.9\n",
+ " - 'All other comparisons' with SQL rule: ELSE\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "# TODO: also try a Levenshtein comparison here\n",
+ "location_comparison = cl.JaroWinklerAtThresholds(\n",
+ " \"loc_of_incorporation\",\n",
+ " score_threshold_or_thresholds=[0.9]\n",
+ ")\n",
+ "print(location_comparison.get_comparison(\"duckdb\").human_readable_description)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 121,
+ "id": "f3529a5a-7ced-46dd-af22-7bb44ed92aa2",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Comparison 'ArrayIntersectAtSizes' of \"loc_list\".\n",
+ "Similarity is assessed using the following ComparisonLevels:\n",
+ " - 'loc_list is NULL' with SQL rule: \"loc_list_l\" IS NULL OR \"loc_list_r\" IS NULL\n",
+ " - 'Array intersection size >= 2' with SQL rule: array_length(list_intersect(\"loc_list_l\", \"loc_list_r\")) >= 2\n",
+ " - 'Array intersection size >= 1' with SQL rule: array_length(list_intersect(\"loc_list_l\", \"loc_list_r\")) >= 1\n",
+ " - 'All other comparisons' with SQL rule: ELSE\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "location_comparison = cl.ArrayIntersectAtSizes(\n",
+ " \"loc_list\",\n",
+ " size_threshold_or_thresholds=[2,1]\n",
+ ")\n",
+ "print(location_comparison.get_comparison(\"duckdb\").human_readable_description)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 123,
+ "id": "92c1ad6b-4516-4ab4-90eb-394669c4a02b",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "settings = SettingsCreator(\n",
+ " link_type=\"link_only\",\n",
+ " unique_id_column_name=\"record_id\",\n",
+ " comparisons=[\n",
+ " company_name_comparison,\n",
+ " location_comparison.configure(term_frequency_adjustments=True)\n",
+ " ],\n",
+ " blocking_rules_to_generate_predictions=[\n",
+ " br\n",
+ " ],\n",
+ " retain_intermediate_calculation_columns=True,\n",
+ ")\n",
+ "\n",
+ "linker = Linker([sec_match_df, ex21_match_df], settings, db_api=DuckDBAPI())"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "2f293657-b40c-4539-8abd-8524d11c39c0",
+ "metadata": {},
+ "source": [
+ "Estimate the probability that two random records match."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 124,
+ "id": "e9eb59b9-49cc-45b7-8ffa-b8f7e5372608",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "d8daffbf12a14f72a247e47fc2fa719a",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Probability two random records match is estimated to be 8.21e-05.\n",
+ "This means that amongst all possible pairwise record comparisons, one in 12,184.39 are expected to match. With 1,368,717,009 total possible comparisons, we expect a total of around 112,333.68 matching pairs\n"
+ ]
+ }
+ ],
+ "source": [
+ "deterministic_rules = [\n",
+ " block_on(\"company_name_mphone\", \"company_name_mphone\"),\n",
+ " \"jaccard(r.company_name, l.company_name) >= .95 and l.loc_of_incorporation = r.loc_of_incorporation\",\n",
+ " \"substr(l.company_name_mphone,1,3) = substr(r.company_name_mphone,1,3) and jaccard(r.company_name, l.company_name) >= .95\",\n",
+ " # \"substr(l.company_name_mphone,1,5) = substr(r.company_name_mphone,1,5) and l.loc_of_incorporation = r.loc_of_incorporation\"\n",
+ "]\n",
+ "\n",
+ "linker.training.estimate_probability_two_random_records_match(deterministic_rules, recall=0.95)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 125,
+ "id": "5117653e-e72b-4c13-b923-d1228b39d357",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "----- Estimating u probabilities using random sampling -----\n",
+ "\n",
+ "Estimated u probabilities using random sampling\n",
+ "\n",
+ "Your model is not yet fully trained. Missing estimates for:\n",
+ " - company_name (no m values are trained).\n",
+ " - loc_of_incorporation (no m values are trained).\n"
+ ]
+ }
+ ],
+ "source": [
+ "linker.training.estimate_u_using_random_sampling(max_pairs=1e7)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 126,
+ "id": "8b089a0d-4c91-4b4d-9806-ed83c9bd59b9",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "----- Starting EM training session -----\n",
+ "\n",
+ "Estimating the m probabilities of the model by blocking on:\n",
+ "(l.\"company_name_mphone\" = r.\"company_name_mphone\") AND (l.\"company_name_mphone\" = r.\"company_name_mphone\")\n",
+ "\n",
+ "Parameter estimates will be made for the following comparison(s):\n",
+ " - company_name\n",
+ " - loc_of_incorporation\n",
+ "\n",
+ "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n",
+ "\n",
+ "Iteration 1: Largest change in params was -0.38 in the m_probability of loc_of_incorporation, level `Exact match on loc_of_incorporation`\n",
+ "Iteration 2: Largest change in params was 0.027 in the m_probability of loc_of_incorporation, level `All other comparisons`\n",
+ "Iteration 3: Largest change in params was -0.000274 in the m_probability of company_name, level `Exact match on company_name`\n",
+ "Iteration 4: Largest change in params was -0.00056 in the m_probability of company_name, level `Exact match on company_name`\n",
+ "Iteration 5: Largest change in params was 0.00112 in the m_probability of company_name, level `All other comparisons`\n",
+ "Iteration 6: Largest change in params was 0.00214 in the m_probability of company_name, level `All other comparisons`\n",
+ "Iteration 7: Largest change in params was 0.00387 in the m_probability of company_name, level `All other comparisons`\n",
+ "Iteration 8: Largest change in params was -0.00648 in the m_probability of company_name, level `Exact match on company_name`\n",
+ "Iteration 9: Largest change in params was 0.00989 in the m_probability of company_name, level `All other comparisons`\n",
+ "Iteration 10: Largest change in params was 0.0137 in the m_probability of company_name, level `All other comparisons`\n",
+ "Iteration 11: Largest change in params was 0.0171 in the m_probability of company_name, level `All other comparisons`\n",
+ "Iteration 12: Largest change in params was -0.0197 in the m_probability of company_name, level `Exact match on company_name`\n",
+ "Iteration 13: Largest change in params was 0.0209 in the m_probability of company_name, level `All other comparisons`\n",
+ "Iteration 14: Largest change in params was -0.0209 in the m_probability of company_name, level `Exact match on company_name`\n",
+ "Iteration 15: Largest change in params was -0.0201 in the m_probability of company_name, level `Exact match on company_name`\n",
+ "Iteration 16: Largest change in params was -0.0187 in the m_probability of company_name, level `Exact match on company_name`\n",
+ "Iteration 17: Largest change in params was -0.017 in the m_probability of company_name, level `Exact match on company_name`\n",
+ "Iteration 18: Largest change in params was 0.0153 in the m_probability of company_name, level `All other comparisons`\n",
+ "Iteration 19: Largest change in params was -0.0136 in the m_probability of company_name, level `Exact match on company_name`\n",
+ "Iteration 20: Largest change in params was -0.0121 in the m_probability of company_name, level `Exact match on company_name`\n",
+ "Iteration 21: Largest change in params was -0.0107 in the m_probability of company_name, level `Exact match on company_name`\n",
+ "Iteration 22: Largest change in params was -0.0094 in the m_probability of company_name, level `Exact match on company_name`\n",
+ "Iteration 23: Largest change in params was 0.00828 in the m_probability of company_name, level `All other comparisons`\n",
+ "Iteration 24: Largest change in params was -0.00728 in the m_probability of company_name, level `Exact match on company_name`\n",
+ "Iteration 25: Largest change in params was -0.00641 in the m_probability of company_name, level `Exact match on company_name`\n",
+ "\n",
+ "EM converged after 25 iterations\n",
+ "\n",
+ "Your model is fully trained. All comparisons have at least one estimate for their m and u values\n"
+ ]
+ }
+ ],
+ "source": [
+ "training_blocking_rule = block_on(\"company_name_mphone\", \"company_name_mphone\")\n",
+ "training_session_fname_sname = (\n",
+ " linker.training.estimate_parameters_using_expectation_maximisation(training_blocking_rule)\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 127,
+ "id": "88e058bc-800d-4da4-92aa-6ddb7377b4bf",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
+ ],
+ "text/plain": [
+ "alt.VConcatChart(...)"
+ ]
+ },
+ "execution_count": 127,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "linker.visualisations.match_weights_chart()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 128,
+ "id": "673a4776-1de1-46ce-a411-f7fd1668d54f",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
+ ],
+ "text/plain": [
+ "alt.HConcatChart(...)"
+ ]
+ },
+ "execution_count": 128,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "linker.visualisations.m_u_parameters_chart()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 107,
+ "id": "ebf9e326-38f1-4d78-b302-15867cda1009",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "settings = linker.misc.save_model_to_json(\n",
+ " \"../sec_ex21_model_settings/2023_model.json\", overwrite=True\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a14055d2-6761-4906-8555-35c92553a0e9",
+ "metadata": {},
+ "source": [
+ "Log the model in MLflow."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "dfe4feca-e694-4ec6-a5b0-11382c740516",
+ "metadata": {},
+ "source": [
+ "## Make predictions"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 129,
+ "id": "72ff6575-68e3-4256-8253-85eb2564501f",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Blocking time: 0.20 seconds\n",
+ "Predict time: 0.12 seconds\n"
+ ]
+ }
+ ],
+ "source": [
+ "df_predictions = linker.inference.predict(threshold_match_probability=0.5)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 130,
+ "id": "24e14675-11cf-4c46-a592-7733326113d2",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "preds_df = df_predictions.as_pandas_dataframe()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 131,
+ "id": "d50332a5-a8dc-444b-be92-b9d29f73763e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "preds_df = preds_df.merge(sec_df[[\"record_id\", \"company_name_raw\"]], how=\"left\", left_on=\"record_id_l\", right_on=\"record_id\").rename(columns={\"company_name_raw\": \"company_name_sec\"})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 132,
+ "id": "fddbed17-3d71-4c85-95d5-c3d0fd517f9d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "preds_df = preds_df.merge(ex21_df[[\"record_id\", \"company_name_raw\"]], how=\"left\", left_on=\"record_id_r\", right_on=\"record_id\").rename(columns={\"company_name_raw\": \"company_name_ex21\"})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 133,
+ "id": "3d733c2a-7004-4ce8-8d3f-25ed1e720c36",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " match_weight | \n",
+ " match_probability | \n",
+ " source_dataset_l | \n",
+ " source_dataset_r | \n",
+ " record_id_l | \n",
+ " record_id_r | \n",
+ " company_name_l | \n",
+ " company_name_r | \n",
+ " gamma_company_name | \n",
+ " bf_company_name | \n",
+ " loc_of_incorporation_l | \n",
+ " loc_of_incorporation_r | \n",
+ " gamma_loc_of_incorporation | \n",
+ " tf_loc_of_incorporation_l | \n",
+ " tf_loc_of_incorporation_r | \n",
+ " bf_loc_of_incorporation | \n",
+ " bf_tf_adj_loc_of_incorporation | \n",
+ " company_name_mphone_l | \n",
+ " company_name_mphone_r | \n",
+ " record_id_x | \n",
+ " company_name_sec | \n",
+ " record_id_y | \n",
+ " company_name_ex21 | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 6.816691 | \n",
+ " 0.991207 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 8180 | \n",
+ " 159390 | \n",
+ " national instruments corporation | \n",
+ " national instruments corporation | \n",
+ " 1 | \n",
+ " 2.492261e+06 | \n",
+ " delaware | \n",
+ " republic of korea | \n",
+ " 0 | \n",
+ " 0.372842 | \n",
+ " 0.000234 | \n",
+ " 0.551065 | \n",
+ " 1.0 | \n",
+ " NXNL INSTRMNTS | \n",
+ " NXNL INSTRMNTS | \n",
+ " 8180 | \n",
+ " national instruments corp | \n",
+ " 159390 | \n",
+ " national instruments (korea) corporation | \n",
+ "
\n",
+ " \n",
+ " 176 | \n",
+ " 6.816691 | \n",
+ " 0.991207 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 6034 | \n",
+ " 107265 | \n",
+ " afternext healthtech acquisition corporation | \n",
+ " afternext healthtech acquisition corporation | \n",
+ " 1 | \n",
+ " 2.492261e+06 | \n",
+ " e9 | \n",
+ " cayman islands | \n",
+ " 0 | \n",
+ " 0.001069 | \n",
+ " 0.015387 | \n",
+ " 0.551065 | \n",
+ " 1.0 | \n",
+ " AFTRNKST HL0TX AKKSXN | \n",
+ " AFTRNKST HL0TX AKKSXN | \n",
+ " 6034 | \n",
+ " afternext healthtech acquisition corp. | \n",
+ " 107265 | \n",
+ " afternext healthtech acquisition corp | \n",
+ "
\n",
+ " \n",
+ " 178 | \n",
+ " 6.816691 | \n",
+ " 0.991207 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 6799 | \n",
+ " 117610 | \n",
+ " gap incorporated | \n",
+ " gap incorporated | \n",
+ " 1 | \n",
+ " 2.492261e+06 | \n",
+ " delaware | \n",
+ " puerto rico | \n",
+ " 0 | \n",
+ " 0.372842 | \n",
+ " 0.001548 | \n",
+ " 0.551065 | \n",
+ " 1.0 | \n",
+ " KP | \n",
+ " KP | \n",
+ " 6799 | \n",
+ " gap inc | \n",
+ " 117610 | \n",
+ " gap (puerto rico), inc | \n",
+ "
\n",
+ " \n",
+ " 183 | \n",
+ " 6.816691 | \n",
+ " 0.991207 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 5811 | \n",
+ " 170135 | \n",
+ " rockley photonics holdings limited | \n",
+ " rockley photonics holdings limited | \n",
+ " 1 | \n",
+ " 2.492261e+06 | \n",
+ " e9 | \n",
+ " cayman islands | \n",
+ " 0 | \n",
+ " 0.001069 | \n",
+ " 0.015387 | \n",
+ " 0.551065 | \n",
+ " 1.0 | \n",
+ " RKL FTNKS HLTNKS | \n",
+ " RKL FTNKS HLTNKS | \n",
+ " 5811 | \n",
+ " rockley photonics holdings ltd | \n",
+ " 170135 | \n",
+ " rockley photonics holdings limited | \n",
+ "
\n",
+ " \n",
+ " 184 | \n",
+ " 6.816691 | \n",
+ " 0.991207 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 6799 | \n",
+ " 117608 | \n",
+ " gap incorporated | \n",
+ " gap incorporated | \n",
+ " 1 | \n",
+ " 2.492261e+06 | \n",
+ " delaware | \n",
+ " california | \n",
+ " 0 | \n",
+ " 0.372842 | \n",
+ " 0.015978 | \n",
+ " 0.551065 | \n",
+ " 1.0 | \n",
+ " KP | \n",
+ " KP | \n",
+ " 6799 | \n",
+ " gap inc | \n",
+ " 117608 | \n",
+ " gap (itm) inc | \n",
+ "
\n",
+ " \n",
+ " 186 | \n",
+ " 6.816691 | \n",
+ " 0.991207 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 6799 | \n",
+ " 117605 | \n",
+ " gap incorporated | \n",
+ " gap incorporated | \n",
+ " 1 | \n",
+ " 2.492261e+06 | \n",
+ " delaware | \n",
+ " canada | \n",
+ " 0 | \n",
+ " 0.372842 | \n",
+ " 0.012191 | \n",
+ " 0.551065 | \n",
+ " 1.0 | \n",
+ " KP | \n",
+ " KP | \n",
+ " 6799 | \n",
+ " gap inc | \n",
+ " 117605 | \n",
+ " gap (canada) inc | \n",
+ "
\n",
+ " \n",
+ " 412 | \n",
+ " 6.816691 | \n",
+ " 0.991207 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 1524 | \n",
+ " 165843 | \n",
+ " aircastle limited | \n",
+ " aircastle limited | \n",
+ " 1 | \n",
+ " 2.492261e+06 | \n",
+ " d0 | \n",
+ " ireland | \n",
+ " 0 | \n",
+ " 0.000150 | \n",
+ " 0.008315 | \n",
+ " 0.551065 | \n",
+ " 1.0 | \n",
+ " ARKSTL | \n",
+ " ARKSTL | \n",
+ " 1524 | \n",
+ " aircastle ltd | \n",
+ " 165843 | \n",
+ " aircastle (ireland) limited | \n",
+ "
\n",
+ " \n",
+ " 189 | \n",
+ " 6.816691 | \n",
+ " 0.991207 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 6753 | \n",
+ " 115383 | \n",
+ " arthur j gallagher and company | \n",
+ " arthur j gallagher and company | \n",
+ " 1 | \n",
+ " 2.492261e+06 | \n",
+ " illinois | \n",
+ " delaware | \n",
+ " 0 | \n",
+ " 0.006115 | \n",
+ " 0.372842 | \n",
+ " 0.551065 | \n",
+ " 1.0 | \n",
+ " AR0R J KLKHR ANT | \n",
+ " AR0R J KLKHR ANT | \n",
+ " 6753 | \n",
+ " arthur j. gallagher & co. | \n",
+ " 115383 | \n",
+ " arthur j. gallagher & co | \n",
+ "
\n",
+ " \n",
+ " 193 | \n",
+ " 6.816691 | \n",
+ " 0.991207 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 6651 | \n",
+ " 110797 | \n",
+ " flowserve corporation | \n",
+ " flowserve corporation | \n",
+ " 1 | \n",
+ " 2.492261e+06 | \n",
+ " new york | \n",
+ " mauritius | \n",
+ " 0 | \n",
+ " 0.009913 | \n",
+ " 0.001075 | \n",
+ " 0.551065 | \n",
+ " 1.0 | \n",
+ " FLSRF | \n",
+ " FLSRF | \n",
+ " 6651 | \n",
+ " flowserve corp | \n",
+ " 110797 | \n",
+ " flowserve (mauritius) corporation | \n",
+ "
\n",
+ " \n",
+ " 406 | \n",
+ " 6.816691 | \n",
+ " 0.991207 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 578 | \n",
+ " 24844 | \n",
+ " united parcel service incorporated | \n",
+ " united parcel service incorporated | \n",
+ " 1 | \n",
+ " 2.492261e+06 | \n",
+ " delaware | \n",
+ " ohio | \n",
+ " 0 | \n",
+ " 0.372842 | \n",
+ " 0.008136 | \n",
+ " 0.551065 | \n",
+ " 1.0 | \n",
+ " UNTT PRSL SRFS | \n",
+ " UNTT PRSL SRFS | \n",
+ " 578 | \n",
+ " united parcel service inc | \n",
+ " 24844 | \n",
+ " united parcel service, inc | \n",
+ "
\n",
+ " \n",
+ " 198 | \n",
+ " 6.816691 | \n",
+ " 0.991207 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 5812 | \n",
+ " 171905 | \n",
+ " nextracker incorporated | \n",
+ " nextracker incorporated | \n",
+ " 1 | \n",
+ " 2.492261e+06 | \n",
+ " delaware | \n",
+ " united states delaware | \n",
+ " 0 | \n",
+ " 0.372842 | \n",
+ " 0.002278 | \n",
+ " 0.551065 | \n",
+ " 1.0 | \n",
+ " NKSTRKR | \n",
+ " NKSTRKR | \n",
+ " 5812 | \n",
+ " nextracker inc. | \n",
+ " 171905 | \n",
+ " nextracker inc | \n",
+ "
\n",
+ " \n",
+ " 199 | \n",
+ " 6.816691 | \n",
+ " 0.991207 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 5843 | \n",
+ " 51850 | \n",
+ " sculptor acquisition corp i | \n",
+ " sculptor acquisition corp i | \n",
+ " 1 | \n",
+ " 2.492261e+06 | \n",
+ " e9 | \n",
+ " cayman islands | \n",
+ " 0 | \n",
+ " 0.001069 | \n",
+ " 0.015387 | \n",
+ " 0.551065 | \n",
+ " 1.0 | \n",
+ " SKLPTR AKKSXN I | \n",
+ " SKLPTR AKKSXN I | \n",
+ " 5843 | \n",
+ " sculptor acquisition corp i | \n",
+ " 51850 | \n",
+ " sculptor acquisition corp i | \n",
+ "
\n",
+ " \n",
+ " 174 | \n",
+ " 6.816691 | \n",
+ " 0.991207 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 7095 | \n",
+ " 179994 | \n",
+ " cintas corporation | \n",
+ " cintas corporation | \n",
+ " 1 | \n",
+ " 2.492261e+06 | \n",
+ " washington | \n",
+ " nevada | \n",
+ " 0 | \n",
+ " 0.002996 | \n",
+ " 0.014652 | \n",
+ " 0.551065 | \n",
+ " 1.0 | \n",
+ " SNTS | \n",
+ " SNTS | \n",
+ " 7095 | \n",
+ " cintas corp | \n",
+ " 179994 | \n",
+ " cintas corporation | \n",
+ "
\n",
+ " \n",
+ " 405 | \n",
+ " 6.816691 | \n",
+ " 0.991207 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 285 | \n",
+ " 12641 | \n",
+ " onespan incorporated | \n",
+ " onespan incorporated | \n",
+ " 1 | \n",
+ " 2.492261e+06 | \n",
+ " delaware | \n",
+ " usa, state of delaware | \n",
+ " 0 | \n",
+ " 0.372842 | \n",
+ " 0.000011 | \n",
+ " 0.551065 | \n",
+ " 1.0 | \n",
+ " ONSPN | \n",
+ " ONSPN | \n",
+ " 285 | \n",
+ " onespan inc. | \n",
+ " 12641 | \n",
+ " onespan inc | \n",
+ "
\n",
+ " \n",
+ " 207 | \n",
+ " 6.816691 | \n",
+ " 0.991207 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 6282 | \n",
+ " 97173 | \n",
+ " mars acquisition corporation | \n",
+ " mars acquisition corporation | \n",
+ " 1 | \n",
+ " 2.492261e+06 | \n",
+ " e9 | \n",
+ " delaware | \n",
+ " 0 | \n",
+ " 0.001069 | \n",
+ " 0.372842 | \n",
+ " 0.551065 | \n",
+ " 1.0 | \n",
+ " MRS AKKSXN | \n",
+ " MRS AKKSXN | \n",
+ " 6282 | \n",
+ " mars acquisition corp. | \n",
+ " 97173 | \n",
+ " mars acquisition corp | \n",
+ "
\n",
+ " \n",
+ " 212 | \n",
+ " 6.816691 | \n",
+ " 0.991207 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 4834 | \n",
+ " 97747 | \n",
+ " viatris incorporated | \n",
+ " viatris incorporated | \n",
+ " 1 | \n",
+ " 2.492261e+06 | \n",
+ " delaware | \n",
+ " philippines | \n",
+ " 0 | \n",
+ " 0.372842 | \n",
+ " 0.001927 | \n",
+ " 0.551065 | \n",
+ " 1.0 | \n",
+ " FTRS | \n",
+ " FTRS | \n",
+ " 4834 | \n",
+ " viatris inc | \n",
+ " 97747 | \n",
+ " viatris, inc | \n",
+ "
\n",
+ " \n",
+ " 397 | \n",
+ " 6.816691 | \n",
+ " 0.991207 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 1205 | \n",
+ " 35911 | \n",
+ " turning point brands incorporated | \n",
+ " turning point brands incorporated | \n",
+ " 1 | \n",
+ " 2.492261e+06 | \n",
+ " delaware | \n",
+ " ontario, canada | \n",
+ " 0 | \n",
+ " 0.372842 | \n",
+ " 0.000852 | \n",
+ " 0.551065 | \n",
+ " 1.0 | \n",
+ " TRNNK PNT BRNTS | \n",
+ " TRNNK PNT BRNTS | \n",
+ " 1205 | \n",
+ " turning point brands, inc. | \n",
+ " 35911 | \n",
+ " turning point brands (canada) inc | \n",
+ "
\n",
+ " \n",
+ " 396 | \n",
+ " 6.816691 | \n",
+ " 0.991207 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 1171 | \n",
+ " 35941 | \n",
+ " clearpoint neuro incorporated | \n",
+ " clearpoint neuro incorporated | \n",
+ " 1 | \n",
+ " 2.492261e+06 | \n",
+ " delaware | \n",
+ " canada new brunswick | \n",
+ " 0 | \n",
+ " 0.372842 | \n",
+ " 0.000006 | \n",
+ " 0.551065 | \n",
+ " 1.0 | \n",
+ " KLRPNT NR | \n",
+ " KLRPNT NR | \n",
+ " 1171 | \n",
+ " clearpoint neuro, inc. | \n",
+ " 35941 | \n",
+ " clearpoint neuro (canada) inc | \n",
+ "
\n",
+ " \n",
+ " 393 | \n",
+ " 6.816691 | \n",
+ " 0.991207 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 1765 | \n",
+ " 51537 | \n",
+ " genpact limited | \n",
+ " genpact limited | \n",
+ " 1 | \n",
+ " 2.492261e+06 | \n",
+ " d0 | \n",
+ " united kingdom | \n",
+ " 0 | \n",
+ " 0.000150 | \n",
+ " 0.031521 | \n",
+ " 0.551065 | \n",
+ " 1.0 | \n",
+ " JNPKT | \n",
+ " JNPKT | \n",
+ " 1765 | \n",
+ " genpact ltd | \n",
+ " 51537 | \n",
+ " genpact (uk) ltd | \n",
+ "
\n",
+ " \n",
+ " 223 | \n",
+ " 6.816691 | \n",
+ " 0.991207 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 6181 | \n",
+ " 106386 | \n",
+ " perimeter solutions sa | \n",
+ " perimeter solutions sa | \n",
+ " 1 | \n",
+ " 2.492261e+06 | \n",
+ " n4 | \n",
+ " grand of luxembourg | \n",
+ " 0 | \n",
+ " 0.000017 | \n",
+ " 0.000011 | \n",
+ " 0.551065 | \n",
+ " 1.0 | \n",
+ " PRMTR SLXNS S | \n",
+ " PRMTR SLXNS S | \n",
+ " 6181 | \n",
+ " perimeter solutions, sa | \n",
+ " 106386 | \n",
+ " perimeter solutions sa | \n",
+ "
\n",
+ " \n",
+ " 390 | \n",
+ " 6.816691 | \n",
+ " 0.991207 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 949 | \n",
+ " 34324 | \n",
+ " ceva incorporated | \n",
+ " ceva incorporated | \n",
+ " 1 | \n",
+ " 2.492261e+06 | \n",
+ " delaware | \n",
+ " cayman islands | \n",
+ " 0 | \n",
+ " 0.372842 | \n",
+ " 0.015387 | \n",
+ " 0.551065 | \n",
+ " 1.0 | \n",
+ " SF | \n",
+ " SF | \n",
+ " 949 | \n",
+ " ceva inc | \n",
+ " 34324 | \n",
+ " ceva inc | \n",
+ "
\n",
+ " \n",
+ " 226 | \n",
+ " 6.816691 | \n",
+ " 0.991207 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 6825 | \n",
+ " 123476 | \n",
+ " harte hanks incorporated | \n",
+ " harte hanks incorporated | \n",
+ " 1 | \n",
+ " 2.492261e+06 | \n",
+ " delaware | \n",
+ " ohio | \n",
+ " 0 | \n",
+ " 0.372842 | \n",
+ " 0.008136 | \n",
+ " 0.551065 | \n",
+ " 1.0 | \n",
+ " HRT HNKS | \n",
+ " HRT HNKS | \n",
+ " 6825 | \n",
+ " harte hanks inc | \n",
+ " 123476 | \n",
+ " harte hanks, inc | \n",
+ "
\n",
+ " \n",
+ " 228 | \n",
+ " 6.816691 | \n",
+ " 0.991207 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 234 | \n",
+ " 6600 | \n",
+ " jones lang lasalle incorporated | \n",
+ " jones lang lasalle incorporated | \n",
+ " 1 | \n",
+ " 2.492261e+06 | \n",
+ " maryland | \n",
+ " puerto rico | \n",
+ " 0 | \n",
+ " 0.007786 | \n",
+ " 0.001548 | \n",
+ " 0.551065 | \n",
+ " 1.0 | \n",
+ " JNS LNK LSL | \n",
+ " JNS LNK LSL | \n",
+ " 234 | \n",
+ " jones lang lasalle inc | \n",
+ " 6600 | \n",
+ " jones lang lasalle (puerto rico), inc | \n",
+ "
\n",
+ " \n",
+ " 229 | \n",
+ " 6.816691 | \n",
+ " 0.991207 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 234 | \n",
+ " 6596 | \n",
+ " jones lang lasalle incorporated | \n",
+ " jones lang lasalle incorporated | \n",
+ " 1 | \n",
+ " 2.492261e+06 | \n",
+ " maryland | \n",
+ " philippines | \n",
+ " 0 | \n",
+ " 0.007786 | \n",
+ " 0.001927 | \n",
+ " 0.551065 | \n",
+ " 1.0 | \n",
+ " JNS LNK LSL | \n",
+ " JNS LNK LSL | \n",
+ " 234 | \n",
+ " jones lang lasalle inc | \n",
+ " 6596 | \n",
+ " jones lang lasalle (philippines), inc | \n",
+ "
\n",
+ " \n",
+ " 231 | \n",
+ " 6.816691 | \n",
+ " 0.991207 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 2097 | \n",
+ " 54939 | \n",
+ " optimizerx corporation | \n",
+ " optimizerx corporation | \n",
+ " 1 | \n",
+ " 2.492261e+06 | \n",
+ " nevada | \n",
+ " michigan | \n",
+ " 0 | \n",
+ " 0.014652 | \n",
+ " 0.007151 | \n",
+ " 0.551065 | \n",
+ " 1.0 | \n",
+ " OPTMSRKS | \n",
+ " OPTMSRKS | \n",
+ " 2097 | \n",
+ " optimizerx corp | \n",
+ " 54939 | \n",
+ " optimizerx corporation | \n",
+ "
\n",
+ " \n",
+ " 201 | \n",
+ " 6.816691 | \n",
+ " 0.991207 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 6176 | \n",
+ " 166072 | \n",
+ " phoenix motor incorporated | \n",
+ " phoenix motor incorporated | \n",
+ " 1 | \n",
+ " 2.492261e+06 | \n",
+ " delaware | \n",
+ " us | \n",
+ " 0 | \n",
+ " 0.372842 | \n",
+ " 0.000908 | \n",
+ " 0.551065 | \n",
+ " 1.0 | \n",
+ " FNKS MTR | \n",
+ " FNKS MTR | \n",
+ " 6176 | \n",
+ " phoenix motor inc. | \n",
+ " 166072 | \n",
+ " phoenix motor inc | \n",
+ "
\n",
+ " \n",
+ " 232 | \n",
+ " 6.816691 | \n",
+ " 0.991207 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 2117 | \n",
+ " 57288 | \n",
+ " transocean limited | \n",
+ " transocean limited | \n",
+ " 1 | \n",
+ " 2.492261e+06 | \n",
+ " v8 | \n",
+ " switzerland | \n",
+ " 0 | \n",
+ " 0.000033 | \n",
+ " 0.006421 | \n",
+ " 0.551065 | \n",
+ " 1.0 | \n",
+ " TRNSSN | \n",
+ " TRNSSN | \n",
+ " 2117 | \n",
+ " transocean ltd. | \n",
+ " 57288 | \n",
+ " transocean ltd | \n",
+ "
\n",
+ " \n",
+ " 421 | \n",
+ " 6.816691 | \n",
+ " 0.991207 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 1348 | \n",
+ " 40725 | \n",
+ " lazard group limited liability company | \n",
+ " lazard group limited liability company | \n",
+ " 1 | \n",
+ " 2.492261e+06 | \n",
+ " delaware | \n",
+ " us | \n",
+ " 0 | \n",
+ " 0.372842 | \n",
+ " 0.000908 | \n",
+ " 0.551065 | \n",
+ " 1.0 | \n",
+ " LSRT KRP | \n",
+ " LSRT KRP | \n",
+ " 1348 | \n",
+ " lazard group llc | \n",
+ " 40725 | \n",
+ " lazard group llc | \n",
+ "
\n",
+ " \n",
+ " 169 | \n",
+ " 6.816691 | \n",
+ " 0.991207 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 6922 | \n",
+ " 189462 | \n",
+ " analog devices incorporated | \n",
+ " analog devices incorporated | \n",
+ " 1 | \n",
+ " 2.492261e+06 | \n",
+ " massachusetts | \n",
+ " united states | \n",
+ " 0 | \n",
+ " 0.004466 | \n",
+ " 0.012146 | \n",
+ " 0.551065 | \n",
+ " 1.0 | \n",
+ " ANLK TFSS | \n",
+ " ANLK TFSS | \n",
+ " 6922 | \n",
+ " analog devices inc | \n",
+ " 189462 | \n",
+ " analog devices, inc | \n",
+ "
\n",
+ " \n",
+ " 115 | \n",
+ " 6.816691 | \n",
+ " 0.991207 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 2485 | \n",
+ " 167379 | \n",
+ " ameriguard security services incorporated | \n",
+ " ameriguard security services incorporated | \n",
+ " 1 | \n",
+ " 2.492261e+06 | \n",
+ " nevada | \n",
+ " california | \n",
+ " 0 | \n",
+ " 0.014652 | \n",
+ " 0.015978 | \n",
+ " 0.551065 | \n",
+ " 1.0 | \n",
+ " AMRKRT SKRT SRFSS | \n",
+ " AMRKRT SKRT SRFSS | \n",
+ " 2485 | \n",
+ " ameriguard security services, inc. | \n",
+ " 167379 | \n",
+ " ameriguard security services, inc | \n",
+ "
\n",
+ " \n",
+ " 116 | \n",
+ " 6.816691 | \n",
+ " 0.991207 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 2486 | \n",
+ " 167379 | \n",
+ " ameriguard security services incorporated | \n",
+ " ameriguard security services incorporated | \n",
+ " 1 | \n",
+ " 2.492261e+06 | \n",
+ " nevada | \n",
+ " california | \n",
+ " 0 | \n",
+ " 0.014652 | \n",
+ " 0.015978 | \n",
+ " 0.551065 | \n",
+ " 1.0 | \n",
+ " AMRKRT SKRT SRFSS | \n",
+ " AMRKRT SKRT SRFSS | \n",
+ " 2486 | \n",
+ " ameriguard security services, inc. | \n",
+ " 167379 | \n",
+ " ameriguard security services, inc | \n",
+ "
\n",
+ " \n",
+ " 120 | \n",
+ " 6.816691 | \n",
+ " 0.991207 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 4683 | \n",
+ " 95837 | \n",
+ " advantage solutions incorporated | \n",
+ " advantage solutions incorporated | \n",
+ " 1 | \n",
+ " 2.492261e+06 | \n",
+ " delaware | \n",
+ " canada | \n",
+ " 0 | \n",
+ " 0.372842 | \n",
+ " 0.012191 | \n",
+ " 0.551065 | \n",
+ " 1.0 | \n",
+ " ATFNTJ SLXNS | \n",
+ " ATFNTJ SLXNS | \n",
+ " 4683 | \n",
+ " advantage solutions inc. | \n",
+ " 95837 | \n",
+ " advantage solutions inc | \n",
+ "
\n",
+ " \n",
+ " 445 | \n",
+ " 6.816691 | \n",
+ " 0.991207 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 926 | \n",
+ " 165871 | \n",
+ " commvault systems incorporated | \n",
+ " commvault systems incorporated | \n",
+ " 1 | \n",
+ " 2.492261e+06 | \n",
+ " delaware | \n",
+ " ontario, canada | \n",
+ " 0 | \n",
+ " 0.372842 | \n",
+ " 0.000852 | \n",
+ " 0.551065 | \n",
+ " 1.0 | \n",
+ " KMFLT SSTMS | \n",
+ " KMFLT SSTMS | \n",
+ " 926 | \n",
+ " commvault systems inc | \n",
+ " 165871 | \n",
+ " commvault systems (canada) inc | \n",
+ "
\n",
+ " \n",
+ " 124 | \n",
+ " 6.816691 | \n",
+ " 0.991207 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 4148 | \n",
+ " 90738 | \n",
+ " firstsun capital bancorp | \n",
+ " firstsun capital bancorp | \n",
+ " 1 | \n",
+ " 2.492261e+06 | \n",
+ " delaware | \n",
+ " new mexico | \n",
+ " 0 | \n",
+ " 0.372842 | \n",
+ " 0.000652 | \n",
+ " 0.551065 | \n",
+ " 1.0 | \n",
+ " FRSTSN KPTL BNKRP | \n",
+ " FRSTSN KPTL BNKRP | \n",
+ " 4148 | \n",
+ " firstsun capital bancorp | \n",
+ " 90738 | \n",
+ " firstsun capital bancorp | \n",
+ "
\n",
+ " \n",
+ " 126 | \n",
+ " 6.816691 | \n",
+ " 0.991207 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 5544 | \n",
+ " 26048 | \n",
+ " taboola com limited | \n",
+ " taboola com limited | \n",
+ " 1 | \n",
+ " 2.492261e+06 | \n",
+ " l3 | \n",
+ " israel | \n",
+ " 0 | \n",
+ " 0.000061 | \n",
+ " 0.003057 | \n",
+ " 0.551065 | \n",
+ " 1.0 | \n",
+ " TBL KM | \n",
+ " TBL KM | \n",
+ " 5544 | \n",
+ " taboola.com ltd. | \n",
+ " 26048 | \n",
+ " taboola.com ltd | \n",
+ "
\n",
+ " \n",
+ " 443 | \n",
+ " 6.816691 | \n",
+ " 0.991207 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 2 | \n",
+ " 96 | \n",
+ " henry schein incorporated | \n",
+ " henry schein incorporated | \n",
+ " 1 | \n",
+ " 2.492261e+06 | \n",
+ " delaware | \n",
+ " pennsylvania | \n",
+ " 0 | \n",
+ " 0.372842 | \n",
+ " 0.007919 | \n",
+ " 0.551065 | \n",
+ " 1.0 | \n",
+ " HNR SXN | \n",
+ " HNR SXN | \n",
+ " 2 | \n",
+ " henry schein inc | \n",
+ " 96 | \n",
+ " henry schein (lancaster, pa) inc | \n",
+ "
\n",
+ " \n",
+ " 132 | \n",
+ " 6.816691 | \n",
+ " 0.991207 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 6668 | \n",
+ " 117995 | \n",
+ " tomi environmental solutions incorporated | \n",
+ " tomi environmental solutions incorporated | \n",
+ " 1 | \n",
+ " 2.492261e+06 | \n",
+ " florida | \n",
+ " nevada | \n",
+ " 0 | \n",
+ " 0.014691 | \n",
+ " 0.014652 | \n",
+ " 0.551065 | \n",
+ " 1.0 | \n",
+ " TM ENFRNMNTL SLXNS | \n",
+ " TM ENFRNMNTL SLXNS | \n",
+ " 6668 | \n",
+ " tomi environmental solutions, inc. | \n",
+ " 117995 | \n",
+ " tomi environmental solutions, inc | \n",
+ "
\n",
+ " \n",
+ " 136 | \n",
+ " 6.816691 | \n",
+ " 0.991207 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 6148 | \n",
+ " 107455 | \n",
+ " esab corporation | \n",
+ " esab corporation | \n",
+ " 1 | \n",
+ " 2.492261e+06 | \n",
+ " delaware | \n",
+ " united states | \n",
+ " 0 | \n",
+ " 0.372842 | \n",
+ " 0.012146 | \n",
+ " 0.551065 | \n",
+ " 1.0 | \n",
+ " ESB | \n",
+ " ESB | \n",
+ " 6148 | \n",
+ " esab corp | \n",
+ " 107455 | \n",
+ " esab corporation | \n",
+ "
\n",
+ " \n",
+ " 137 | \n",
+ " 6.816691 | \n",
+ " 0.991207 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 6958 | \n",
+ " 104521 | \n",
+ " apache corporation | \n",
+ " apache corporation | \n",
+ " 1 | \n",
+ " 2.492261e+06 | \n",
+ " delaware | \n",
+ " new jersey | \n",
+ " 0 | \n",
+ " 0.372842 | \n",
+ " 0.006143 | \n",
+ " 0.551065 | \n",
+ " 1.0 | \n",
+ " APX | \n",
+ " APX | \n",
+ " 6958 | \n",
+ " apache corp | \n",
+ " 104521 | \n",
+ " apache corporation | \n",
+ "
\n",
+ " \n",
+ " 138 | \n",
+ " 6.816691 | \n",
+ " 0.991207 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 7011 | \n",
+ " 121758 | \n",
+ " ncr corporation | \n",
+ " ncr corporation | \n",
+ " 1 | \n",
+ " 2.492261e+06 | \n",
+ " maryland | \n",
+ " new zealand | \n",
+ " 0 | \n",
+ " 0.007786 | \n",
+ " 0.002590 | \n",
+ " 0.551065 | \n",
+ " 1.0 | \n",
+ " NKR | \n",
+ " NKR | \n",
+ " 7011 | \n",
+ " ncr corp | \n",
+ " 121758 | \n",
+ " ncr (nz) corporation | \n",
+ "
\n",
+ " \n",
+ " 423 | \n",
+ " 6.816691 | \n",
+ " 0.991207 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 77 | \n",
+ " 165059 | \n",
+ " jakks pacific incorporated | \n",
+ " jakks pacific incorporated | \n",
+ " 1 | \n",
+ " 2.492261e+06 | \n",
+ " delaware | \n",
+ " canada | \n",
+ " 0 | \n",
+ " 0.372842 | \n",
+ " 0.012191 | \n",
+ " 0.551065 | \n",
+ " 1.0 | \n",
+ " JKS PSFK | \n",
+ " JKS PSFK | \n",
+ " 77 | \n",
+ " jakks pacific inc | \n",
+ " 165059 | \n",
+ " jakks pacific (canada), inc | \n",
+ "
\n",
+ " \n",
+ " 139 | \n",
+ " 6.816691 | \n",
+ " 0.991207 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 4902 | \n",
+ " 170051 | \n",
+ " gan limited | \n",
+ " gan limited | \n",
+ " 1 | \n",
+ " 2.492261e+06 | \n",
+ " d0 | \n",
+ " england and wales | \n",
+ " 0 | \n",
+ " 0.000150 | \n",
+ " 0.003536 | \n",
+ " 0.551065 | \n",
+ " 1.0 | \n",
+ " KN | \n",
+ " KN | \n",
+ " 4902 | \n",
+ " gan ltd | \n",
+ " 170051 | \n",
+ " gan (uk) limited | \n",
+ "
\n",
+ " \n",
+ " 141 | \n",
+ " 6.816691 | \n",
+ " 0.991207 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 6613 | \n",
+ " 108716 | \n",
+ " cts corporation | \n",
+ " cts corporation | \n",
+ " 1 | \n",
+ " 2.492261e+06 | \n",
+ " indiana | \n",
+ " delaware | \n",
+ " 0 | \n",
+ " 0.004060 | \n",
+ " 0.372842 | \n",
+ " 0.551065 | \n",
+ " 1.0 | \n",
+ " KTS | \n",
+ " KTS | \n",
+ " 6613 | \n",
+ " cts corp | \n",
+ " 108716 | \n",
+ " cts corporation | \n",
+ "
\n",
+ " \n",
+ " 437 | \n",
+ " 6.816691 | \n",
+ " 0.991207 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 738 | \n",
+ " 29776 | \n",
+ " garmin limited | \n",
+ " garmin limited | \n",
+ " 1 | \n",
+ " 2.492261e+06 | \n",
+ " v8 | \n",
+ " thailand | \n",
+ " 0 | \n",
+ " 0.000033 | \n",
+ " 0.002378 | \n",
+ " 0.551065 | \n",
+ " 1.0 | \n",
+ " KRMN | \n",
+ " KRMN | \n",
+ " 738 | \n",
+ " garmin ltd | \n",
+ " 29776 | \n",
+ " garmin (thailand) ltd | \n",
+ "
\n",
+ " \n",
+ " 435 | \n",
+ " 6.816691 | \n",
+ " 0.991207 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 277 | \n",
+ " 9849 | \n",
+ " c h robinson worldwide incorporated | \n",
+ " c h robinson worldwide incorporated | \n",
+ " 1 | \n",
+ " 2.492261e+06 | \n",
+ " delaware | \n",
+ " united states | \n",
+ " 0 | \n",
+ " 0.372842 | \n",
+ " 0.012146 | \n",
+ " 0.551065 | \n",
+ " 1.0 | \n",
+ " K H RBNSN WRLTWT | \n",
+ " K H RBNSN WRLTWT | \n",
+ " 277 | \n",
+ " c. h. robinson worldwide, inc. | \n",
+ " 9849 | \n",
+ " c.h. robinson worldwide, inc | \n",
+ "
\n",
+ " \n",
+ " 146 | \n",
+ " 6.816691 | \n",
+ " 0.991207 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 6763 | \n",
+ " 176423 | \n",
+ " richardson electronics limited | \n",
+ " richardson electronics limited | \n",
+ " 1 | \n",
+ " 2.492261e+06 | \n",
+ " delaware | \n",
+ " thailand | \n",
+ " 0 | \n",
+ " 0.372842 | \n",
+ " 0.002378 | \n",
+ " 0.551065 | \n",
+ " 1.0 | \n",
+ " RXRTSN ELKTRNKS | \n",
+ " RXRTSN ELKTRNKS | \n",
+ " 6763 | \n",
+ " richardson electronics, ltd. | \n",
+ " 176423 | \n",
+ " richardson electronics (thailand) limited | \n",
+ "
\n",
+ " \n",
+ " 149 | \n",
+ " 6.816691 | \n",
+ " 0.991207 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 4875 | \n",
+ " 98755 | \n",
+ " api group corporation | \n",
+ " api group corporation | \n",
+ " 1 | \n",
+ " 2.492261e+06 | \n",
+ " d8 | \n",
+ " delaware | \n",
+ " 0 | \n",
+ " 0.000078 | \n",
+ " 0.372842 | \n",
+ " 0.551065 | \n",
+ " 1.0 | \n",
+ " AP KRP | \n",
+ " AP KRP | \n",
+ " 4875 | \n",
+ " api group corp | \n",
+ " 98755 | \n",
+ " api group corporation | \n",
+ "
\n",
+ " \n",
+ " 432 | \n",
+ " 6.816691 | \n",
+ " 0.991207 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 2310 | \n",
+ " 167475 | \n",
+ " thermon group holdings incorporated | \n",
+ " thermon group holdings incorporated | \n",
+ " 1 | \n",
+ " 2.492261e+06 | \n",
+ " delaware | \n",
+ " delaware, united states | \n",
+ " 0 | \n",
+ " 0.372842 | \n",
+ " 0.002139 | \n",
+ " 0.551065 | \n",
+ " 1.0 | \n",
+ " 0RMN KRP HLTNKS | \n",
+ " 0RMN KRP HLTNKS | \n",
+ " 2310 | \n",
+ " thermon group holdings, inc. | \n",
+ " 167475 | \n",
+ " thermon group holdings, inc | \n",
+ "
\n",
+ " \n",
+ " 156 | \n",
+ " 6.816691 | \n",
+ " 0.991207 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 6677 | \n",
+ " 118432 | \n",
+ " aon public limited company | \n",
+ " aon public limited company | \n",
+ " 1 | \n",
+ " 2.492261e+06 | \n",
+ " l2 | \n",
+ " ireland | \n",
+ " 0 | \n",
+ " 0.000111 | \n",
+ " 0.008315 | \n",
+ " 0.551065 | \n",
+ " 1.0 | \n",
+ " AN | \n",
+ " AN | \n",
+ " 6677 | \n",
+ " aon plc | \n",
+ " 118432 | \n",
+ " aon plc | \n",
+ "
\n",
+ " \n",
+ " 158 | \n",
+ " 6.816691 | \n",
+ " 0.991207 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 5955 | \n",
+ " 80272 | \n",
+ " minority equality opportunities acquisition in... | \n",
+ " minority equality opportunities acquisition in... | \n",
+ " 1 | \n",
+ " 2.492261e+06 | \n",
+ " delaware | \n",
+ " delaware, united states | \n",
+ " 0 | \n",
+ " 0.372842 | \n",
+ " 0.002139 | \n",
+ " 0.551065 | \n",
+ " 1.0 | \n",
+ " MNRT EKLT OPRTNTS AKKSXN | \n",
+ " MNRT EKLT OPRTNTS AKKSXN | \n",
+ " 5955 | \n",
+ " minority equality opportunities acquisition inc. | \n",
+ " 80272 | \n",
+ " minority equality opportunities acquisition inc | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " match_weight match_probability source_dataset_l source_dataset_r record_id_l record_id_r company_name_l company_name_r gamma_company_name bf_company_name loc_of_incorporation_l loc_of_incorporation_r gamma_loc_of_incorporation tf_loc_of_incorporation_l tf_loc_of_incorporation_r bf_loc_of_incorporation bf_tf_adj_loc_of_incorporation company_name_mphone_l company_name_mphone_r record_id_x company_name_sec record_id_y company_name_ex21\n",
+ "0 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 8180 159390 national instruments corporation national instruments corporation 1 2.492261e+06 delaware republic of korea 0 0.372842 0.000234 0.551065 1.0 NXNL INSTRMNTS NXNL INSTRMNTS 8180 national instruments corp 159390 national instruments (korea) corporation\n",
+ "176 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 6034 107265 afternext healthtech acquisition corporation afternext healthtech acquisition corporation 1 2.492261e+06 e9 cayman islands 0 0.001069 0.015387 0.551065 1.0 AFTRNKST HL0TX AKKSXN AFTRNKST HL0TX AKKSXN 6034 afternext healthtech acquisition corp. 107265 afternext healthtech acquisition corp\n",
+ "178 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 6799 117610 gap incorporated gap incorporated 1 2.492261e+06 delaware puerto rico 0 0.372842 0.001548 0.551065 1.0 KP KP 6799 gap inc 117610 gap (puerto rico), inc\n",
+ "183 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 5811 170135 rockley photonics holdings limited rockley photonics holdings limited 1 2.492261e+06 e9 cayman islands 0 0.001069 0.015387 0.551065 1.0 RKL FTNKS HLTNKS RKL FTNKS HLTNKS 5811 rockley photonics holdings ltd 170135 rockley photonics holdings limited\n",
+ "184 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 6799 117608 gap incorporated gap incorporated 1 2.492261e+06 delaware california 0 0.372842 0.015978 0.551065 1.0 KP KP 6799 gap inc 117608 gap (itm) inc\n",
+ "186 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 6799 117605 gap incorporated gap incorporated 1 2.492261e+06 delaware canada 0 0.372842 0.012191 0.551065 1.0 KP KP 6799 gap inc 117605 gap (canada) inc\n",
+ "412 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 1524 165843 aircastle limited aircastle limited 1 2.492261e+06 d0 ireland 0 0.000150 0.008315 0.551065 1.0 ARKSTL ARKSTL 1524 aircastle ltd 165843 aircastle (ireland) limited\n",
+ "189 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 6753 115383 arthur j gallagher and company arthur j gallagher and company 1 2.492261e+06 illinois delaware 0 0.006115 0.372842 0.551065 1.0 AR0R J KLKHR ANT AR0R J KLKHR ANT 6753 arthur j. gallagher & co. 115383 arthur j. gallagher & co\n",
+ "193 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 6651 110797 flowserve corporation flowserve corporation 1 2.492261e+06 new york mauritius 0 0.009913 0.001075 0.551065 1.0 FLSRF FLSRF 6651 flowserve corp 110797 flowserve (mauritius) corporation\n",
+ "406 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 578 24844 united parcel service incorporated united parcel service incorporated 1 2.492261e+06 delaware ohio 0 0.372842 0.008136 0.551065 1.0 UNTT PRSL SRFS UNTT PRSL SRFS 578 united parcel service inc 24844 united parcel service, inc\n",
+ "198 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 5812 171905 nextracker incorporated nextracker incorporated 1 2.492261e+06 delaware united states delaware 0 0.372842 0.002278 0.551065 1.0 NKSTRKR NKSTRKR 5812 nextracker inc. 171905 nextracker inc\n",
+ "199 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 5843 51850 sculptor acquisition corp i sculptor acquisition corp i 1 2.492261e+06 e9 cayman islands 0 0.001069 0.015387 0.551065 1.0 SKLPTR AKKSXN I SKLPTR AKKSXN I 5843 sculptor acquisition corp i 51850 sculptor acquisition corp i\n",
+ "174 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 7095 179994 cintas corporation cintas corporation 1 2.492261e+06 washington nevada 0 0.002996 0.014652 0.551065 1.0 SNTS SNTS 7095 cintas corp 179994 cintas corporation\n",
+ "405 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 285 12641 onespan incorporated onespan incorporated 1 2.492261e+06 delaware usa, state of delaware 0 0.372842 0.000011 0.551065 1.0 ONSPN ONSPN 285 onespan inc. 12641 onespan inc\n",
+ "207 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 6282 97173 mars acquisition corporation mars acquisition corporation 1 2.492261e+06 e9 delaware 0 0.001069 0.372842 0.551065 1.0 MRS AKKSXN MRS AKKSXN 6282 mars acquisition corp. 97173 mars acquisition corp\n",
+ "212 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 4834 97747 viatris incorporated viatris incorporated 1 2.492261e+06 delaware philippines 0 0.372842 0.001927 0.551065 1.0 FTRS FTRS 4834 viatris inc 97747 viatris, inc\n",
+ "397 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 1205 35911 turning point brands incorporated turning point brands incorporated 1 2.492261e+06 delaware ontario, canada 0 0.372842 0.000852 0.551065 1.0 TRNNK PNT BRNTS TRNNK PNT BRNTS 1205 turning point brands, inc. 35911 turning point brands (canada) inc\n",
+ "396 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 1171 35941 clearpoint neuro incorporated clearpoint neuro incorporated 1 2.492261e+06 delaware canada new brunswick 0 0.372842 0.000006 0.551065 1.0 KLRPNT NR KLRPNT NR 1171 clearpoint neuro, inc. 35941 clearpoint neuro (canada) inc\n",
+ "393 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 1765 51537 genpact limited genpact limited 1 2.492261e+06 d0 united kingdom 0 0.000150 0.031521 0.551065 1.0 JNPKT JNPKT 1765 genpact ltd 51537 genpact (uk) ltd\n",
+ "223 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 6181 106386 perimeter solutions sa perimeter solutions sa 1 2.492261e+06 n4 grand of luxembourg 0 0.000017 0.000011 0.551065 1.0 PRMTR SLXNS S PRMTR SLXNS S 6181 perimeter solutions, sa 106386 perimeter solutions sa\n",
+ "390 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 949 34324 ceva incorporated ceva incorporated 1 2.492261e+06 delaware cayman islands 0 0.372842 0.015387 0.551065 1.0 SF SF 949 ceva inc 34324 ceva inc\n",
+ "226 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 6825 123476 harte hanks incorporated harte hanks incorporated 1 2.492261e+06 delaware ohio 0 0.372842 0.008136 0.551065 1.0 HRT HNKS HRT HNKS 6825 harte hanks inc 123476 harte hanks, inc\n",
+ "228 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 234 6600 jones lang lasalle incorporated jones lang lasalle incorporated 1 2.492261e+06 maryland puerto rico 0 0.007786 0.001548 0.551065 1.0 JNS LNK LSL JNS LNK LSL 234 jones lang lasalle inc 6600 jones lang lasalle (puerto rico), inc\n",
+ "229 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 234 6596 jones lang lasalle incorporated jones lang lasalle incorporated 1 2.492261e+06 maryland philippines 0 0.007786 0.001927 0.551065 1.0 JNS LNK LSL JNS LNK LSL 234 jones lang lasalle inc 6596 jones lang lasalle (philippines), inc\n",
+ "231 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 2097 54939 optimizerx corporation optimizerx corporation 1 2.492261e+06 nevada michigan 0 0.014652 0.007151 0.551065 1.0 OPTMSRKS OPTMSRKS 2097 optimizerx corp 54939 optimizerx corporation\n",
+ "201 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 6176 166072 phoenix motor incorporated phoenix motor incorporated 1 2.492261e+06 delaware us 0 0.372842 0.000908 0.551065 1.0 FNKS MTR FNKS MTR 6176 phoenix motor inc. 166072 phoenix motor inc\n",
+ "232 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 2117 57288 transocean limited transocean limited 1 2.492261e+06 v8 switzerland 0 0.000033 0.006421 0.551065 1.0 TRNSSN TRNSSN 2117 transocean ltd. 57288 transocean ltd\n",
+ "421 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 1348 40725 lazard group limited liability company lazard group limited liability company 1 2.492261e+06 delaware us 0 0.372842 0.000908 0.551065 1.0 LSRT KRP LSRT KRP 1348 lazard group llc 40725 lazard group llc\n",
+ "169 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 6922 189462 analog devices incorporated analog devices incorporated 1 2.492261e+06 massachusetts united states 0 0.004466 0.012146 0.551065 1.0 ANLK TFSS ANLK TFSS 6922 analog devices inc 189462 analog devices, inc\n",
+ "115 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 2485 167379 ameriguard security services incorporated ameriguard security services incorporated 1 2.492261e+06 nevada california 0 0.014652 0.015978 0.551065 1.0 AMRKRT SKRT SRFSS AMRKRT SKRT SRFSS 2485 ameriguard security services, inc. 167379 ameriguard security services, inc\n",
+ "116 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 2486 167379 ameriguard security services incorporated ameriguard security services incorporated 1 2.492261e+06 nevada california 0 0.014652 0.015978 0.551065 1.0 AMRKRT SKRT SRFSS AMRKRT SKRT SRFSS 2486 ameriguard security services, inc. 167379 ameriguard security services, inc\n",
+ "120 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 4683 95837 advantage solutions incorporated advantage solutions incorporated 1 2.492261e+06 delaware canada 0 0.372842 0.012191 0.551065 1.0 ATFNTJ SLXNS ATFNTJ SLXNS 4683 advantage solutions inc. 95837 advantage solutions inc\n",
+ "445 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 926 165871 commvault systems incorporated commvault systems incorporated 1 2.492261e+06 delaware ontario, canada 0 0.372842 0.000852 0.551065 1.0 KMFLT SSTMS KMFLT SSTMS 926 commvault systems inc 165871 commvault systems (canada) inc\n",
+ "124 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 4148 90738 firstsun capital bancorp firstsun capital bancorp 1 2.492261e+06 delaware new mexico 0 0.372842 0.000652 0.551065 1.0 FRSTSN KPTL BNKRP FRSTSN KPTL BNKRP 4148 firstsun capital bancorp 90738 firstsun capital bancorp\n",
+ "126 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 5544 26048 taboola com limited taboola com limited 1 2.492261e+06 l3 israel 0 0.000061 0.003057 0.551065 1.0 TBL KM TBL KM 5544 taboola.com ltd. 26048 taboola.com ltd\n",
+ "443 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 2 96 henry schein incorporated henry schein incorporated 1 2.492261e+06 delaware pennsylvania 0 0.372842 0.007919 0.551065 1.0 HNR SXN HNR SXN 2 henry schein inc 96 henry schein (lancaster, pa) inc\n",
+ "132 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 6668 117995 tomi environmental solutions incorporated tomi environmental solutions incorporated 1 2.492261e+06 florida nevada 0 0.014691 0.014652 0.551065 1.0 TM ENFRNMNTL SLXNS TM ENFRNMNTL SLXNS 6668 tomi environmental solutions, inc. 117995 tomi environmental solutions, inc\n",
+ "136 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 6148 107455 esab corporation esab corporation 1 2.492261e+06 delaware united states 0 0.372842 0.012146 0.551065 1.0 ESB ESB 6148 esab corp 107455 esab corporation\n",
+ "137 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 6958 104521 apache corporation apache corporation 1 2.492261e+06 delaware new jersey 0 0.372842 0.006143 0.551065 1.0 APX APX 6958 apache corp 104521 apache corporation\n",
+ "138 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 7011 121758 ncr corporation ncr corporation 1 2.492261e+06 maryland new zealand 0 0.007786 0.002590 0.551065 1.0 NKR NKR 7011 ncr corp 121758 ncr (nz) corporation\n",
+ "423 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 77 165059 jakks pacific incorporated jakks pacific incorporated 1 2.492261e+06 delaware canada 0 0.372842 0.012191 0.551065 1.0 JKS PSFK JKS PSFK 77 jakks pacific inc 165059 jakks pacific (canada), inc\n",
+ "139 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 4902 170051 gan limited gan limited 1 2.492261e+06 d0 england and wales 0 0.000150 0.003536 0.551065 1.0 KN KN 4902 gan ltd 170051 gan (uk) limited\n",
+ "141 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 6613 108716 cts corporation cts corporation 1 2.492261e+06 indiana delaware 0 0.004060 0.372842 0.551065 1.0 KTS KTS 6613 cts corp 108716 cts corporation\n",
+ "437 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 738 29776 garmin limited garmin limited 1 2.492261e+06 v8 thailand 0 0.000033 0.002378 0.551065 1.0 KRMN KRMN 738 garmin ltd 29776 garmin (thailand) ltd\n",
+ "435 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 277 9849 c h robinson worldwide incorporated c h robinson worldwide incorporated 1 2.492261e+06 delaware united states 0 0.372842 0.012146 0.551065 1.0 K H RBNSN WRLTWT K H RBNSN WRLTWT 277 c. h. robinson worldwide, inc. 9849 c.h. robinson worldwide, inc\n",
+ "146 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 6763 176423 richardson electronics limited richardson electronics limited 1 2.492261e+06 delaware thailand 0 0.372842 0.002378 0.551065 1.0 RXRTSN ELKTRNKS RXRTSN ELKTRNKS 6763 richardson electronics, ltd. 176423 richardson electronics (thailand) limited\n",
+ "149 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 4875 98755 api group corporation api group corporation 1 2.492261e+06 d8 delaware 0 0.000078 0.372842 0.551065 1.0 AP KRP AP KRP 4875 api group corp 98755 api group corporation\n",
+ "432 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 2310 167475 thermon group holdings incorporated thermon group holdings incorporated 1 2.492261e+06 delaware delaware, united states 0 0.372842 0.002139 0.551065 1.0 0RMN KRP HLTNKS 0RMN KRP HLTNKS 2310 thermon group holdings, inc. 167475 thermon group holdings, inc\n",
+ "156 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 6677 118432 aon public limited company aon public limited company 1 2.492261e+06 l2 ireland 0 0.000111 0.008315 0.551065 1.0 AN AN 6677 aon plc 118432 aon plc\n",
+ "158 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 5955 80272 minority equality opportunities acquisition in... minority equality opportunities acquisition in... 1 2.492261e+06 delaware delaware, united states 0 0.372842 0.002139 0.551065 1.0 MNRT EKLT OPRTNTS AKKSXN MNRT EKLT OPRTNTS AKKSXN 5955 minority equality opportunities acquisition inc. 80272 minority equality opportunities acquisition inc"
+ ]
+ },
+ "execution_count": 133,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "preds_df.sort_values(by=\"match_probability\").iloc[0:50]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 109,
+ "id": "59cc74aa-674b-4c89-95d6-181d0f7c162a",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " match_weight | \n",
+ " match_probability | \n",
+ " source_dataset_l | \n",
+ " source_dataset_r | \n",
+ " record_id_l | \n",
+ " record_id_r | \n",
+ " company_name_l | \n",
+ " company_name_r | \n",
+ " gamma_company_name | \n",
+ " bf_company_name | \n",
+ " loc_of_incorporation_l | \n",
+ " loc_of_incorporation_r | \n",
+ " gamma_loc_of_incorporation | \n",
+ " tf_loc_of_incorporation_l | \n",
+ " tf_loc_of_incorporation_r | \n",
+ " bf_loc_of_incorporation | \n",
+ " bf_tf_adj_loc_of_incorporation | \n",
+ " company_name_mphone_l | \n",
+ " company_name_mphone_r | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 6.339909 | \n",
+ " 0.987805 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 8180 | \n",
+ " 159390 | \n",
+ " national instruments corporation | \n",
+ " national instruments corporation | \n",
+ " 2 | \n",
+ " 1.774257e+06 | \n",
+ " delaware | \n",
+ " republic of korea | \n",
+ " 0 | \n",
+ " 0.372842 | \n",
+ " 0.000234 | \n",
+ " 0.556230 | \n",
+ " 1.000000 | \n",
+ " NXNL INSTRMNTS | \n",
+ " NXNL INSTRMNTS | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 6.339909 | \n",
+ " 0.987805 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 7912 | \n",
+ " 154757 | \n",
+ " enbridge incorporated | \n",
+ " enbridge incorporated | \n",
+ " 2 | \n",
+ " 1.774257e+06 | \n",
+ " a0 | \n",
+ " alberta | \n",
+ " 0 | \n",
+ " 0.000033 | \n",
+ " 0.000880 | \n",
+ " 0.556230 | \n",
+ " 1.000000 | \n",
+ " ENBRJ | \n",
+ " ENBRJ | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 6.339909 | \n",
+ " 0.987805 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 7557 | \n",
+ " 140921 | \n",
+ " spectrum pharmaceuticals incorporated | \n",
+ " spectrum pharmaceuticals incorporated | \n",
+ " 2 | \n",
+ " 1.774257e+06 | \n",
+ " delaware | \n",
+ " cayman islands | \n",
+ " 0 | \n",
+ " 0.372842 | \n",
+ " 0.015387 | \n",
+ " 0.556230 | \n",
+ " 1.000000 | \n",
+ " SPKTRM FRMSTKLS | \n",
+ " SPKTRM FRMSTKLS | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 7.717639 | \n",
+ " 0.995272 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 8057 | \n",
+ " 152329 | \n",
+ " american eagle outfitters incorporated | \n",
+ " american eagle outfitters incorporated | \n",
+ " 2 | \n",
+ " 1.774257e+06 | \n",
+ " delaware | \n",
+ " delaware | \n",
+ " 2 | \n",
+ " 0.372842 | \n",
+ " 0.372842 | \n",
+ " 2.487467 | \n",
+ " 0.581079 | \n",
+ " AMRKN EKL OTFTRS | \n",
+ " AMRKN EKL OTFTRS | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 14.126362 | \n",
+ " 0.999944 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 7315 | \n",
+ " 28974 | \n",
+ " pruco life insurance company | \n",
+ " pruco life insurance company | \n",
+ " 2 | \n",
+ " 1.774257e+06 | \n",
+ " arizona | \n",
+ " arizona | \n",
+ " 2 | \n",
+ " 0.004388 | \n",
+ " 0.004388 | \n",
+ " 2.487467 | \n",
+ " 49.368830 | \n",
+ " PRK LF INSRNS | \n",
+ " PRK LF INSRNS | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " 7.186156 | \n",
+ " 0.993180 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 7419 | \n",
+ " 142779 | \n",
+ " national presto industries incorporated | \n",
+ " national presto industries incorporated | \n",
+ " 2 | \n",
+ " 1.774257e+06 | \n",
+ " wisconsin | \n",
+ " None | \n",
+ " -1 | \n",
+ " 0.004110 | \n",
+ " NaN | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ " NXNL PRST INTSTRS | \n",
+ " NXNL PRST INTSTRS | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " 6.339909 | \n",
+ " 0.987805 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 7387 | \n",
+ " 142016 | \n",
+ " national bankshares incorporated | \n",
+ " national bankshares incorporated | \n",
+ " 2 | \n",
+ " 1.774257e+06 | \n",
+ " virginia | \n",
+ " commonwealth virginia | \n",
+ " 0 | \n",
+ " 0.006276 | \n",
+ " 0.000022 | \n",
+ " 0.556230 | \n",
+ " 1.000000 | \n",
+ " NXNL BNKXRS | \n",
+ " NXNL BNKXRS | \n",
+ "
\n",
+ " \n",
+ " 7 | \n",
+ " 13.610142 | \n",
+ " 0.999920 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 7387 | \n",
+ " 127697 | \n",
+ " national bankshares incorporated | \n",
+ " national bankshares incorporated | \n",
+ " 2 | \n",
+ " 1.774257e+06 | \n",
+ " virginia | \n",
+ " virginia | \n",
+ " 2 | \n",
+ " 0.006276 | \n",
+ " 0.006276 | \n",
+ " 2.487467 | \n",
+ " 34.518756 | \n",
+ " NXNL BNKXRS | \n",
+ " NXNL BNKXRS | \n",
+ "
\n",
+ " \n",
+ " 8 | \n",
+ " 7.717639 | \n",
+ " 0.995272 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 8258 | \n",
+ " 162906 | \n",
+ " thermo fisher scientific incorporated | \n",
+ " thermo fisher scientific incorporated | \n",
+ " 2 | \n",
+ " 1.774257e+06 | \n",
+ " delaware | \n",
+ " delaware | \n",
+ " 2 | \n",
+ " 0.372842 | \n",
+ " 0.372842 | \n",
+ " 2.487467 | \n",
+ " 0.581079 | \n",
+ " 0RM FXR SSNTFK | \n",
+ " 0RM FXR SSNTFK | \n",
+ "
\n",
+ " \n",
+ " 9 | \n",
+ " 12.101855 | \n",
+ " 0.999773 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 7428 | \n",
+ " 60197 | \n",
+ " general motors financial company incorporated | \n",
+ " general motors financial company incorporated | \n",
+ " 2 | \n",
+ " 1.774257e+06 | \n",
+ " texas | \n",
+ " texas | \n",
+ " 2 | \n",
+ " 0.017854 | \n",
+ " 0.017854 | \n",
+ " 2.487467 | \n",
+ " 12.134323 | \n",
+ " JNRL MTRS FNNXL | \n",
+ " JNRL MTRS FNNXL | \n",
+ "
\n",
+ " \n",
+ " 10 | \n",
+ " 6.339909 | \n",
+ " 0.987805 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 8258 | \n",
+ " 163501 | \n",
+ " thermo fisher scientific incorporated | \n",
+ " thermo fisher scientific incorporated | \n",
+ " 2 | \n",
+ " 1.774257e+06 | \n",
+ " delaware | \n",
+ " mexico | \n",
+ " 0 | \n",
+ " 0.372842 | \n",
+ " 0.011205 | \n",
+ " 0.556230 | \n",
+ " 1.000000 | \n",
+ " 0RM FXR SSNTFK | \n",
+ " 0RM FXR SSNTFK | \n",
+ "
\n",
+ " \n",
+ " 11 | \n",
+ " 6.339909 | \n",
+ " 0.987805 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 5498 | \n",
+ " 52885 | \n",
+ " apollo strategic growth capital ii | \n",
+ " apollo strategic growth capital ii | \n",
+ " 2 | \n",
+ " 1.774257e+06 | \n",
+ " e9 | \n",
+ " cayman islands | \n",
+ " 0 | \n",
+ " 0.001069 | \n",
+ " 0.015387 | \n",
+ " 0.556230 | \n",
+ " 1.000000 | \n",
+ " APL STRTJK KR0 KPTL | \n",
+ " APL STRTJK KR0 KPTL | \n",
+ "
\n",
+ " \n",
+ " 12 | \n",
+ " 6.339909 | \n",
+ " 0.987805 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 8258 | \n",
+ " 162892 | \n",
+ " thermo fisher scientific incorporated | \n",
+ " thermo fisher scientific incorporated | \n",
+ " 2 | \n",
+ " 1.774257e+06 | \n",
+ " delaware | \n",
+ " canada | \n",
+ " 0 | \n",
+ " 0.372842 | \n",
+ " 0.012191 | \n",
+ " 0.556230 | \n",
+ " 1.000000 | \n",
+ " 0RM FXR SSNTFK | \n",
+ " 0RM FXR SSNTFK | \n",
+ "
\n",
+ " \n",
+ " 13 | \n",
+ " 6.339909 | \n",
+ " 0.987805 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 8258 | \n",
+ " 162847 | \n",
+ " thermo fisher scientific incorporated | \n",
+ " thermo fisher scientific incorporated | \n",
+ " 2 | \n",
+ " 1.774257e+06 | \n",
+ " delaware | \n",
+ " russia | \n",
+ " 0 | \n",
+ " 0.372842 | \n",
+ " 0.001108 | \n",
+ " 0.556230 | \n",
+ " 1.000000 | \n",
+ " 0RM FXR SSNTFK | \n",
+ " 0RM FXR SSNTFK | \n",
+ "
\n",
+ " \n",
+ " 14 | \n",
+ " 6.339909 | \n",
+ " 0.987805 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 498 | \n",
+ " 18301 | \n",
+ " intellinetics incorporated | \n",
+ " intellinetics incorporated | \n",
+ " 2 | \n",
+ " 1.774257e+06 | \n",
+ " nevada | \n",
+ " ohio | \n",
+ " 0 | \n",
+ " 0.014652 | \n",
+ " 0.008136 | \n",
+ " 0.556230 | \n",
+ " 1.000000 | \n",
+ " INTLNTKS | \n",
+ " INTLNTKS | \n",
+ "
\n",
+ " \n",
+ " 15 | \n",
+ " 6.339909 | \n",
+ " 0.987805 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 1533 | \n",
+ " 165897 | \n",
+ " high sierra technologies incorporated | \n",
+ " high sierra technologies incorporated | \n",
+ " 2 | \n",
+ " 1.774257e+06 | \n",
+ " colorado | \n",
+ " nevada | \n",
+ " 0 | \n",
+ " 0.004817 | \n",
+ " 0.014652 | \n",
+ " 0.556230 | \n",
+ " 1.000000 | \n",
+ " H SR TXNLJS | \n",
+ " H SR TXNLJS | \n",
+ "
\n",
+ " \n",
+ " 16 | \n",
+ " 13.991858 | \n",
+ " 0.999939 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 2127 | \n",
+ " 61213 | \n",
+ " lnpr group incorporated | \n",
+ " lnpr group incorporated | \n",
+ " 2 | \n",
+ " 1.774257e+06 | \n",
+ " colorado | \n",
+ " colorado | \n",
+ " 2 | \n",
+ " 0.004817 | \n",
+ " 0.004817 | \n",
+ " 2.487467 | \n",
+ " 44.974148 | \n",
+ " LNPR KRP | \n",
+ " LNPR KRP | \n",
+ "
\n",
+ " \n",
+ " 17 | \n",
+ " 7.186156 | \n",
+ " 0.993180 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 93 | \n",
+ " 1969 | \n",
+ " norwood financial corporation | \n",
+ " norwood financial corporation | \n",
+ " 2 | \n",
+ " 1.774257e+06 | \n",
+ " pennsylvania | \n",
+ " None | \n",
+ " -1 | \n",
+ " 0.007919 | \n",
+ " NaN | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ " NRWT FNNXL | \n",
+ " NRWT FNNXL | \n",
+ "
\n",
+ " \n",
+ " 18 | \n",
+ " 6.339909 | \n",
+ " 0.987805 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 151 | \n",
+ " 2257 | \n",
+ " nov incorporated | \n",
+ " nov incorporated | \n",
+ " 2 | \n",
+ " 1.774257e+06 | \n",
+ " delaware | \n",
+ " mauritius | \n",
+ " 0 | \n",
+ " 0.372842 | \n",
+ " 0.001075 | \n",
+ " 0.556230 | \n",
+ " 1.000000 | \n",
+ " NF | \n",
+ " NF | \n",
+ "
\n",
+ " \n",
+ " 19 | \n",
+ " 6.339909 | \n",
+ " 0.987805 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 280 | \n",
+ " 10975 | \n",
+ " juniper networks incorporated | \n",
+ " juniper networks incorporated | \n",
+ " 2 | \n",
+ " 1.774257e+06 | \n",
+ " delaware | \n",
+ " california, usa | \n",
+ " 0 | \n",
+ " 0.372842 | \n",
+ " 0.000234 | \n",
+ " 0.556230 | \n",
+ " 1.000000 | \n",
+ " JNPR NTWRKS | \n",
+ " JNPR NTWRKS | \n",
+ "
\n",
+ " \n",
+ " 20 | \n",
+ " 3.252392 | \n",
+ " 0.905028 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 1399 | \n",
+ " 157790 | \n",
+ " logiq incorporated | \n",
+ " logiq3 incorporated | \n",
+ " 1 | \n",
+ " 2.087284e+05 | \n",
+ " delaware | \n",
+ " canada | \n",
+ " 0 | \n",
+ " 0.372842 | \n",
+ " 0.012191 | \n",
+ " 0.556230 | \n",
+ " 1.000000 | \n",
+ " LJK | \n",
+ " LJK | \n",
+ "
\n",
+ " \n",
+ " 21 | \n",
+ " 7.717639 | \n",
+ " 0.995272 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 1720 | \n",
+ " 166283 | \n",
+ " edgio incorporated | \n",
+ " edgio incorporated | \n",
+ " 2 | \n",
+ " 1.774257e+06 | \n",
+ " delaware | \n",
+ " delaware | \n",
+ " 2 | \n",
+ " 0.372842 | \n",
+ " 0.372842 | \n",
+ " 2.487467 | \n",
+ " 0.581079 | \n",
+ " EJ | \n",
+ " EJ | \n",
+ "
\n",
+ " \n",
+ " 22 | \n",
+ " 6.339909 | \n",
+ " 0.987805 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 2020 | \n",
+ " 184709 | \n",
+ " arem pacific corporation | \n",
+ " arem pacific corporation | \n",
+ " 2 | \n",
+ " 1.774257e+06 | \n",
+ " delaware | \n",
+ " arizona | \n",
+ " 0 | \n",
+ " 0.372842 | \n",
+ " 0.004388 | \n",
+ " 0.556230 | \n",
+ " 1.000000 | \n",
+ " ARM PSFK | \n",
+ " ARM PSFK | \n",
+ "
\n",
+ " \n",
+ " 23 | \n",
+ " 7.186156 | \n",
+ " 0.993180 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 756 | \n",
+ " 26596 | \n",
+ " ensign group incorporated | \n",
+ " ensign group incorporated | \n",
+ " 2 | \n",
+ " 1.774257e+06 | \n",
+ " None | \n",
+ " nevada | \n",
+ " -1 | \n",
+ " NaN | \n",
+ " 0.014652 | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ " ENSKN KRP | \n",
+ " ENSKN KRP | \n",
+ "
\n",
+ " \n",
+ " 24 | \n",
+ " 7.186156 | \n",
+ " 0.993180 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 1104 | \n",
+ " 24668 | \n",
+ " cco holdings limited liability company | \n",
+ " cco holdings limited liability company | \n",
+ " 2 | \n",
+ " 1.774257e+06 | \n",
+ " None | \n",
+ " delaware | \n",
+ " -1 | \n",
+ " NaN | \n",
+ " 0.372842 | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ " KK HLTNKS | \n",
+ " KK HLTNKS | \n",
+ "
\n",
+ " \n",
+ " 25 | \n",
+ " 7.717639 | \n",
+ " 0.995272 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 321 | \n",
+ " 11011 | \n",
+ " pc connection incorporated | \n",
+ " pc connection incorporated | \n",
+ " 2 | \n",
+ " 1.774257e+06 | \n",
+ " delaware | \n",
+ " delaware | \n",
+ " 2 | \n",
+ " 0.372842 | \n",
+ " 0.372842 | \n",
+ " 2.487467 | \n",
+ " 0.581079 | \n",
+ " KNKXN | \n",
+ " KNKXN | \n",
+ "
\n",
+ " \n",
+ " 26 | \n",
+ " 6.339909 | \n",
+ " 0.987805 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 477 | \n",
+ " 14483 | \n",
+ " polarityte incorporated | \n",
+ " polarityte incorporated | \n",
+ " 2 | \n",
+ " 1.774257e+06 | \n",
+ " delaware | \n",
+ " nevada | \n",
+ " 0 | \n",
+ " 0.372842 | \n",
+ " 0.014652 | \n",
+ " 0.556230 | \n",
+ " 1.000000 | \n",
+ " PLRTT | \n",
+ " PLRTT | \n",
+ "
\n",
+ " \n",
+ " 27 | \n",
+ " 7.717639 | \n",
+ " 0.995272 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 810 | \n",
+ " 25991 | \n",
+ " atlas air worldwide holdings incorporated | \n",
+ " atlas air worldwide holdings incorporated | \n",
+ " 2 | \n",
+ " 1.774257e+06 | \n",
+ " delaware | \n",
+ " delaware | \n",
+ " 2 | \n",
+ " 0.372842 | \n",
+ " 0.372842 | \n",
+ " 2.487467 | \n",
+ " 0.581079 | \n",
+ " ATLS AR WRLTWT HLTNKS | \n",
+ " ATLS AR WRLTWT HLTNKS | \n",
+ "
\n",
+ " \n",
+ " 28 | \n",
+ " 6.339909 | \n",
+ " 0.987805 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 1003 | \n",
+ " 166010 | \n",
+ " spi energy co limited | \n",
+ " spi energy co limited | \n",
+ " 2 | \n",
+ " 1.774257e+06 | \n",
+ " e9 | \n",
+ " cayman | \n",
+ " 0 | \n",
+ " 0.001069 | \n",
+ " 0.000345 | \n",
+ " 0.556230 | \n",
+ " 1.000000 | \n",
+ " SP ENRJ | \n",
+ " SP ENRJ | \n",
+ "
\n",
+ " \n",
+ " 29 | \n",
+ " 7.717639 | \n",
+ " 0.995272 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 1012 | \n",
+ " 165926 | \n",
+ " bimi international medical incorporated | \n",
+ " bimi international medical incorporated | \n",
+ " 2 | \n",
+ " 1.774257e+06 | \n",
+ " delaware | \n",
+ " delaware | \n",
+ " 2 | \n",
+ " 0.372842 | \n",
+ " 0.372842 | \n",
+ " 2.487467 | \n",
+ " 0.581079 | \n",
+ " BM INTRNXNL MTKL | \n",
+ " BM INTRNXNL MTKL | \n",
+ "
\n",
+ " \n",
+ " 30 | \n",
+ " 7.186156 | \n",
+ " 0.993180 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 1868 | \n",
+ " 51876 | \n",
+ " phreesia incorporated | \n",
+ " phreesia incorporated | \n",
+ " 2 | \n",
+ " 1.774257e+06 | \n",
+ " delaware | \n",
+ " None | \n",
+ " -1 | \n",
+ " 0.372842 | \n",
+ " NaN | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ " FRX | \n",
+ " FRX | \n",
+ "
\n",
+ " \n",
+ " 31 | \n",
+ " 6.339909 | \n",
+ " 0.987805 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 2198 | \n",
+ " 78290 | \n",
+ " secureworks corporation | \n",
+ " secureworks corporation | \n",
+ " 2 | \n",
+ " 1.774257e+06 | \n",
+ " delaware | \n",
+ " united states | \n",
+ " 0 | \n",
+ " 0.372842 | \n",
+ " 0.012146 | \n",
+ " 0.556230 | \n",
+ " 1.000000 | \n",
+ " SKRWRKS | \n",
+ " SKRWRKS | \n",
+ "
\n",
+ " \n",
+ " 32 | \n",
+ " 7.717639 | \n",
+ " 0.995272 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 2273 | \n",
+ " 58771 | \n",
+ " ryerson holding corporation | \n",
+ " ryerson holding corporation | \n",
+ " 2 | \n",
+ " 1.774257e+06 | \n",
+ " delaware | \n",
+ " delaware | \n",
+ " 2 | \n",
+ " 0.372842 | \n",
+ " 0.372842 | \n",
+ " 2.487467 | \n",
+ " 0.581079 | \n",
+ " RYRSN HLTNK | \n",
+ " RYRSN HLTNK | \n",
+ "
\n",
+ " \n",
+ " 33 | \n",
+ " 7.186156 | \n",
+ " 0.993180 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 221 | \n",
+ " 9106 | \n",
+ " comfort systems usa incorporated | \n",
+ " comfort systems usa incorporated | \n",
+ " 2 | \n",
+ " 1.774257e+06 | \n",
+ " None | \n",
+ " arkansas | \n",
+ " -1 | \n",
+ " NaN | \n",
+ " 0.001253 | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ " KMFRT SSTMS US | \n",
+ " KMFRT SSTMS US | \n",
+ "
\n",
+ " \n",
+ " 34 | \n",
+ " 14.351809 | \n",
+ " 0.999952 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 478 | \n",
+ " 180383 | \n",
+ " winnebago industries incorporated | \n",
+ " winnebago industries incorporated | \n",
+ " 2 | \n",
+ " 1.774257e+06 | \n",
+ " minnesota | \n",
+ " minnesota | \n",
+ " 2 | \n",
+ " 0.003754 | \n",
+ " 0.003754 | \n",
+ " 2.487467 | \n",
+ " 57.719048 | \n",
+ " WNBK INTSTRS | \n",
+ " WNBK INTSTRS | \n",
+ "
\n",
+ " \n",
+ " 35 | \n",
+ " 6.339909 | \n",
+ " 0.987805 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 1913 | \n",
+ " 166068 | \n",
+ " renewable energy acquisition corporation | \n",
+ " renewable energy acquisition corporation | \n",
+ " 2 | \n",
+ " 1.774257e+06 | \n",
+ " nevada | \n",
+ " us | \n",
+ " 0 | \n",
+ " 0.014652 | \n",
+ " 0.000908 | \n",
+ " 0.556230 | \n",
+ " 1.000000 | \n",
+ " RNWBL ENRJ AKKSXN | \n",
+ " RNWBL ENRJ AKKSXN | \n",
+ "
\n",
+ " \n",
+ " 36 | \n",
+ " 7.186156 | \n",
+ " 0.993180 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 257 | \n",
+ " 164606 | \n",
+ " riverview bancorp incorporated | \n",
+ " riverview bancorp incorporated | \n",
+ " 2 | \n",
+ " 1.774257e+06 | \n",
+ " washington | \n",
+ " None | \n",
+ " -1 | \n",
+ " 0.002996 | \n",
+ " NaN | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ " RFRF BNKRP | \n",
+ " RFRF BNKRP | \n",
+ "
\n",
+ " \n",
+ " 37 | \n",
+ " 7.186156 | \n",
+ " 0.993180 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 294 | \n",
+ " 182945 | \n",
+ " timberland bancorp incorporated | \n",
+ " timberland bancorp incorporated | \n",
+ " 2 | \n",
+ " 1.774257e+06 | \n",
+ " washington | \n",
+ " None | \n",
+ " -1 | \n",
+ " 0.002996 | \n",
+ " NaN | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ " TMBRLNT BNKRP | \n",
+ " TMBRLNT BNKRP | \n",
+ "
\n",
+ " \n",
+ " 38 | \n",
+ " 7.717639 | \n",
+ " 0.995272 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 415 | \n",
+ " 18543 | \n",
+ " lkq corporation | \n",
+ " lkq corporation | \n",
+ " 2 | \n",
+ " 1.774257e+06 | \n",
+ " delaware | \n",
+ " delaware | \n",
+ " 2 | \n",
+ " 0.372842 | \n",
+ " 0.372842 | \n",
+ " 2.487467 | \n",
+ " 0.581079 | \n",
+ " LKK | \n",
+ " LKK | \n",
+ "
\n",
+ " \n",
+ " 39 | \n",
+ " 7.717639 | \n",
+ " 0.995272 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 674 | \n",
+ " 23252 | \n",
+ " berkshire hills bancorp incorporated | \n",
+ " berkshire hills bancorp incorporated | \n",
+ " 2 | \n",
+ " 1.774257e+06 | \n",
+ " delaware | \n",
+ " delaware | \n",
+ " 2 | \n",
+ " 0.372842 | \n",
+ " 0.372842 | \n",
+ " 2.487467 | \n",
+ " 0.581079 | \n",
+ " BRKXR HLS BNKRP | \n",
+ " BRKXR HLS BNKRP | \n",
+ "
\n",
+ " \n",
+ " 40 | \n",
+ " 6.339909 | \n",
+ " 0.987805 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 1270 | \n",
+ " 181001 | \n",
+ " dolby laboratories incorporated | \n",
+ " dolby laboratories incorporated | \n",
+ " 2 | \n",
+ " 1.774257e+06 | \n",
+ " delaware | \n",
+ " california | \n",
+ " 0 | \n",
+ " 0.372842 | \n",
+ " 0.015978 | \n",
+ " 0.556230 | \n",
+ " 1.000000 | \n",
+ " TLB LBRTRS | \n",
+ " TLB LBRTRS | \n",
+ "
\n",
+ " \n",
+ " 41 | \n",
+ " 3.252392 | \n",
+ " 0.905028 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 1321 | \n",
+ " 132984 | \n",
+ " tss incorporated | \n",
+ " dss incorporated | \n",
+ " 1 | \n",
+ " 2.087284e+05 | \n",
+ " delaware | \n",
+ " new york | \n",
+ " 0 | \n",
+ " 0.372842 | \n",
+ " 0.009913 | \n",
+ " 0.556230 | \n",
+ " 1.000000 | \n",
+ " TS | \n",
+ " TS | \n",
+ "
\n",
+ " \n",
+ " 42 | \n",
+ " 7.717639 | \n",
+ " 0.995272 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 1482 | \n",
+ " 46045 | \n",
+ " anywhere real estate incorporated | \n",
+ " anywhere real estate incorporated | \n",
+ " 2 | \n",
+ " 1.774257e+06 | \n",
+ " delaware | \n",
+ " delaware | \n",
+ " 2 | \n",
+ " 0.372842 | \n",
+ " 0.372842 | \n",
+ " 2.487467 | \n",
+ " 0.581079 | \n",
+ " ANHR RL ESTT | \n",
+ " ANHR RL ESTT | \n",
+ "
\n",
+ " \n",
+ " 43 | \n",
+ " 6.339909 | \n",
+ " 0.987805 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 1494 | \n",
+ " 47625 | \n",
+ " kbr incorporated | \n",
+ " kbr incorporated | \n",
+ " 2 | \n",
+ " 1.774257e+06 | \n",
+ " delaware | \n",
+ " united states | \n",
+ " 0 | \n",
+ " 0.372842 | \n",
+ " 0.012146 | \n",
+ " 0.556230 | \n",
+ " 1.000000 | \n",
+ " KBR | \n",
+ " KBR | \n",
+ "
\n",
+ " \n",
+ " 44 | \n",
+ " 7.717639 | \n",
+ " 0.995272 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 1972 | \n",
+ " 166348 | \n",
+ " reshape lifesciences incorporated | \n",
+ " reshape lifesciences incorporated | \n",
+ " 2 | \n",
+ " 1.774257e+06 | \n",
+ " delaware | \n",
+ " delaware | \n",
+ " 2 | \n",
+ " 0.372842 | \n",
+ " 0.372842 | \n",
+ " 2.487467 | \n",
+ " 0.581079 | \n",
+ " RXP LFSSNSS | \n",
+ " RXP LFSSNSS | \n",
+ "
\n",
+ " \n",
+ " 45 | \n",
+ " 12.387018 | \n",
+ " 0.999813 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 1457 | \n",
+ " 172081 | \n",
+ " imperalis holding corporation | \n",
+ " imperalis holding corporation | \n",
+ " 2 | \n",
+ " 1.774257e+06 | \n",
+ " nevada | \n",
+ " nevada | \n",
+ " 2 | \n",
+ " 0.014652 | \n",
+ " 0.014652 | \n",
+ " 2.487467 | \n",
+ " 14.786255 | \n",
+ " IMPRLS HLTNK | \n",
+ " IMPRLS HLTNK | \n",
+ "
\n",
+ " \n",
+ " 46 | \n",
+ " 12.387018 | \n",
+ " 0.999813 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 2037 | \n",
+ " 172091 | \n",
+ " bitnile metaverse incorporated | \n",
+ " bitnile metaverse incorporated | \n",
+ " 2 | \n",
+ " 1.774257e+06 | \n",
+ " nevada | \n",
+ " nevada | \n",
+ " 2 | \n",
+ " 0.014652 | \n",
+ " 0.014652 | \n",
+ " 2.487467 | \n",
+ " 14.786255 | \n",
+ " BTNL MTFRS | \n",
+ " BTNL MTFRS | \n",
+ "
\n",
+ " \n",
+ " 47 | \n",
+ " 7.717639 | \n",
+ " 0.995272 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 1058 | \n",
+ " 35808 | \n",
+ " qvc incorporated | \n",
+ " qvc incorporated | \n",
+ " 2 | \n",
+ " 1.774257e+06 | \n",
+ " delaware | \n",
+ " delaware | \n",
+ " 2 | \n",
+ " 0.372842 | \n",
+ " 0.372842 | \n",
+ " 2.487467 | \n",
+ " 0.581079 | \n",
+ " KFK | \n",
+ " KFK | \n",
+ "
\n",
+ " \n",
+ " 48 | \n",
+ " 9.692877 | \n",
+ " 0.998793 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 1705 | \n",
+ " 47703 | \n",
+ " irhythm technologies incorporated | \n",
+ " irhythm technologies incorporated | \n",
+ " 2 | \n",
+ " 1.774257e+06 | \n",
+ " delaware | \n",
+ " us delaware | \n",
+ " 1 | \n",
+ " 0.372842 | \n",
+ " 0.000323 | \n",
+ " 5.683268 | \n",
+ " 1.000000 | \n",
+ " IRH0M TXNLJS | \n",
+ " IRH0M TXNLJS | \n",
+ "
\n",
+ " \n",
+ " 49 | \n",
+ " 7.186156 | \n",
+ " 0.993180 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 338 | \n",
+ " 13985 | \n",
+ " essex property trust incorporated | \n",
+ " essex property trust incorporated | \n",
+ " 2 | \n",
+ " 1.774257e+06 | \n",
+ " maryland | \n",
+ " None | \n",
+ " -1 | \n",
+ " 0.007786 | \n",
+ " NaN | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ " ESKS PRPRT TRST | \n",
+ " ESKS PRPRT TRST | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " match_weight match_probability source_dataset_l source_dataset_r record_id_l record_id_r company_name_l company_name_r gamma_company_name bf_company_name loc_of_incorporation_l loc_of_incorporation_r gamma_loc_of_incorporation tf_loc_of_incorporation_l tf_loc_of_incorporation_r bf_loc_of_incorporation bf_tf_adj_loc_of_incorporation company_name_mphone_l company_name_mphone_r\n",
+ "0 6.339909 0.987805 __splink__input_table_0 __splink__input_table_1 8180 159390 national instruments corporation national instruments corporation 2 1.774257e+06 delaware republic of korea 0 0.372842 0.000234 0.556230 1.000000 NXNL INSTRMNTS NXNL INSTRMNTS\n",
+ "1 6.339909 0.987805 __splink__input_table_0 __splink__input_table_1 7912 154757 enbridge incorporated enbridge incorporated 2 1.774257e+06 a0 alberta 0 0.000033 0.000880 0.556230 1.000000 ENBRJ ENBRJ\n",
+ "2 6.339909 0.987805 __splink__input_table_0 __splink__input_table_1 7557 140921 spectrum pharmaceuticals incorporated spectrum pharmaceuticals incorporated 2 1.774257e+06 delaware cayman islands 0 0.372842 0.015387 0.556230 1.000000 SPKTRM FRMSTKLS SPKTRM FRMSTKLS\n",
+ "3 7.717639 0.995272 __splink__input_table_0 __splink__input_table_1 8057 152329 american eagle outfitters incorporated american eagle outfitters incorporated 2 1.774257e+06 delaware delaware 2 0.372842 0.372842 2.487467 0.581079 AMRKN EKL OTFTRS AMRKN EKL OTFTRS\n",
+ "4 14.126362 0.999944 __splink__input_table_0 __splink__input_table_1 7315 28974 pruco life insurance company pruco life insurance company 2 1.774257e+06 arizona arizona 2 0.004388 0.004388 2.487467 49.368830 PRK LF INSRNS PRK LF INSRNS\n",
+ "5 7.186156 0.993180 __splink__input_table_0 __splink__input_table_1 7419 142779 national presto industries incorporated national presto industries incorporated 2 1.774257e+06 wisconsin None -1 0.004110 NaN 1.000000 1.000000 NXNL PRST INTSTRS NXNL PRST INTSTRS\n",
+ "6 6.339909 0.987805 __splink__input_table_0 __splink__input_table_1 7387 142016 national bankshares incorporated national bankshares incorporated 2 1.774257e+06 virginia commonwealth virginia 0 0.006276 0.000022 0.556230 1.000000 NXNL BNKXRS NXNL BNKXRS\n",
+ "7 13.610142 0.999920 __splink__input_table_0 __splink__input_table_1 7387 127697 national bankshares incorporated national bankshares incorporated 2 1.774257e+06 virginia virginia 2 0.006276 0.006276 2.487467 34.518756 NXNL BNKXRS NXNL BNKXRS\n",
+ "8 7.717639 0.995272 __splink__input_table_0 __splink__input_table_1 8258 162906 thermo fisher scientific incorporated thermo fisher scientific incorporated 2 1.774257e+06 delaware delaware 2 0.372842 0.372842 2.487467 0.581079 0RM FXR SSNTFK 0RM FXR SSNTFK\n",
+ "9 12.101855 0.999773 __splink__input_table_0 __splink__input_table_1 7428 60197 general motors financial company incorporated general motors financial company incorporated 2 1.774257e+06 texas texas 2 0.017854 0.017854 2.487467 12.134323 JNRL MTRS FNNXL JNRL MTRS FNNXL\n",
+ "10 6.339909 0.987805 __splink__input_table_0 __splink__input_table_1 8258 163501 thermo fisher scientific incorporated thermo fisher scientific incorporated 2 1.774257e+06 delaware mexico 0 0.372842 0.011205 0.556230 1.000000 0RM FXR SSNTFK 0RM FXR SSNTFK\n",
+ "11 6.339909 0.987805 __splink__input_table_0 __splink__input_table_1 5498 52885 apollo strategic growth capital ii apollo strategic growth capital ii 2 1.774257e+06 e9 cayman islands 0 0.001069 0.015387 0.556230 1.000000 APL STRTJK KR0 KPTL APL STRTJK KR0 KPTL \n",
+ "12 6.339909 0.987805 __splink__input_table_0 __splink__input_table_1 8258 162892 thermo fisher scientific incorporated thermo fisher scientific incorporated 2 1.774257e+06 delaware canada 0 0.372842 0.012191 0.556230 1.000000 0RM FXR SSNTFK 0RM FXR SSNTFK\n",
+ "13 6.339909 0.987805 __splink__input_table_0 __splink__input_table_1 8258 162847 thermo fisher scientific incorporated thermo fisher scientific incorporated 2 1.774257e+06 delaware russia 0 0.372842 0.001108 0.556230 1.000000 0RM FXR SSNTFK 0RM FXR SSNTFK\n",
+ "14 6.339909 0.987805 __splink__input_table_0 __splink__input_table_1 498 18301 intellinetics incorporated intellinetics incorporated 2 1.774257e+06 nevada ohio 0 0.014652 0.008136 0.556230 1.000000 INTLNTKS INTLNTKS\n",
+ "15 6.339909 0.987805 __splink__input_table_0 __splink__input_table_1 1533 165897 high sierra technologies incorporated high sierra technologies incorporated 2 1.774257e+06 colorado nevada 0 0.004817 0.014652 0.556230 1.000000 H SR TXNLJS H SR TXNLJS\n",
+ "16 13.991858 0.999939 __splink__input_table_0 __splink__input_table_1 2127 61213 lnpr group incorporated lnpr group incorporated 2 1.774257e+06 colorado colorado 2 0.004817 0.004817 2.487467 44.974148 LNPR KRP LNPR KRP\n",
+ "17 7.186156 0.993180 __splink__input_table_0 __splink__input_table_1 93 1969 norwood financial corporation norwood financial corporation 2 1.774257e+06 pennsylvania None -1 0.007919 NaN 1.000000 1.000000 NRWT FNNXL NRWT FNNXL\n",
+ "18 6.339909 0.987805 __splink__input_table_0 __splink__input_table_1 151 2257 nov incorporated nov incorporated 2 1.774257e+06 delaware mauritius 0 0.372842 0.001075 0.556230 1.000000 NF NF\n",
+ "19 6.339909 0.987805 __splink__input_table_0 __splink__input_table_1 280 10975 juniper networks incorporated juniper networks incorporated 2 1.774257e+06 delaware california, usa 0 0.372842 0.000234 0.556230 1.000000 JNPR NTWRKS JNPR NTWRKS\n",
+ "20 3.252392 0.905028 __splink__input_table_0 __splink__input_table_1 1399 157790 logiq incorporated logiq3 incorporated 1 2.087284e+05 delaware canada 0 0.372842 0.012191 0.556230 1.000000 LJK LJK\n",
+ "21 7.717639 0.995272 __splink__input_table_0 __splink__input_table_1 1720 166283 edgio incorporated edgio incorporated 2 1.774257e+06 delaware delaware 2 0.372842 0.372842 2.487467 0.581079 EJ EJ\n",
+ "22 6.339909 0.987805 __splink__input_table_0 __splink__input_table_1 2020 184709 arem pacific corporation arem pacific corporation 2 1.774257e+06 delaware arizona 0 0.372842 0.004388 0.556230 1.000000 ARM PSFK ARM PSFK\n",
+ "23 7.186156 0.993180 __splink__input_table_0 __splink__input_table_1 756 26596 ensign group incorporated ensign group incorporated 2 1.774257e+06 None nevada -1 NaN 0.014652 1.000000 1.000000 ENSKN KRP ENSKN KRP\n",
+ "24 7.186156 0.993180 __splink__input_table_0 __splink__input_table_1 1104 24668 cco holdings limited liability company cco holdings limited liability company 2 1.774257e+06 None delaware -1 NaN 0.372842 1.000000 1.000000 KK HLTNKS KK HLTNKS\n",
+ "25 7.717639 0.995272 __splink__input_table_0 __splink__input_table_1 321 11011 pc connection incorporated pc connection incorporated 2 1.774257e+06 delaware delaware 2 0.372842 0.372842 2.487467 0.581079 KNKXN KNKXN\n",
+ "26 6.339909 0.987805 __splink__input_table_0 __splink__input_table_1 477 14483 polarityte incorporated polarityte incorporated 2 1.774257e+06 delaware nevada 0 0.372842 0.014652 0.556230 1.000000 PLRTT PLRTT\n",
+ "27 7.717639 0.995272 __splink__input_table_0 __splink__input_table_1 810 25991 atlas air worldwide holdings incorporated atlas air worldwide holdings incorporated 2 1.774257e+06 delaware delaware 2 0.372842 0.372842 2.487467 0.581079 ATLS AR WRLTWT HLTNKS ATLS AR WRLTWT HLTNKS\n",
+ "28 6.339909 0.987805 __splink__input_table_0 __splink__input_table_1 1003 166010 spi energy co limited spi energy co limited 2 1.774257e+06 e9 cayman 0 0.001069 0.000345 0.556230 1.000000 SP ENRJ SP ENRJ\n",
+ "29 7.717639 0.995272 __splink__input_table_0 __splink__input_table_1 1012 165926 bimi international medical incorporated bimi international medical incorporated 2 1.774257e+06 delaware delaware 2 0.372842 0.372842 2.487467 0.581079 BM INTRNXNL MTKL BM INTRNXNL MTKL\n",
+ "30 7.186156 0.993180 __splink__input_table_0 __splink__input_table_1 1868 51876 phreesia incorporated phreesia incorporated 2 1.774257e+06 delaware None -1 0.372842 NaN 1.000000 1.000000 FRX FRX\n",
+ "31 6.339909 0.987805 __splink__input_table_0 __splink__input_table_1 2198 78290 secureworks corporation secureworks corporation 2 1.774257e+06 delaware united states 0 0.372842 0.012146 0.556230 1.000000 SKRWRKS SKRWRKS\n",
+ "32 7.717639 0.995272 __splink__input_table_0 __splink__input_table_1 2273 58771 ryerson holding corporation ryerson holding corporation 2 1.774257e+06 delaware delaware 2 0.372842 0.372842 2.487467 0.581079 RYRSN HLTNK RYRSN HLTNK\n",
+ "33 7.186156 0.993180 __splink__input_table_0 __splink__input_table_1 221 9106 comfort systems usa incorporated comfort systems usa incorporated 2 1.774257e+06 None arkansas -1 NaN 0.001253 1.000000 1.000000 KMFRT SSTMS US KMFRT SSTMS US\n",
+ "34 14.351809 0.999952 __splink__input_table_0 __splink__input_table_1 478 180383 winnebago industries incorporated winnebago industries incorporated 2 1.774257e+06 minnesota minnesota 2 0.003754 0.003754 2.487467 57.719048 WNBK INTSTRS WNBK INTSTRS\n",
+ "35 6.339909 0.987805 __splink__input_table_0 __splink__input_table_1 1913 166068 renewable energy acquisition corporation renewable energy acquisition corporation 2 1.774257e+06 nevada us 0 0.014652 0.000908 0.556230 1.000000 RNWBL ENRJ AKKSXN RNWBL ENRJ AKKSXN\n",
+ "36 7.186156 0.993180 __splink__input_table_0 __splink__input_table_1 257 164606 riverview bancorp incorporated riverview bancorp incorporated 2 1.774257e+06 washington None -1 0.002996 NaN 1.000000 1.000000 RFRF BNKRP RFRF BNKRP\n",
+ "37 7.186156 0.993180 __splink__input_table_0 __splink__input_table_1 294 182945 timberland bancorp incorporated timberland bancorp incorporated 2 1.774257e+06 washington None -1 0.002996 NaN 1.000000 1.000000 TMBRLNT BNKRP TMBRLNT BNKRP\n",
+ "38 7.717639 0.995272 __splink__input_table_0 __splink__input_table_1 415 18543 lkq corporation lkq corporation 2 1.774257e+06 delaware delaware 2 0.372842 0.372842 2.487467 0.581079 LKK LKK\n",
+ "39 7.717639 0.995272 __splink__input_table_0 __splink__input_table_1 674 23252 berkshire hills bancorp incorporated berkshire hills bancorp incorporated 2 1.774257e+06 delaware delaware 2 0.372842 0.372842 2.487467 0.581079 BRKXR HLS BNKRP BRKXR HLS BNKRP\n",
+ "40 6.339909 0.987805 __splink__input_table_0 __splink__input_table_1 1270 181001 dolby laboratories incorporated dolby laboratories incorporated 2 1.774257e+06 delaware california 0 0.372842 0.015978 0.556230 1.000000 TLB LBRTRS TLB LBRTRS\n",
+ "41 3.252392 0.905028 __splink__input_table_0 __splink__input_table_1 1321 132984 tss incorporated dss incorporated 1 2.087284e+05 delaware new york 0 0.372842 0.009913 0.556230 1.000000 TS TS\n",
+ "42 7.717639 0.995272 __splink__input_table_0 __splink__input_table_1 1482 46045 anywhere real estate incorporated anywhere real estate incorporated 2 1.774257e+06 delaware delaware 2 0.372842 0.372842 2.487467 0.581079 ANHR RL ESTT ANHR RL ESTT\n",
+ "43 6.339909 0.987805 __splink__input_table_0 __splink__input_table_1 1494 47625 kbr incorporated kbr incorporated 2 1.774257e+06 delaware united states 0 0.372842 0.012146 0.556230 1.000000 KBR KBR\n",
+ "44 7.717639 0.995272 __splink__input_table_0 __splink__input_table_1 1972 166348 reshape lifesciences incorporated reshape lifesciences incorporated 2 1.774257e+06 delaware delaware 2 0.372842 0.372842 2.487467 0.581079 RXP LFSSNSS RXP LFSSNSS\n",
+ "45 12.387018 0.999813 __splink__input_table_0 __splink__input_table_1 1457 172081 imperalis holding corporation imperalis holding corporation 2 1.774257e+06 nevada nevada 2 0.014652 0.014652 2.487467 14.786255 IMPRLS HLTNK IMPRLS HLTNK\n",
+ "46 12.387018 0.999813 __splink__input_table_0 __splink__input_table_1 2037 172091 bitnile metaverse incorporated bitnile metaverse incorporated 2 1.774257e+06 nevada nevada 2 0.014652 0.014652 2.487467 14.786255 BTNL MTFRS BTNL MTFRS\n",
+ "47 7.717639 0.995272 __splink__input_table_0 __splink__input_table_1 1058 35808 qvc incorporated qvc incorporated 2 1.774257e+06 delaware delaware 2 0.372842 0.372842 2.487467 0.581079 KFK KFK\n",
+ "48 9.692877 0.998793 __splink__input_table_0 __splink__input_table_1 1705 47703 irhythm technologies incorporated irhythm technologies incorporated 2 1.774257e+06 delaware us delaware 1 0.372842 0.000323 5.683268 1.000000 IRH0M TXNLJS IRH0M TXNLJS\n",
+ "49 7.186156 0.993180 __splink__input_table_0 __splink__input_table_1 338 13985 essex property trust incorporated essex property trust incorporated 2 1.774257e+06 maryland None -1 0.007786 NaN 1.000000 1.000000 ESKS PRPRT TRST ESKS PRPRT TRST"
+ ]
+ },
+ "execution_count": 109,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "preds_df[preds_df.match_probability > .9]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 79,
+ "id": "8e658c36-7b6f-480f-9d74-37af9510ebe2",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " match_probability | \n",
+ " company_name_l | \n",
+ " company_name_r | \n",
+ " loc_list_l | \n",
+ " loc_list_r | \n",
+ " company_name_mphone_l | \n",
+ " company_name_mphone_r | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 465 | \n",
+ " 0.914612 | \n",
+ " conns incorporated | \n",
+ " invenco incorporated | \n",
+ " [delaware] | \n",
+ " [delaware] | \n",
+ " KNS | \n",
+ " INFNK | \n",
+ "
\n",
+ " \n",
+ " 466 | \n",
+ " 0.914612 | \n",
+ " vishay intertechnology incorporated | \n",
+ " vishay precision foil, incorporated | \n",
+ " [delaware] | \n",
+ " [delaware] | \n",
+ " FX INTRTXNLJ | \n",
+ " FX PRSXN FL | \n",
+ "
\n",
+ " \n",
+ " 467 | \n",
+ " 0.980607 | \n",
+ " vishay precision group, incorporated | \n",
+ " vishay precision foil, incorporated | \n",
+ " [delaware] | \n",
+ " [delaware] | \n",
+ " FX PRSXN KRP | \n",
+ " FX PRSXN FL | \n",
+ "
\n",
+ " \n",
+ " 470 | \n",
+ " 0.975104 | \n",
+ " jones lang lasalle incorporated | \n",
+ " jones lang lasalle limited | \n",
+ " [maryland] | \n",
+ " [hong, kong] | \n",
+ " JNS LNK LSL | \n",
+ " JNS LNK LSL | \n",
+ "
\n",
+ " \n",
+ " 471 | \n",
+ " 0.951657 | \n",
+ " nrg energy, incorporated | \n",
+ " nrg energy, incorporated | \n",
+ " [delaware] | \n",
+ " [delaware] | \n",
+ " NRK ENRJ | \n",
+ " NRK ENRJ | \n",
+ "
\n",
+ " \n",
+ " 472 | \n",
+ " 0.914612 | \n",
+ " firstenergy corporation | \n",
+ " firstenergy ventures corporation | \n",
+ " [ohio] | \n",
+ " [ohio] | \n",
+ " FRSTNRJ | \n",
+ " FRSTNRJ FNTRS | \n",
+ "
\n",
+ " \n",
+ " 478 | \n",
+ " 0.914612 | \n",
+ " hudson pacific properties, incorporated | \n",
+ " hudson pacific services, incorporated | \n",
+ " [maryland] | \n",
+ " [maryland] | \n",
+ " HTSN PSFK PRPRTS | \n",
+ " HTSN PSFK SRFSS | \n",
+ "
\n",
+ " \n",
+ " 479 | \n",
+ " 0.980607 | \n",
+ " hudson pacific properties, incorporated | \n",
+ " hudson pacific properties, limited partnership | \n",
+ " [maryland] | \n",
+ " [maryland] | \n",
+ " HTSN PSFK PRPRTS | \n",
+ " HTSN PSFK PRPRTS | \n",
+ "
\n",
+ " \n",
+ " 481 | \n",
+ " 0.914612 | \n",
+ " digital ally, incorporated | \n",
+ " digital ally international, incorporated | \n",
+ " [nevada] | \n",
+ " [nevada] | \n",
+ " TJTL AL | \n",
+ " TJTL AL INTRNXNL | \n",
+ "
\n",
+ " \n",
+ " 489 | \n",
+ " 0.976947 | \n",
+ " cco holdings limited liability company | \n",
+ " rhfw holdings, limited liability company | \n",
+ " NaN | \n",
+ " [delaware] | \n",
+ " KK HLTNKS | \n",
+ " RHF HLTNKS | \n",
+ "
\n",
+ " \n",
+ " 493 | \n",
+ " 0.975104 | \n",
+ " intuitive surgical incorporated | \n",
+ " intuitive surgical limited | \n",
+ " [delaware] | \n",
+ " [united, kingdom] | \n",
+ " INTTF SRJKL | \n",
+ " INTTF SRJKL | \n",
+ "
\n",
+ " \n",
+ " 494 | \n",
+ " 0.975104 | \n",
+ " jones lang lasalle incorporated | \n",
+ " jones lang lasalle limited | \n",
+ " [maryland] | \n",
+ " [england] | \n",
+ " JNS LNK LSL | \n",
+ " JNS LNK LSL | \n",
+ "
\n",
+ " \n",
+ " 500 | \n",
+ " 0.975104 | \n",
+ " becton dickinson and company | \n",
+ " becton, dickinson and company, limited | \n",
+ " [new, jersey] | \n",
+ " [ireland] | \n",
+ " BKTN TKNSN ANT | \n",
+ " BKTN TKNSN ANT | \n",
+ "
\n",
+ " \n",
+ " 501 | \n",
+ " 0.975104 | \n",
+ " united parcel service incorporated | \n",
+ " united guaranty services, incorporated | \n",
+ " [delaware] | \n",
+ " [north, carolina] | \n",
+ " UNTT PRSL SRFS | \n",
+ " UNTT KRNT SRFSS | \n",
+ "
\n",
+ " \n",
+ " 509 | \n",
+ " 0.914612 | \n",
+ " estee lauder companies incorporated | \n",
+ " estee lauder international, incorporated | \n",
+ " [delaware] | \n",
+ " [delaware] | \n",
+ " EST LTR KMPNS | \n",
+ " EST LTR INTRNXNL | \n",
+ "
\n",
+ " \n",
+ " 510 | \n",
+ " 0.914612 | \n",
+ " maxcyte, incorporated | \n",
+ " cues, incorporated | \n",
+ " [delaware] | \n",
+ " [delaware] | \n",
+ " MKSST | \n",
+ " KS | \n",
+ "
\n",
+ " \n",
+ " 515 | \n",
+ " 0.980607 | \n",
+ " zimmer biomet holdings, incorporated | \n",
+ " zimmer biomet spine, incorporated | \n",
+ " [delaware] | \n",
+ " [delaware] | \n",
+ " SMR BMT HLTNKS | \n",
+ " SMR BMT SPN | \n",
+ "
\n",
+ " \n",
+ " 518 | \n",
+ " 0.914612 | \n",
+ " nordicus partners corporation | \n",
+ " nordco enterprises, incorporated | \n",
+ " [delaware] | \n",
+ " [wilmington, delaware] | \n",
+ " NRTKS PRTNRS | \n",
+ " NRTK ENTRPRSS | \n",
+ "
\n",
+ " \n",
+ " 519 | \n",
+ " 0.975104 | \n",
+ " valero energy corp/tx | \n",
+ " valero energy incorporated | \n",
+ " [delaware] | \n",
+ " [canada] | \n",
+ " FLR ENRJ TKS | \n",
+ " FLR ENRJ | \n",
+ "
\n",
+ " \n",
+ " 527 | \n",
+ " 0.914612 | \n",
+ " nrg energy, incorporated | \n",
+ " nrg energy holdings incorporated | \n",
+ " [delaware] | \n",
+ " [delaware] | \n",
+ " NRK ENRJ | \n",
+ " NRK ENRJ HLTNKS | \n",
+ "
\n",
+ " \n",
+ " 528 | \n",
+ " 0.914612 | \n",
+ " everi holdings incorporated | \n",
+ " edi holdings, incorporated | \n",
+ " [delaware] | \n",
+ " [delaware] | \n",
+ " EFR HLTNKS | \n",
+ " ET HLTNKS | \n",
+ "
\n",
+ " \n",
+ " 535 | \n",
+ " 0.914612 | \n",
+ " estee lauder companies incorporated | \n",
+ " estee lauder incorporated | \n",
+ " [delaware] | \n",
+ " [delaware] | \n",
+ " EST LTR KMPNS | \n",
+ " EST LTR | \n",
+ "
\n",
+ " \n",
+ " 548 | \n",
+ " 0.975104 | \n",
+ " universal logistics holdings, incorporated | \n",
+ " universal logistics corporation | \n",
+ " [michigan] | \n",
+ " [florida] | \n",
+ " UNFRSL LJSTKS HLTNKS | \n",
+ " UNFRSL LJSTKS | \n",
+ "
\n",
+ " \n",
+ " 551 | \n",
+ " 0.975104 | \n",
+ " alliant energy corporation | \n",
+ " allergan gi corporation | \n",
+ " [wisconsin] | \n",
+ " [delaware] | \n",
+ " ALNT ENRJ | \n",
+ " ALRKN J | \n",
+ "
\n",
+ " \n",
+ " 555 | \n",
+ " 0.975104 | \n",
+ " smartmetric, incorporated | \n",
+ " smartpetro incorporated | \n",
+ " [nevada] | \n",
+ " [philippines] | \n",
+ " SMRTMTRK | \n",
+ " SMRTPTR | \n",
+ "
\n",
+ " \n",
+ " 566 | \n",
+ " 0.914612 | \n",
+ " republic services, incorporated | \n",
+ " republic conduit, incorporated | \n",
+ " [delaware] | \n",
+ " [delaware] | \n",
+ " RPBLK SRFSS | \n",
+ " RPBLK KNTT | \n",
+ "
\n",
+ " \n",
+ " 571 | \n",
+ " 0.975104 | \n",
+ " freedom holdings, incorporated | \n",
+ " freedom designs, incorporated | \n",
+ " [maryland] | \n",
+ " [california] | \n",
+ " FRTM HLTNKS | \n",
+ " FRTM TSKNS | \n",
+ "
\n",
+ " \n",
+ " 573 | \n",
+ " 0.938457 | \n",
+ " ares real estate income trust incorporated | \n",
+ " ares real estate income trust incorporated | \n",
+ " [maryland] | \n",
+ " [delaware] | \n",
+ " ARS RL ESTT INKM TRST | \n",
+ " ARS RL ESTT INKM TRST | \n",
+ "
\n",
+ " \n",
+ " 574 | \n",
+ " 0.975104 | \n",
+ " bank of new york mellon corporation | \n",
+ " bank of new york mellon sa/nv | \n",
+ " [delaware] | \n",
+ " [belgium] | \n",
+ " BNK OF N YRK MLN | \n",
+ " BNK OF N YRK MLN SNF | \n",
+ "
\n",
+ " \n",
+ " 576 | \n",
+ " 0.914612 | \n",
+ " southern company | \n",
+ " southern wood piedmont company | \n",
+ " [delaware] | \n",
+ " [delaware] | \n",
+ " S0RN | \n",
+ " S0RN WT PTMNT | \n",
+ "
\n",
+ " \n",
+ " 582 | \n",
+ " 0.914612 | \n",
+ " ameresco, incorporated | \n",
+ " ameripath, incorporated | \n",
+ " [delaware] | \n",
+ " [delaware] | \n",
+ " AMRSK | \n",
+ " AMRP0 | \n",
+ "
\n",
+ " \n",
+ " 584 | \n",
+ " 0.914612 | \n",
+ " trevena incorporated | \n",
+ " anr, incorporated | \n",
+ " [delaware] | \n",
+ " [delaware] | \n",
+ " TRFN | \n",
+ " ANR | \n",
+ "
\n",
+ " \n",
+ " 590 | \n",
+ " 0.975104 | \n",
+ " bank of new york mellon corporation | \n",
+ " bank of new york mellon | \n",
+ " [delaware] | \n",
+ " [new, york] | \n",
+ " BNK OF N YRK MLN | \n",
+ " BNK OF N YRK MLN | \n",
+ "
\n",
+ " \n",
+ " 591 | \n",
+ " 0.938457 | \n",
+ " xerox holdings corporation | \n",
+ " xerox holdings corporation | \n",
+ " [connecticut] | \n",
+ " [new, york] | \n",
+ " SRKS HLTNKS | \n",
+ " SRKS HLTNKS | \n",
+ "
\n",
+ " \n",
+ " 594 | \n",
+ " 0.975104 | \n",
+ " jones lang lasalle incorporated | \n",
+ " jones lang lasalle ip, incorporated | \n",
+ " [maryland] | \n",
+ " [delaware] | \n",
+ " JNS LNK LSL | \n",
+ " JNS LNK LSL IP | \n",
+ "
\n",
+ " \n",
+ " 595 | \n",
+ " 0.914612 | \n",
+ " iron mountain incorporated | \n",
+ " iron mountain global holdings, incorporated | \n",
+ " [delaware] | \n",
+ " [delaware] | \n",
+ " IRN MNTN | \n",
+ " IRN MNTN KLBL HLTNKS | \n",
+ "
\n",
+ " \n",
+ " 597 | \n",
+ " 0.980607 | \n",
+ " extreme networks incorporated | \n",
+ " extreme networks ihc, incorporated | \n",
+ " [delaware] | \n",
+ " [delaware] | \n",
+ " EKSTRM NTWRKS | \n",
+ " EKSTRM NTWRKS IK | \n",
+ "
\n",
+ " \n",
+ " 599 | \n",
+ " 0.976947 | \n",
+ " q2 holdings, incorporated | \n",
+ " vr holdings, incorporated | \n",
+ " NaN | \n",
+ " [colorado] | \n",
+ " K HLTNKS | \n",
+ " FR HLTNKS | \n",
+ "
\n",
+ " \n",
+ " 600 | \n",
+ " 0.980607 | \n",
+ " extreme networks incorporated | \n",
+ " extreme networks, incorporated | \n",
+ " [delaware] | \n",
+ " [delaware] | \n",
+ " EKSTRM NTWRKS | \n",
+ " EKSTRM NTWRKS | \n",
+ "
\n",
+ " \n",
+ " 604 | \n",
+ " 0.914612 | \n",
+ " cutera incorporated | \n",
+ " vrec, incorporated | \n",
+ " [delaware] | \n",
+ " [delaware] | \n",
+ " KTR | \n",
+ " FRK | \n",
+ "
\n",
+ " \n",
+ " 605 | \n",
+ " 0.975104 | \n",
+ " assured guaranty limited | \n",
+ " assured guaranty services limited | \n",
+ " [d0] | \n",
+ " [england] | \n",
+ " ASRT KRNT | \n",
+ " ASRT KRNT SRFSS | \n",
+ "
\n",
+ " \n",
+ " 606 | \n",
+ " 0.976947 | \n",
+ " virtra, incorporated | \n",
+ " viator, incorporated | \n",
+ " [nevada] | \n",
+ " NaN | \n",
+ " FRTR | \n",
+ " FTR | \n",
+ "
\n",
+ " \n",
+ " 618 | \n",
+ " 0.975104 | \n",
+ " sculptor capital management, incorporated | \n",
+ " sculptor capital management hong kong limited | \n",
+ " [delaware] | \n",
+ " [hong, kong] | \n",
+ " SKLPTR KPTL MNJMNT | \n",
+ " SKLPTR KPTL MNJMNT HNK KNK | \n",
+ "
\n",
+ " \n",
+ " 625 | \n",
+ " 0.975104 | \n",
+ " enstar group limited | \n",
+ " enstar limited | \n",
+ " [d0] | \n",
+ " [bermuda] | \n",
+ " ENSTR KRP | \n",
+ " ENSTR | \n",
+ "
\n",
+ " \n",
+ " 626 | \n",
+ " 0.975104 | \n",
+ " sellas life sciences group, incorporated | \n",
+ " sellas life sciences group limited | \n",
+ " [delaware] | \n",
+ " [bermuda] | \n",
+ " SLS LF SSNSS KRP | \n",
+ " SLS LF SSNSS KRP | \n",
+ "
\n",
+ " \n",
+ " 627 | \n",
+ " 0.975104 | \n",
+ " intuitive surgical incorporated | \n",
+ " intuitive surgical canada incorporated | \n",
+ " [delaware] | \n",
+ " [canada] | \n",
+ " INTTF SRJKL | \n",
+ " INTTF SRJKL KNT | \n",
+ "
\n",
+ " \n",
+ " 630 | \n",
+ " 0.951657 | \n",
+ " forestar group incorporated | \n",
+ " forestar group incorporated | \n",
+ " [delaware] | \n",
+ " [delaware] | \n",
+ " FRSTR KRP | \n",
+ " FRSTR KRP | \n",
+ "
\n",
+ " \n",
+ " 637 | \n",
+ " 0.914612 | \n",
+ " dcp midstream, limited partnership | \n",
+ " dcp midstream operating, limited partnership | \n",
+ " [delaware] | \n",
+ " [delaware] | \n",
+ " TKP MTSTRM | \n",
+ " TKP MTSTRM OPRTNK | \n",
+ "
\n",
+ " \n",
+ " 639 | \n",
+ " 0.951657 | \n",
+ " equitable holdings, incorporated | \n",
+ " equitable holdings, incorporated | \n",
+ " [delaware] | \n",
+ " [delaware] | \n",
+ " EKTBL HLTNKS | \n",
+ " EKTBL HLTNKS | \n",
+ "
\n",
+ " \n",
+ " 643 | \n",
+ " 0.914612 | \n",
+ " energy transfer limited partnership | \n",
+ " energy transfer partners, limited liability co... | \n",
+ " [delaware] | \n",
+ " [delaware] | \n",
+ " ENRJ TRNSFR | \n",
+ " ENRJ TRNSFR PRTNRS | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " match_probability company_name_l company_name_r loc_list_l loc_list_r company_name_mphone_l company_name_mphone_r\n",
+ "465 0.914612 conns incorporated invenco incorporated [delaware] [delaware] KNS INFNK\n",
+ "466 0.914612 vishay intertechnology incorporated vishay precision foil, incorporated [delaware] [delaware] FX INTRTXNLJ FX PRSXN FL\n",
+ "467 0.980607 vishay precision group, incorporated vishay precision foil, incorporated [delaware] [delaware] FX PRSXN KRP FX PRSXN FL\n",
+ "470 0.975104 jones lang lasalle incorporated jones lang lasalle limited [maryland] [hong, kong] JNS LNK LSL JNS LNK LSL\n",
+ "471 0.951657 nrg energy, incorporated nrg energy, incorporated [delaware] [delaware] NRK ENRJ NRK ENRJ\n",
+ "472 0.914612 firstenergy corporation firstenergy ventures corporation [ohio] [ohio] FRSTNRJ FRSTNRJ FNTRS\n",
+ "478 0.914612 hudson pacific properties, incorporated hudson pacific services, incorporated [maryland] [maryland] HTSN PSFK PRPRTS HTSN PSFK SRFSS\n",
+ "479 0.980607 hudson pacific properties, incorporated hudson pacific properties, limited partnership [maryland] [maryland] HTSN PSFK PRPRTS HTSN PSFK PRPRTS\n",
+ "481 0.914612 digital ally, incorporated digital ally international, incorporated [nevada] [nevada] TJTL AL TJTL AL INTRNXNL\n",
+ "489 0.976947 cco holdings limited liability company rhfw holdings, limited liability company NaN [delaware] KK HLTNKS RHF HLTNKS\n",
+ "493 0.975104 intuitive surgical incorporated intuitive surgical limited [delaware] [united, kingdom] INTTF SRJKL INTTF SRJKL\n",
+ "494 0.975104 jones lang lasalle incorporated jones lang lasalle limited [maryland] [england] JNS LNK LSL JNS LNK LSL\n",
+ "500 0.975104 becton dickinson and company becton, dickinson and company, limited [new, jersey] [ireland] BKTN TKNSN ANT BKTN TKNSN ANT\n",
+ "501 0.975104 united parcel service incorporated united guaranty services, incorporated [delaware] [north, carolina] UNTT PRSL SRFS UNTT KRNT SRFSS\n",
+ "509 0.914612 estee lauder companies incorporated estee lauder international, incorporated [delaware] [delaware] EST LTR KMPNS EST LTR INTRNXNL\n",
+ "510 0.914612 maxcyte, incorporated cues, incorporated [delaware] [delaware] MKSST KS\n",
+ "515 0.980607 zimmer biomet holdings, incorporated zimmer biomet spine, incorporated [delaware] [delaware] SMR BMT HLTNKS SMR BMT SPN\n",
+ "518 0.914612 nordicus partners corporation nordco enterprises, incorporated [delaware] [wilmington, delaware] NRTKS PRTNRS NRTK ENTRPRSS\n",
+ "519 0.975104 valero energy corp/tx valero energy incorporated [delaware] [canada] FLR ENRJ TKS FLR ENRJ\n",
+ "527 0.914612 nrg energy, incorporated nrg energy holdings incorporated [delaware] [delaware] NRK ENRJ NRK ENRJ HLTNKS\n",
+ "528 0.914612 everi holdings incorporated edi holdings, incorporated [delaware] [delaware] EFR HLTNKS ET HLTNKS\n",
+ "535 0.914612 estee lauder companies incorporated estee lauder incorporated [delaware] [delaware] EST LTR KMPNS EST LTR\n",
+ "548 0.975104 universal logistics holdings, incorporated universal logistics corporation [michigan] [florida] UNFRSL LJSTKS HLTNKS UNFRSL LJSTKS\n",
+ "551 0.975104 alliant energy corporation allergan gi corporation [wisconsin] [delaware] ALNT ENRJ ALRKN J\n",
+ "555 0.975104 smartmetric, incorporated smartpetro incorporated [nevada] [philippines] SMRTMTRK SMRTPTR\n",
+ "566 0.914612 republic services, incorporated republic conduit, incorporated [delaware] [delaware] RPBLK SRFSS RPBLK KNTT\n",
+ "571 0.975104 freedom holdings, incorporated freedom designs, incorporated [maryland] [california] FRTM HLTNKS FRTM TSKNS\n",
+ "573 0.938457 ares real estate income trust incorporated ares real estate income trust incorporated [maryland] [delaware] ARS RL ESTT INKM TRST ARS RL ESTT INKM TRST\n",
+ "574 0.975104 bank of new york mellon corporation bank of new york mellon sa/nv [delaware] [belgium] BNK OF N YRK MLN BNK OF N YRK MLN SNF\n",
+ "576 0.914612 southern company southern wood piedmont company [delaware] [delaware] S0RN S0RN WT PTMNT\n",
+ "582 0.914612 ameresco, incorporated ameripath, incorporated [delaware] [delaware] AMRSK AMRP0\n",
+ "584 0.914612 trevena incorporated anr, incorporated [delaware] [delaware] TRFN ANR\n",
+ "590 0.975104 bank of new york mellon corporation bank of new york mellon [delaware] [new, york] BNK OF N YRK MLN BNK OF N YRK MLN\n",
+ "591 0.938457 xerox holdings corporation xerox holdings corporation [connecticut] [new, york] SRKS HLTNKS SRKS HLTNKS\n",
+ "594 0.975104 jones lang lasalle incorporated jones lang lasalle ip, incorporated [maryland] [delaware] JNS LNK LSL JNS LNK LSL IP\n",
+ "595 0.914612 iron mountain incorporated iron mountain global holdings, incorporated [delaware] [delaware] IRN MNTN IRN MNTN KLBL HLTNKS\n",
+ "597 0.980607 extreme networks incorporated extreme networks ihc, incorporated [delaware] [delaware] EKSTRM NTWRKS EKSTRM NTWRKS IK\n",
+ "599 0.976947 q2 holdings, incorporated vr holdings, incorporated NaN [colorado] K HLTNKS FR HLTNKS\n",
+ "600 0.980607 extreme networks incorporated extreme networks, incorporated [delaware] [delaware] EKSTRM NTWRKS EKSTRM NTWRKS\n",
+ "604 0.914612 cutera incorporated vrec, incorporated [delaware] [delaware] KTR FRK\n",
+ "605 0.975104 assured guaranty limited assured guaranty services limited [d0] [england] ASRT KRNT ASRT KRNT SRFSS\n",
+ "606 0.976947 virtra, incorporated viator, incorporated [nevada] NaN FRTR FTR\n",
+ "618 0.975104 sculptor capital management, incorporated sculptor capital management hong kong limited [delaware] [hong, kong] SKLPTR KPTL MNJMNT SKLPTR KPTL MNJMNT HNK KNK\n",
+ "625 0.975104 enstar group limited enstar limited [d0] [bermuda] ENSTR KRP ENSTR\n",
+ "626 0.975104 sellas life sciences group, incorporated sellas life sciences group limited [delaware] [bermuda] SLS LF SSNSS KRP SLS LF SSNSS KRP\n",
+ "627 0.975104 intuitive surgical incorporated intuitive surgical canada incorporated [delaware] [canada] INTTF SRJKL INTTF SRJKL KNT\n",
+ "630 0.951657 forestar group incorporated forestar group incorporated [delaware] [delaware] FRSTR KRP FRSTR KRP\n",
+ "637 0.914612 dcp midstream, limited partnership dcp midstream operating, limited partnership [delaware] [delaware] TKP MTSTRM TKP MTSTRM OPRTNK\n",
+ "639 0.951657 equitable holdings, incorporated equitable holdings, incorporated [delaware] [delaware] EKTBL HLTNKS EKTBL HLTNKS\n",
+ "643 0.914612 energy transfer limited partnership energy transfer partners, limited liability co... [delaware] [delaware] ENRJ TRNSFR ENRJ TRNSFR PRTNRS"
+ ]
+ },
+ "execution_count": 79,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "preds_df[preds_df.match_probability >= .9][[\"match_probability\", \"company_name_l\", \"company_name_r\", \"loc_list_l\", \"loc_list_r\", \"company_name_mphone_l\", \"company_name_mphone_r\"]].iloc[150:200]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "fb2122d8-ff0a-4117-a91c-17a0523dcfcb",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "mozilla_sec_eia",
+ "language": "python",
+ "name": "mozilla_sec_eia"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.0"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/notebooks/17-kl-paragraph-layout-metrics.ipynb b/notebooks/17-kl-paragraph-layout-metrics.ipynb
new file mode 100644
index 0000000..f7c3a8d
--- /dev/null
+++ b/notebooks/17-kl-paragraph-layout-metrics.ipynb
@@ -0,0 +1,687 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "748b07d1-61ac-43b8-bff9-9f660626da1b",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "%load_ext autoreload\n",
+ "%autoreload 3"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "bb513a3e-31f7-49da-895b-e3ed4f52efd4",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "from pathlib import Path\n",
+ "\n",
+ "import pandas as pd\n",
+ "\n",
+ "from mozilla_sec_eia.models.sec10k.utils.cloud import GCSArchive"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "29c9b2e0-7f2f-4ab7-9972-f1ed30ff196a",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "archive = GCSArchive()\n",
+ "md = archive.get_metadata()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "1608bf1e-d6cf-4e3a-8f69-0e62744d0dfd",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " cik | \n",
+ " company_name | \n",
+ " form_type | \n",
+ " date_filed | \n",
+ " exhibit_21_version | \n",
+ " year_quarter | \n",
+ "
\n",
+ " \n",
+ " filename | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " edgar/data/17206/0000017206-94-000007.txt | \n",
+ " 17206 | \n",
+ " CAPITAL HOLDING CORP | \n",
+ " 10-K/A | \n",
+ " 1993-12-22 | \n",
+ " None | \n",
+ " 1993q4 | \n",
+ "
\n",
+ " \n",
+ " edgar/data/29082/0000950131-94-000021.txt | \n",
+ " 29082 | \n",
+ " DISNEY WALT CO | \n",
+ " 10-K | \n",
+ " 1993-12-22 | \n",
+ " 21 | \n",
+ " 1993q4 | \n",
+ "
\n",
+ " \n",
+ " edgar/data/32377/0000032377-94-000001.txt | \n",
+ " 32377 | \n",
+ " ELIZABETHTOWN GAS CO | \n",
+ " 10-K | \n",
+ " 1993-12-13 | \n",
+ " 21 | \n",
+ " 1993q4 | \n",
+ "
\n",
+ " \n",
+ " edgar/data/353944/0000353944-94-000005.txt | \n",
+ " 353944 | \n",
+ " INTERNATIONAL GAME TECHNOLOGY | \n",
+ " 10-K | \n",
+ " 1993-12-23 | \n",
+ " 21 | \n",
+ " 1993q4 | \n",
+ "
\n",
+ " \n",
+ " edgar/data/60512/0000060512-94-000006.txt | \n",
+ " 60512 | \n",
+ " LOUISIANA LAND & EXPLORATION CO | \n",
+ " 10-K/A | \n",
+ " 1993-10-07 | \n",
+ " None | \n",
+ " 1993q4 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " edgar/data/932021/0001493152-23-046428.txt | \n",
+ " 932021 | \n",
+ " GLOBAL TECHNOLOGIES LTD | \n",
+ " 10-K | \n",
+ " 2023-12-29 | \n",
+ " 21.1 | \n",
+ " 2023q4 | \n",
+ "
\n",
+ " \n",
+ " edgar/data/933974/0001558370-23-019262.txt | \n",
+ " 933974 | \n",
+ " Azenta, Inc. | \n",
+ " 10-K | \n",
+ " 2023-11-21 | \n",
+ " 21.0 | \n",
+ " 2023q4 | \n",
+ "
\n",
+ " \n",
+ " edgar/data/935419/0001628280-23-041580.txt | \n",
+ " 935419 | \n",
+ " RCI HOSPITALITY HOLDINGS, INC. | \n",
+ " 10-K | \n",
+ " 2023-12-14 | \n",
+ " 21.1 | \n",
+ " 2023q4 | \n",
+ "
\n",
+ " \n",
+ " edgar/data/936395/0000936395-23-000044.txt | \n",
+ " 936395 | \n",
+ " CIENA CORP | \n",
+ " 10-K | \n",
+ " 2023-12-15 | \n",
+ " 21.1 | \n",
+ " 2023q4 | \n",
+ "
\n",
+ " \n",
+ " edgar/data/936528/0000936528-23-000207.txt | \n",
+ " 936528 | \n",
+ " WAFD INC | \n",
+ " 10-K | \n",
+ " 2023-11-17 | \n",
+ " None | \n",
+ " 2023q4 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
290379 rows × 6 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " cik \\\n",
+ "filename \n",
+ "edgar/data/17206/0000017206-94-000007.txt 17206 \n",
+ "edgar/data/29082/0000950131-94-000021.txt 29082 \n",
+ "edgar/data/32377/0000032377-94-000001.txt 32377 \n",
+ "edgar/data/353944/0000353944-94-000005.txt 353944 \n",
+ "edgar/data/60512/0000060512-94-000006.txt 60512 \n",
+ "... ... \n",
+ "edgar/data/932021/0001493152-23-046428.txt 932021 \n",
+ "edgar/data/933974/0001558370-23-019262.txt 933974 \n",
+ "edgar/data/935419/0001628280-23-041580.txt 935419 \n",
+ "edgar/data/936395/0000936395-23-000044.txt 936395 \n",
+ "edgar/data/936528/0000936528-23-000207.txt 936528 \n",
+ "\n",
+ " company_name \\\n",
+ "filename \n",
+ "edgar/data/17206/0000017206-94-000007.txt CAPITAL HOLDING CORP \n",
+ "edgar/data/29082/0000950131-94-000021.txt DISNEY WALT CO \n",
+ "edgar/data/32377/0000032377-94-000001.txt ELIZABETHTOWN GAS CO \n",
+ "edgar/data/353944/0000353944-94-000005.txt INTERNATIONAL GAME TECHNOLOGY \n",
+ "edgar/data/60512/0000060512-94-000006.txt LOUISIANA LAND & EXPLORATION CO \n",
+ "... ... \n",
+ "edgar/data/932021/0001493152-23-046428.txt GLOBAL TECHNOLOGIES LTD \n",
+ "edgar/data/933974/0001558370-23-019262.txt Azenta, Inc. \n",
+ "edgar/data/935419/0001628280-23-041580.txt RCI HOSPITALITY HOLDINGS, INC. \n",
+ "edgar/data/936395/0000936395-23-000044.txt CIENA CORP \n",
+ "edgar/data/936528/0000936528-23-000207.txt WAFD INC \n",
+ "\n",
+ " form_type date_filed \\\n",
+ "filename \n",
+ "edgar/data/17206/0000017206-94-000007.txt 10-K/A 1993-12-22 \n",
+ "edgar/data/29082/0000950131-94-000021.txt 10-K 1993-12-22 \n",
+ "edgar/data/32377/0000032377-94-000001.txt 10-K 1993-12-13 \n",
+ "edgar/data/353944/0000353944-94-000005.txt 10-K 1993-12-23 \n",
+ "edgar/data/60512/0000060512-94-000006.txt 10-K/A 1993-10-07 \n",
+ "... ... ... \n",
+ "edgar/data/932021/0001493152-23-046428.txt 10-K 2023-12-29 \n",
+ "edgar/data/933974/0001558370-23-019262.txt 10-K 2023-11-21 \n",
+ "edgar/data/935419/0001628280-23-041580.txt 10-K 2023-12-14 \n",
+ "edgar/data/936395/0000936395-23-000044.txt 10-K 2023-12-15 \n",
+ "edgar/data/936528/0000936528-23-000207.txt 10-K 2023-11-17 \n",
+ "\n",
+ " exhibit_21_version year_quarter \n",
+ "filename \n",
+ "edgar/data/17206/0000017206-94-000007.txt None 1993q4 \n",
+ "edgar/data/29082/0000950131-94-000021.txt 21 1993q4 \n",
+ "edgar/data/32377/0000032377-94-000001.txt 21 1993q4 \n",
+ "edgar/data/353944/0000353944-94-000005.txt 21 1993q4 \n",
+ "edgar/data/60512/0000060512-94-000006.txt None 1993q4 \n",
+ "... ... ... \n",
+ "edgar/data/932021/0001493152-23-046428.txt 21.1 2023q4 \n",
+ "edgar/data/933974/0001558370-23-019262.txt 21.0 2023q4 \n",
+ "edgar/data/935419/0001628280-23-041580.txt 21.1 2023q4 \n",
+ "edgar/data/936395/0000936395-23-000044.txt 21.1 2023q4 \n",
+ "edgar/data/936528/0000936528-23-000207.txt None 2023q4 \n",
+ "\n",
+ "[290379 rows x 6 columns]"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "md"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "bb94754e-3765-43f2-a5e1-8b55a4021da4",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "df = pd.DataFrame()\n",
+ "dir_name = Path(\"paragraph_layout_md\")\n",
+ "for filename in os.listdir(dir_name):\n",
+ " if filename.split(\".\")[-1] != \"parquet\":\n",
+ " continue\n",
+ " yq_df = pd.read_parquet(dir_name / filename)\n",
+ " df = pd.concat([df, yq_df])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "52828dfa-a951-4bc5-88a1-f8c2dca2628b",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " paragraph | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 1011174-0001193125-10-030674 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 1010612-0000950123-10-019499 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 1003410-0001193125-10-046549 | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " 1011308-0000921895-10-000357 | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " 1009672-0000950123-10-018301 | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 898293-0000950144-04-010550 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 894490-0001193125-04-212822 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 930803-0000950136-04-004585 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 893430-0001193125-04-212647 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 920354-0000950135-04-005647 | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
98712 rows × 1 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " paragraph\n",
+ "1011174-0001193125-10-030674 False\n",
+ "1010612-0000950123-10-019499 False\n",
+ "1003410-0001193125-10-046549 True\n",
+ "1011308-0000921895-10-000357 True\n",
+ "1009672-0000950123-10-018301 True\n",
+ "... ...\n",
+ "898293-0000950144-04-010550 False\n",
+ "894490-0001193125-04-212822 False\n",
+ "930803-0000950136-04-004585 False\n",
+ "893430-0001193125-04-212647 False\n",
+ "920354-0000950135-04-005647 True\n",
+ "\n",
+ "[98712 rows x 1 columns]"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "94b2ecbc-1e08-4b3a-835f-a10327f88298",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "df.loc[:, \"full_filename\"] = \"edgar/data/\" + df.index.str.replace('-', '/', n=1) + \".txt\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "b9c56e81-3e98-44bf-8c70-256ce1d58d80",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "md[\"date_filed\"] = md[\"date_filed\"].astype(\"datetime64[ns]\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "d60efebc-72ff-41e8-b765-8edcadbe185e",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " paragraph | \n",
+ " full_filename | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 1011174-0001193125-10-030674 | \n",
+ " False | \n",
+ " edgar/data/1011174/0001193125-10-030674.txt | \n",
+ "
\n",
+ " \n",
+ " 1010612-0000950123-10-019499 | \n",
+ " False | \n",
+ " edgar/data/1010612/0000950123-10-019499.txt | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " paragraph \\\n",
+ "1011174-0001193125-10-030674 False \n",
+ "1010612-0000950123-10-019499 False \n",
+ "\n",
+ " full_filename \n",
+ "1011174-0001193125-10-030674 edgar/data/1011174/0001193125-10-030674.txt \n",
+ "1010612-0000950123-10-019499 edgar/data/1010612/0000950123-10-019499.txt "
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.head(2)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "id": "0f6d512f-b07a-4204-b3cf-69e08848ef2d",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0.27785882162249775"
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# what fraction of files are paragraph layout?\n",
+ "md_merged = md.reset_index().merge(df, left_on=\"filename\", right_on=\"full_filename\", how=\"left\", validate=\"1:1\")\n",
+ "md_merged = md_merged.dropna(subset=\"paragraph\")\n",
+ "len(md_merged[md_merged.paragraph])/len(md_merged)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "id": "67e63df0-ca52-4eef-b6aa-a1715f1ab081",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " filename | \n",
+ " cik | \n",
+ " company_name | \n",
+ " form_type | \n",
+ " date_filed | \n",
+ " exhibit_21_version | \n",
+ " year_quarter | \n",
+ " paragraph | \n",
+ " full_filename | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 6 | \n",
+ " edgar/data/100240/0000950144-94-000787.txt | \n",
+ " 100240 | \n",
+ " TURNER BROADCASTING SYSTEM INC | \n",
+ " 10-K | \n",
+ " 1994-03-31 | \n",
+ " 21 | \n",
+ " 1994q1 | \n",
+ " False | \n",
+ " edgar/data/100240/0000950144-94-000787.txt | \n",
+ "
\n",
+ " \n",
+ " 11 | \n",
+ " edgar/data/100885/0000100885-94-000006.txt | \n",
+ " 100885 | \n",
+ " UNION PACIFIC CORP | \n",
+ " 10-K | \n",
+ " 1994-03-29 | \n",
+ " 21 | \n",
+ " 1994q1 | \n",
+ " False | \n",
+ " edgar/data/100885/0000100885-94-000006.txt | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " filename cik \\\n",
+ "6 edgar/data/100240/0000950144-94-000787.txt 100240 \n",
+ "11 edgar/data/100885/0000100885-94-000006.txt 100885 \n",
+ "\n",
+ " company_name form_type date_filed exhibit_21_version \\\n",
+ "6 TURNER BROADCASTING SYSTEM INC 10-K 1994-03-31 21 \n",
+ "11 UNION PACIFIC CORP 10-K 1994-03-29 21 \n",
+ "\n",
+ " year_quarter paragraph full_filename \n",
+ "6 1994q1 False edgar/data/100240/0000950144-94-000787.txt \n",
+ "11 1994q1 False edgar/data/100885/0000100885-94-000006.txt "
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "md_merged.head(2)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "id": "1e11faef-853b-48f2-9eb0-af7f8715cd41",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0.10292571287189956"
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# what fraction of CIKs are only covered by paragraph layout docs?\n",
+ "# get the set of unique CIKs in md_merged\n",
+ "all_ciks = set(md_merged.cik)\n",
+ "# remove the paragraph layout docs\n",
+ "no_paragraph_ciks = set(md_merged[md_merged[\"paragraph\"] == False].cik)\n",
+ "# get the set of CIKs that are in the full set but not the paragraph removed set\n",
+ "only_paragraph_ciks = all_ciks - no_paragraph_ciks\n",
+ "# divide that number by the total number of CIKs\n",
+ "len(only_paragraph_ciks)/len(all_ciks)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "id": "6062d722-b1c7-4589-975e-7fe8cef65a40",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "1664"
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "len(only_paragraph_ciks)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "3b1f6ab8-e3be-48c2-9ecb-346425af3777",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# what percentage of CIK and year-quarter coverage do we get if we exclude all paragraph filings"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "mozilla_sec_eia",
+ "language": "python",
+ "name": "mozilla_sec_eia"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.0"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/notebooks/18-kl-splink-sec-eia.ipynb b/notebooks/18-kl-splink-sec-eia.ipynb
new file mode 100644
index 0000000..81f3513
--- /dev/null
+++ b/notebooks/18-kl-splink-sec-eia.ipynb
@@ -0,0 +1,3799 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "9029518c-ea19-4055-a938-36a5ea1804d8",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%load_ext autoreload\n",
+ "%autoreload 3"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "1107fe42-197c-4fea-9c48-06d08699af0b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "from sklearn.metrics import precision_score, recall_score, accuracy_score, roc_auc_score, confusion_matrix\n",
+ "from splink import block_on, DuckDBAPI, Linker, SettingsCreator\n",
+ "from splink.blocking_analysis import count_comparisons_from_blocking_rule, cumulative_comparisons_to_be_scored_from_blocking_rules_chart, n_largest_blocks\n",
+ "import splink.comparison_library as cl\n",
+ "import splink.comparison_level_library as cll\n",
+ "from splink.exploratory import completeness_chart, profile_columns\n",
+ "\n",
+ "from mozilla_sec_eia.models.sec_eia_record_linkage.sec_eia_splink_config import (\n",
+ " BLOCKING_RULES,\n",
+ " MATCH_COLS,\n",
+ " SHARED_COLS,\n",
+ " address_comparison,\n",
+ " city_comparison,\n",
+ " company_name_comparison,\n",
+ " deterministic_blocking_rules,\n",
+ " state_comparison\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "9b8224d4-7596-45b7-bfb5-028f29a96f3d",
+ "metadata": {},
+ "source": [
+ "# Inputs"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "fb6b3f3f-8c30-4810-90dd-75cfbeecc4e0",
+ "metadata": {},
+ "source": [
+ "### EIA"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "8b1add80-34d7-44a8-a7b4-181a770bb2cb",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "eia_df = pd.read_parquet(\"gs://sec10k-outputs/v2/core_eia__parents_and_subsidiaries.parquet\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "755ab2a3-a32b-4ac1-81a5-0fb3a85dcdb3",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "20821"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "len(eia_df)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "ae7706d6-3f32-4c99-aeb6-dac6c6529eec",
+ "metadata": {},
+ "source": [
+ "### SEC 10K Basic Info"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "3f5f9e6c-0725-48e1-920f-3d516b4388a6",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sec_df = pd.read_pickle(\"/Users/katielamb/CatalystCoop/dagster_home/storage/core_sec_10k__parents_and_subsidiaries\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "63d97f0d-df22-4c27-b3e7-1035166b4011",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "61026"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "len(sec_df)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "381e4a8f-d8da-4802-b86e-1ac4fc3094db",
+ "metadata": {},
+ "source": [
+ "# Preprocess SEC and EIA\n",
+ "\n",
+ "Open question: would it make more sense to do a direct match on company name after\n",
+ "the SEC basic info to EIA match is done, and then manually review any conflicting SEC matches (one from basic info and one from Ex. 21)?"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "7d2d103a-2bbd-4974-b770-44626bdc5111",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sec_match_df = sec_df[sec_df.files_10k][SHARED_COLS]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "c734137b-fd76-4f6a-a828-23cd12d4ac27",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "eia_match_df = eia_df[SHARED_COLS]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "e754b2ef-5a0d-4582-8694-047528dfd339",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "True"
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "sec_match_df.record_id.is_unique"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "id": "38ad3504-2cde-455f-8896-6a435677541c",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "True"
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "eia_match_df.record_id.is_unique"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "id": "856c14d8-3250-4650-a2db-3808b4718f19",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "False"
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Note that sec_company_id isn't unique here because we are keeping each unique company name and address pair\n",
+ "sec_df.sec_company_id.is_unique"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b18fef7e-c316-4c90-b2bc-04706401135e",
+ "metadata": {},
+ "source": [
+ "There should probably be no duplicate records, but if there are, keep the most recent version of each record."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "id": "842fa02e-5202-445c-b728-72bce42e740d",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "False 20821\n",
+ "Name: count, dtype: int64"
+ ]
+ },
+ "execution_count": 19,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "eia_match_df.duplicated(subset=MATCH_COLS).value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "id": "b53e6244-f0ca-4256-bc09-9c3264675389",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "False 61026\n",
+ "Name: count, dtype: int64"
+ ]
+ },
+ "execution_count": 20,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "sec_match_df.duplicated(subset=MATCH_COLS).value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "id": "e4d54448-0c2f-452b-931c-ff79a5cc3669",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sec_match_df = sec_match_df.sort_values(by=\"report_year\", ascending=False).drop_duplicates(subset=MATCH_COLS, keep=\"first\")\n",
+ "eia_match_df = eia_match_df.sort_values(by=\"report_year\", ascending=False).drop_duplicates(subset=MATCH_COLS, keep=\"first\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "46d967d4-3722-437d-b2f0-37cbac17624f",
+ "metadata": {},
+ "source": [
+ "# Link SEC and EIA"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "509988b1-ed2c-41b3-9334-f44ae599cf4f",
+ "metadata": {},
+ "source": [
+ "## Exploratory Analysis"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "id": "de894ad0-0b72-4486-b737-c3bfb95f8f05",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "db_api = DuckDBAPI()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "id": "4bab1568-6a55-427c-9a78-e44db8b0584d",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
+ ],
+ "text/plain": [
+ "alt.LayerChart(...)"
+ ]
+ },
+ "execution_count": 22,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "completeness_chart(sec_match_df, db_api=db_api)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "id": "6b9479e3-e836-4407-a2b6-926c185065a8",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
+ ],
+ "text/plain": [
+ "alt.LayerChart(...)"
+ ]
+ },
+ "execution_count": 23,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "completeness_chart(eia_match_df, db_api=db_api)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "id": "95bd4db7-c447-4a0e-ab37-59d4beac0b11",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
+ ],
+ "text/plain": [
+ "alt.VConcatChart(...)"
+ ]
+ },
+ "execution_count": 24,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "profile_columns(sec_match_df[MATCH_COLS], db_api=DuckDBAPI(), top_n=10, bottom_n=5)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "id": "d9cbc91b-b061-461b-b1f2-83036c70d7c7",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
+ ],
+ "text/plain": [
+ "alt.VConcatChart(...)"
+ ]
+ },
+ "execution_count": 25,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "profile_columns(eia_match_df[MATCH_COLS], db_api=DuckDBAPI(), top_n=10, bottom_n=5)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "69f5fc54-f479-495c-86fc-48accda883d0",
+ "metadata": {},
+ "source": [
+ "## Blocking"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "id": "1e6d3d24-f8ce-4118-9d31-d4f237d28237",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'number_of_comparisons_generated_pre_filter_conditions': 487944,\n",
+ " 'number_of_comparisons_to_be_scored_post_filter_conditions': 487944,\n",
+ " 'filter_conditions_identified': '',\n",
+ " 'equi_join_conditions_identified': 'SUBSTRING(l.company_name_mphone, 1, 4) = SUBSTRING(r.company_name_mphone, 1, 4)',\n",
+ " 'link_type_join_condition': 'where l.\"source_dataset\" || \\'-__-\\' || l.\"record_id\" < r.\"source_dataset\" || \\'-__-\\' || r.\"record_id\" and l.\"source_dataset\" != r.\"source_dataset\"'}"
+ ]
+ },
+ "execution_count": 26,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# useful for experimenting with a new blocking rule\n",
+ "counts = count_comparisons_from_blocking_rule(\n",
+ " table_or_tables=[sec_match_df, eia_match_df],\n",
+ " blocking_rule=BLOCKING_RULES[0],\n",
+ " link_type=\"link_only\",\n",
+ " unique_id_column_name='record_id',\n",
+ " db_api=db_api,\n",
+ ")\n",
+ "\n",
+ "counts"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "id": "c37c117a-0fa2-4b8a-9fae-da3569895ca3",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " key_0 | \n",
+ " count_l | \n",
+ " count_r | \n",
+ " block_count | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " INTR | \n",
+ " 445 | \n",
+ " 76 | \n",
+ " 33820 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " AMRK | \n",
+ " 851 | \n",
+ " 38 | \n",
+ " 32338 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " FRST | \n",
+ " 816 | \n",
+ " 36 | \n",
+ " 29376 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " key_0 count_l count_r block_count\n",
+ "0 INTR 445 76 33820\n",
+ "1 AMRK 851 38 32338\n",
+ "2 FRST 816 36 29376"
+ ]
+ },
+ "execution_count": 27,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "result = n_largest_blocks(\n",
+ " table_or_tables=[sec_match_df, eia_match_df],\n",
+ " blocking_rule=BLOCKING_RULES[0],\n",
+ " link_type=\"link_only\",\n",
+ " db_api=db_api,\n",
+ " n_largest=3\n",
+ ")\n",
+ "\n",
+ "result.as_pandas_dataframe()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 28,
+ "id": "4e1a9844-5d98-4cac-a083-eef134f083ce",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
+ ],
+ "text/plain": [
+ "alt.Chart(...)"
+ ]
+ },
+ "execution_count": 28,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "cumulative_comparisons_to_be_scored_from_blocking_rules_chart(\n",
+ " table_or_tables=[sec_match_df, eia_match_df],\n",
+ " blocking_rules=BLOCKING_RULES,\n",
+ " db_api=db_api,\n",
+ " unique_id_column_name='record_id',\n",
+ " link_type=\"link_only\",\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "377b0017-e46f-4d06-8cb5-af2b7725fc0e",
+ "metadata": {},
+ "source": [
+ "## Create Model"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "id": "f72f546c-c338-4c9f-aab1-ba03c3169e18",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Comparison 'NameComparison' of \"company_name_no_legal\".\n",
+ "Similarity is assessed using the following ComparisonLevels:\n",
+ " - 'company_name_no_legal is NULL' with SQL rule: \"company_name_no_legal_l\" IS NULL OR \"company_name_no_legal_r\" IS NULL\n",
+ " - 'Exact match on company_name_no_legal' with SQL rule: \"company_name_no_legal_l\" = \"company_name_no_legal_r\"\n",
+ " - 'Jaro-Winkler distance of company_name_no_legal >= 0.95' with SQL rule: jaro_winkler_similarity(\"company_name_no_legal_l\", \"company_name_no_legal_r\") >= 0.95\n",
+ " - 'All other comparisons' with SQL rule: ELSE\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(company_name_comparison.get_comparison(\"duckdb\").human_readable_description)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 30,
+ "id": "4298a288-c306-4d75-9d72-e5b8f87774ce",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Comparison 'LevenshteinAtThresholds' of \"street_address\".\n",
+ "Similarity is assessed using the following ComparisonLevels:\n",
+ " - 'street_address is NULL' with SQL rule: \"street_address_l\" IS NULL OR \"street_address_r\" IS NULL\n",
+ " - 'Exact match on street_address' with SQL rule: \"street_address_l\" = \"street_address_r\"\n",
+ " - 'Levenshtein distance of street_address <= 1' with SQL rule: levenshtein(\"street_address_l\", \"street_address_r\") <= 1\n",
+ " - 'All other comparisons' with SQL rule: ELSE\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(address_comparison.get_comparison(\"duckdb\").human_readable_description)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 31,
+ "id": "afdd5872-bc29-406f-bd0a-d5f4436f6794",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Comparison 'ExactMatch' of \"state\".\n",
+ "Similarity is assessed using the following ComparisonLevels:\n",
+ " - 'state is NULL' with SQL rule: \"state_l\" IS NULL OR \"state_r\" IS NULL\n",
+ " - 'Exact match on state' with SQL rule: \"state_l\" = \"state_r\"\n",
+ " - 'All other comparisons' with SQL rule: ELSE\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(state_comparison.get_comparison(\"duckdb\").human_readable_description)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 32,
+ "id": "90596d17-edb4-4ed1-9306-ea6c33ad00c6",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Comparison 'NameComparison' of \"city\".\n",
+ "Similarity is assessed using the following ComparisonLevels:\n",
+ " - 'city is NULL' with SQL rule: \"city_l\" IS NULL OR \"city_r\" IS NULL\n",
+ " - 'Exact match on city' with SQL rule: \"city_l\" = \"city_r\"\n",
+ " - 'Jaro-Winkler distance of city >= 0.9' with SQL rule: jaro_winkler_similarity(\"city_l\", \"city_r\") >= 0.9\n",
+ " - 'All other comparisons' with SQL rule: ELSE\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(city_comparison.get_comparison(\"duckdb\").human_readable_description)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 33,
+ "id": "a946c820-9b93-41f1-9fc2-6da47d0cf407",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "settings = SettingsCreator(\n",
+ " link_type=\"link_only\",\n",
+ " unique_id_column_name=\"record_id\",\n",
+ " comparisons=[\n",
+ " company_name_comparison,\n",
+ " address_comparison,\n",
+ " state_comparison,\n",
+ " city_comparison\n",
+ " ],\n",
+ " blocking_rules_to_generate_predictions=BLOCKING_RULES,\n",
+ " retain_intermediate_calculation_columns=True,\n",
+ ")\n",
+ "\n",
+ "linker = Linker([sec_match_df, eia_match_df], settings, db_api=DuckDBAPI())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 34,
+ "id": "36cae876-783d-4bff-89df-9d30cc5e60d6",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Probability two random records match is estimated to be 2.37e-06.\n",
+ "This means that amongst all possible pairwise record comparisons, one in 421,176.28 are expected to match. With 1,270,622,346 total possible comparisons, we expect a total of around 3,016.84 matching pairs\n"
+ ]
+ }
+ ],
+ "source": [
+ "linker.training.estimate_probability_two_random_records_match(deterministic_blocking_rules, recall=0.95)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 35,
+ "id": "a5190bda-070d-4d8e-a6db-be7c65b04db3",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "----- Estimating u probabilities using random sampling -----\n"
+ ]
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "c4bcd9c2605a413aab003a2484a4a006",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "b15bb7a15e37447ba1366278db3ab2bd",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "Estimated u probabilities using random sampling\n",
+ "\n",
+ "Your model is not yet fully trained. Missing estimates for:\n",
+ " - company_name_no_legal (no m values are trained).\n",
+ " - street_address (no m values are trained).\n",
+ " - state (no m values are trained).\n",
+ " - city (no m values are trained).\n"
+ ]
+ }
+ ],
+ "source": [
+ "linker.training.estimate_u_using_random_sampling(max_pairs=1e8)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 36,
+ "id": "f8dd1336-8a5b-49d2-a054-df0c0a0e953f",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "----- Starting EM training session -----\n",
+ "\n",
+ "Estimating the m probabilities of the model by blocking on:\n",
+ "(l.\"company_name\" = r.\"company_name\") AND (l.\"company_name\" = r.\"company_name\")\n",
+ "\n",
+ "Parameter estimates will be made for the following comparison(s):\n",
+ " - company_name_no_legal\n",
+ " - street_address\n",
+ " - state\n",
+ " - city\n",
+ "\n",
+ "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n",
+ "\n",
+ "WARNING:\n",
+ "Level Jaro-Winkler distance of company_name_no_legal >= 0.95 on comparison company_name_no_legal not observed in dataset, unable to train m value\n",
+ "\n",
+ "WARNING:\n",
+ "Level All other comparisons on comparison company_name_no_legal not observed in dataset, unable to train m value\n",
+ "\n",
+ "Iteration 1: Largest change in params was 0.702 in the m_probability of street_address, level `All other comparisons`\n",
+ "Iteration 2: Largest change in params was 0.283 in probability_two_random_records_match\n",
+ "Iteration 3: Largest change in params was 0.282 in probability_two_random_records_match\n",
+ "Iteration 4: Largest change in params was 0.000535 in probability_two_random_records_match\n",
+ "Iteration 5: Largest change in params was 1.09e-07 in probability_two_random_records_match\n",
+ "\n",
+ "EM converged after 5 iterations\n",
+ "m probability not trained for company_name_no_legal - Jaro-Winkler distance of company_name_no_legal >= 0.95 (comparison vector value: 1). This usually means the comparison level was never observed in the training data.\n",
+ "m probability not trained for company_name_no_legal - All other comparisons (comparison vector value: 0). This usually means the comparison level was never observed in the training data.\n",
+ "\n",
+ "Your model is not yet fully trained. Missing estimates for:\n",
+ " - company_name_no_legal (some m values are not trained).\n"
+ ]
+ }
+ ],
+ "source": [
+ "training_blocking_rule = block_on(\"company_name\", \"company_name\")\n",
+ "training_session_fname_sname = (\n",
+ " linker.training.estimate_parameters_using_expectation_maximisation(training_blocking_rule)\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 37,
+ "id": "9581aa18-3352-429a-86c4-6078bcf13a55",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "----- Starting EM training session -----\n",
+ "\n",
+ "Estimating the m probabilities of the model by blocking on:\n",
+ "(l.\"street_address\" = r.\"street_address\") AND (l.\"street_address\" = r.\"street_address\")\n",
+ "\n",
+ "Parameter estimates will be made for the following comparison(s):\n",
+ " - company_name_no_legal\n",
+ " - state\n",
+ " - city\n",
+ "\n",
+ "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n",
+ " - street_address\n",
+ "\n",
+ "Iteration 1: Largest change in params was -0.967 in the m_probability of company_name_no_legal, level `Exact match on company_name_no_legal`\n",
+ "Iteration 2: Largest change in params was 0.476 in probability_two_random_records_match\n",
+ "Iteration 3: Largest change in params was 0.0397 in probability_two_random_records_match\n",
+ "Iteration 4: Largest change in params was 0.0442 in the m_probability of city, level `All other comparisons`\n",
+ "Iteration 5: Largest change in params was 0.0194 in probability_two_random_records_match\n",
+ "Iteration 6: Largest change in params was 0.00729 in probability_two_random_records_match\n",
+ "Iteration 7: Largest change in params was 0.00274 in probability_two_random_records_match\n",
+ "Iteration 8: Largest change in params was 0.00104 in probability_two_random_records_match\n",
+ "Iteration 9: Largest change in params was 0.000398 in probability_two_random_records_match\n",
+ "Iteration 10: Largest change in params was 0.000153 in probability_two_random_records_match\n",
+ "Iteration 11: Largest change in params was 5.88e-05 in probability_two_random_records_match\n",
+ "\n",
+ "EM converged after 11 iterations\n",
+ "\n",
+ "Your model is fully trained. All comparisons have at least one estimate for their m and u values\n"
+ ]
+ }
+ ],
+ "source": [
+ "training_blocking_rule = block_on(\"street_address\", \"street_address\")\n",
+ "training_session_fname_sname = (\n",
+ " linker.training.estimate_parameters_using_expectation_maximisation(training_blocking_rule)\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 38,
+ "id": "61298aa2-dbd4-4f2a-9c25-5f831d226d13",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
+ ],
+ "text/plain": [
+ "alt.VConcatChart(...)"
+ ]
+ },
+ "execution_count": 38,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "linker.visualisations.match_weights_chart()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 39,
+ "id": "f365f59e-e4d0-44f3-a1fb-62e0d63d7ba3",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
+ ],
+ "text/plain": [
+ "alt.HConcatChart(...)"
+ ]
+ },
+ "execution_count": 39,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "linker.visualisations.m_u_parameters_chart()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 420,
+ "id": "fedb78e1-ee73-4d1e-8a96-3b27f6561a91",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# you could save the model weights like this\n",
+ "settings = linker.misc.save_model_to_json(\n",
+ " \"model_unsupervised_0.json\", overwrite=True\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "31f9d73d-cfa4-41fa-906f-c8501a29283b",
+ "metadata": {},
+ "source": [
+ "## Make Predictions"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 40,
+ "id": "94e96441-89b6-4516-aa6a-4d1593ce03be",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Blocking time: 0.16 seconds\n",
+ "Predict time: 0.26 seconds\n"
+ ]
+ }
+ ],
+ "source": [
+ "df_predictions = linker.inference.predict()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 41,
+ "id": "722effbc-24e8-45ca-aa64-8cdcd75846e0",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "preds_df = df_predictions.as_pandas_dataframe()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 42,
+ "id": "21b19a11-15c6-46ab-bd73-7c4752f7e25e",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " match_weight | \n",
+ " match_probability | \n",
+ " source_dataset_l | \n",
+ " source_dataset_r | \n",
+ " record_id_l | \n",
+ " record_id_r | \n",
+ " company_name_no_legal_l | \n",
+ " company_name_no_legal_r | \n",
+ " gamma_company_name_no_legal | \n",
+ " tf_company_name_no_legal_l | \n",
+ " tf_company_name_no_legal_r | \n",
+ " bf_company_name_no_legal | \n",
+ " bf_tf_adj_company_name_no_legal | \n",
+ " street_address_l | \n",
+ " street_address_r | \n",
+ " gamma_street_address | \n",
+ " tf_street_address_l | \n",
+ " tf_street_address_r | \n",
+ " bf_street_address | \n",
+ " bf_tf_adj_street_address | \n",
+ " state_l | \n",
+ " state_r | \n",
+ " gamma_state | \n",
+ " tf_state_l | \n",
+ " tf_state_r | \n",
+ " bf_state | \n",
+ " bf_tf_adj_state | \n",
+ " city_l | \n",
+ " city_r | \n",
+ " gamma_city | \n",
+ " tf_city_l | \n",
+ " tf_city_r | \n",
+ " bf_city | \n",
+ " bf_tf_adj_city | \n",
+ " company_name_mphone_l | \n",
+ " company_name_mphone_r | \n",
+ " match_key | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 295287 | \n",
+ " -22.967975 | \n",
+ " 1.218850e-07 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 56230 | \n",
+ " 19078 | \n",
+ " union pacific | \n",
+ " union electric | \n",
+ " 0 | \n",
+ " 0.000049 | \n",
+ " 0.000098 | \n",
+ " 0.986046 | \n",
+ " 1.000000 | \n",
+ " 1416 dodge st | \n",
+ " mc 1400 | \n",
+ " 0 | \n",
+ " 0.000049 | \n",
+ " 0.000049 | \n",
+ " 0.881658 | \n",
+ " 1.000000 | \n",
+ " ne | \n",
+ " mo | \n",
+ " 0 | \n",
+ " 0.006455 | \n",
+ " 0.010118 | \n",
+ " 0.199012 | \n",
+ " 1.000000 | \n",
+ " omaha | \n",
+ " st louis | \n",
+ " 0 | \n",
+ " 0.003448 | \n",
+ " 0.002764 | \n",
+ " 0.296714 | \n",
+ " 1.000000 | \n",
+ " UNN PSFK | \n",
+ " UNN ELKTRK | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 384509 | \n",
+ " -22.967975 | \n",
+ " 1.218850e-07 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 56484 | \n",
+ " 19138 | \n",
+ " united states lime and minerals | \n",
+ " united water conservation | \n",
+ " 0 | \n",
+ " 0.000037 | \n",
+ " 0.000024 | \n",
+ " 0.986046 | \n",
+ " 1.000000 | \n",
+ " 5429 lbj fwy | \n",
+ " 1701 north lombard st | \n",
+ " 0 | \n",
+ " 0.000024 | \n",
+ " 0.000012 | \n",
+ " 0.881658 | \n",
+ " 1.000000 | \n",
+ " tx | \n",
+ " ca | \n",
+ " 0 | \n",
+ " 0.079841 | \n",
+ " 0.157960 | \n",
+ " 0.199012 | \n",
+ " 1.000000 | \n",
+ " dallas | \n",
+ " oxnard | \n",
+ " 0 | \n",
+ " 0.013855 | \n",
+ " 0.000257 | \n",
+ " 0.296714 | \n",
+ " 1.000000 | \n",
+ " UNTT STTS LM ANT MNRLS | \n",
+ " UNTT WTR KNSRFXN | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 384504 | \n",
+ " -22.967975 | \n",
+ " 1.218850e-07 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 56436 | \n",
+ " 19138 | \n",
+ " united rentals | \n",
+ " united water conservation | \n",
+ " 0 | \n",
+ " 0.000024 | \n",
+ " 0.000024 | \n",
+ " 0.986046 | \n",
+ " 1.000000 | \n",
+ " 100 first stamford pl | \n",
+ " 1701 north lombard st | \n",
+ " 0 | \n",
+ " 0.000122 | \n",
+ " 0.000012 | \n",
+ " 0.881658 | \n",
+ " 1.000000 | \n",
+ " ct | \n",
+ " ca | \n",
+ " 0 | \n",
+ " 0.020876 | \n",
+ " 0.157960 | \n",
+ " 0.199012 | \n",
+ " 1.000000 | \n",
+ " stamford | \n",
+ " oxnard | \n",
+ " 0 | \n",
+ " 0.003950 | \n",
+ " 0.000257 | \n",
+ " 0.296714 | \n",
+ " 1.000000 | \n",
+ " UNTT RNTLS | \n",
+ " UNTT WTR KNSRFXN | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 384503 | \n",
+ " -22.967975 | \n",
+ " 1.218850e-07 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 56424 | \n",
+ " 19138 | \n",
+ " united parcel service | \n",
+ " united water conservation | \n",
+ " 0 | \n",
+ " 0.000024 | \n",
+ " 0.000024 | \n",
+ " 0.986046 | \n",
+ " 1.000000 | \n",
+ " 55 glenlake pkwy ne | \n",
+ " 1701 north lombard st | \n",
+ " 0 | \n",
+ " 0.000012 | \n",
+ " 0.000012 | \n",
+ " 0.881658 | \n",
+ " 1.000000 | \n",
+ " ga | \n",
+ " ca | \n",
+ " 0 | \n",
+ " 0.018626 | \n",
+ " 0.157960 | \n",
+ " 0.199012 | \n",
+ " 1.000000 | \n",
+ " atlanta | \n",
+ " oxnard | \n",
+ " 0 | \n",
+ " 0.008462 | \n",
+ " 0.000257 | \n",
+ " 0.296714 | \n",
+ " 1.000000 | \n",
+ " UNTT PRSL SRFS | \n",
+ " UNTT WTR KNSRFXN | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 384502 | \n",
+ " -22.967975 | \n",
+ " 1.218850e-07 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 56312 | \n",
+ " 19138 | \n",
+ " united bancorp /oh/ | \n",
+ " united water conservation | \n",
+ " 0 | \n",
+ " 0.000024 | \n",
+ " 0.000024 | \n",
+ " 0.986046 | \n",
+ " 1.000000 | \n",
+ " 201 south fourth st | \n",
+ " 1701 north lombard st | \n",
+ " 0 | \n",
+ " 0.000012 | \n",
+ " 0.000012 | \n",
+ " 0.881658 | \n",
+ " 1.000000 | \n",
+ " oh | \n",
+ " ca | \n",
+ " 0 | \n",
+ " 0.016991 | \n",
+ " 0.157960 | \n",
+ " 0.199012 | \n",
+ " 1.000000 | \n",
+ " martins ferry | \n",
+ " oxnard | \n",
+ " 0 | \n",
+ " 0.000024 | \n",
+ " 0.000257 | \n",
+ " 0.296714 | \n",
+ " 1.000000 | \n",
+ " UNTT BNKRP | \n",
+ " UNTT WTR KNSRFXN | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 163815 | \n",
+ " 27.519606 | \n",
+ " 1.000000e+00 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 39816 | \n",
+ " 13109 | \n",
+ " northwestern public service | \n",
+ " northwestern public service | \n",
+ " 2 | \n",
+ " 0.000073 | \n",
+ " 0.000073 | \n",
+ " 415263.133269 | \n",
+ " 0.016616 | \n",
+ " 33 third st se | \n",
+ " 33 third st se | \n",
+ " 2 | \n",
+ " 0.000037 | \n",
+ " 0.000037 | \n",
+ " 9605.781694 | \n",
+ " 0.311992 | \n",
+ " sd | \n",
+ " sd | \n",
+ " 1 | \n",
+ " 0.001930 | \n",
+ " 0.001930 | \n",
+ " 15.445559 | \n",
+ " 27.217182 | \n",
+ " huron | \n",
+ " huron | \n",
+ " 2 | \n",
+ " 0.000073 | \n",
+ " 0.000073 | \n",
+ " 102.014123 | \n",
+ " 91.382644 | \n",
+ " NR0WSTRN PBLK SRFS | \n",
+ " NR0WSTRN PBLK SRFS | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 241593 | \n",
+ " 27.526514 | \n",
+ " 1.000000e+00 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 24650 | \n",
+ " 8047 | \n",
+ " green mountain power | \n",
+ " green mountain power | \n",
+ " 2 | \n",
+ " 0.000037 | \n",
+ " 0.000037 | \n",
+ " 415263.133269 | \n",
+ " 0.033231 | \n",
+ " 163 acorn ln | \n",
+ " 163 acorn ln | \n",
+ " 2 | \n",
+ " 0.000037 | \n",
+ " 0.000037 | \n",
+ " 9605.781694 | \n",
+ " 0.311992 | \n",
+ " vt | \n",
+ " vt | \n",
+ " 1 | \n",
+ " 0.001537 | \n",
+ " 0.001537 | \n",
+ " 15.445559 | \n",
+ " 34.184780 | \n",
+ " colchester | \n",
+ " colchester | \n",
+ " 2 | \n",
+ " 0.000183 | \n",
+ " 0.000183 | \n",
+ " 102.014123 | \n",
+ " 36.553058 | \n",
+ " KRN MNTN PWR | \n",
+ " KRN MNTN PWR | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 165487 | \n",
+ " 27.757338 | \n",
+ " 1.000000e+00 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 58842 | \n",
+ " 19906 | \n",
+ " wausau paper mills | \n",
+ " wausau paper mills | \n",
+ " 2 | \n",
+ " 0.000024 | \n",
+ " 0.000024 | \n",
+ " 415263.133269 | \n",
+ " 0.049847 | \n",
+ " one clarks is | \n",
+ " one clarks is | \n",
+ " 2 | \n",
+ " 0.000024 | \n",
+ " 0.000024 | \n",
+ " 9605.781694 | \n",
+ " 0.467987 | \n",
+ " wi | \n",
+ " wi | \n",
+ " 1 | \n",
+ " 0.008840 | \n",
+ " 0.008840 | \n",
+ " 15.445559 | \n",
+ " 5.943112 | \n",
+ " wausau | \n",
+ " wausau | \n",
+ " 2 | \n",
+ " 0.000061 | \n",
+ " 0.000061 | \n",
+ " 102.014123 | \n",
+ " 109.659173 | \n",
+ " WS PPR MLS | \n",
+ " WS PPR MLS | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 340414 | \n",
+ " 27.884365 | \n",
+ " 1.000000e+00 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 51567 | \n",
+ " 17450 | \n",
+ " st joseph light and power | \n",
+ " st joseph light and power | \n",
+ " 2 | \n",
+ " 0.000024 | \n",
+ " 0.000024 | \n",
+ " 415263.133269 | \n",
+ " 0.049847 | \n",
+ " 520 francis st | \n",
+ " 520 francis st | \n",
+ " 2 | \n",
+ " 0.000024 | \n",
+ " 0.000024 | \n",
+ " 9605.781694 | \n",
+ " 0.467987 | \n",
+ " mo | \n",
+ " mo | \n",
+ " 1 | \n",
+ " 0.010118 | \n",
+ " 0.010118 | \n",
+ " 15.445559 | \n",
+ " 5.192099 | \n",
+ " st joseph | \n",
+ " st joseph | \n",
+ " 2 | \n",
+ " 0.000049 | \n",
+ " 0.000049 | \n",
+ " 102.014123 | \n",
+ " 137.073967 | \n",
+ " ST JSF LT ANT PWR | \n",
+ " ST JSF LT ANT PWR | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 274760 | \n",
+ " 29.211012 | \n",
+ " 1.000000e+00 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 20588 | \n",
+ " 6741 | \n",
+ " fibermark | \n",
+ " fibermark | \n",
+ " 2 | \n",
+ " 0.000037 | \n",
+ " 0.000037 | \n",
+ " 415263.133269 | \n",
+ " 0.033231 | \n",
+ " 161 wellington rd | \n",
+ " 161 wellington rd | \n",
+ " 2 | \n",
+ " 0.000024 | \n",
+ " 0.000024 | \n",
+ " 9605.781694 | \n",
+ " 0.467987 | \n",
+ " vt | \n",
+ " vt | \n",
+ " 1 | \n",
+ " 0.001537 | \n",
+ " 0.001537 | \n",
+ " 15.445559 | \n",
+ " 34.184780 | \n",
+ " brattleboro | \n",
+ " brattleboro | \n",
+ " 2 | \n",
+ " 0.000086 | \n",
+ " 0.000086 | \n",
+ " 102.014123 | \n",
+ " 78.327981 | \n",
+ " FBRMRK | \n",
+ " FBRMRK | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
590575 rows × 37 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " match_weight match_probability source_dataset_l source_dataset_r record_id_l record_id_r company_name_no_legal_l company_name_no_legal_r gamma_company_name_no_legal tf_company_name_no_legal_l tf_company_name_no_legal_r bf_company_name_no_legal bf_tf_adj_company_name_no_legal street_address_l street_address_r gamma_street_address tf_street_address_l tf_street_address_r bf_street_address bf_tf_adj_street_address state_l state_r gamma_state tf_state_l tf_state_r bf_state bf_tf_adj_state city_l city_r gamma_city tf_city_l tf_city_r bf_city bf_tf_adj_city company_name_mphone_l company_name_mphone_r match_key\n",
+ "295287 -22.967975 1.218850e-07 __splink__input_table_0 __splink__input_table_1 56230 19078 union pacific union electric 0 0.000049 0.000098 0.986046 1.000000 1416 dodge st mc 1400 0 0.000049 0.000049 0.881658 1.000000 ne mo 0 0.006455 0.010118 0.199012 1.000000 omaha st louis 0 0.003448 0.002764 0.296714 1.000000 UNN PSFK UNN ELKTRK 0\n",
+ "384509 -22.967975 1.218850e-07 __splink__input_table_0 __splink__input_table_1 56484 19138 united states lime and minerals united water conservation 0 0.000037 0.000024 0.986046 1.000000 5429 lbj fwy 1701 north lombard st 0 0.000024 0.000012 0.881658 1.000000 tx ca 0 0.079841 0.157960 0.199012 1.000000 dallas oxnard 0 0.013855 0.000257 0.296714 1.000000 UNTT STTS LM ANT MNRLS UNTT WTR KNSRFXN 0\n",
+ "384504 -22.967975 1.218850e-07 __splink__input_table_0 __splink__input_table_1 56436 19138 united rentals united water conservation 0 0.000024 0.000024 0.986046 1.000000 100 first stamford pl 1701 north lombard st 0 0.000122 0.000012 0.881658 1.000000 ct ca 0 0.020876 0.157960 0.199012 1.000000 stamford oxnard 0 0.003950 0.000257 0.296714 1.000000 UNTT RNTLS UNTT WTR KNSRFXN 0\n",
+ "384503 -22.967975 1.218850e-07 __splink__input_table_0 __splink__input_table_1 56424 19138 united parcel service united water conservation 0 0.000024 0.000024 0.986046 1.000000 55 glenlake pkwy ne 1701 north lombard st 0 0.000012 0.000012 0.881658 1.000000 ga ca 0 0.018626 0.157960 0.199012 1.000000 atlanta oxnard 0 0.008462 0.000257 0.296714 1.000000 UNTT PRSL SRFS UNTT WTR KNSRFXN 0\n",
+ "384502 -22.967975 1.218850e-07 __splink__input_table_0 __splink__input_table_1 56312 19138 united bancorp /oh/ united water conservation 0 0.000024 0.000024 0.986046 1.000000 201 south fourth st 1701 north lombard st 0 0.000012 0.000012 0.881658 1.000000 oh ca 0 0.016991 0.157960 0.199012 1.000000 martins ferry oxnard 0 0.000024 0.000257 0.296714 1.000000 UNTT BNKRP UNTT WTR KNSRFXN 0\n",
+ "... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...\n",
+ "163815 27.519606 1.000000e+00 __splink__input_table_0 __splink__input_table_1 39816 13109 northwestern public service northwestern public service 2 0.000073 0.000073 415263.133269 0.016616 33 third st se 33 third st se 2 0.000037 0.000037 9605.781694 0.311992 sd sd 1 0.001930 0.001930 15.445559 27.217182 huron huron 2 0.000073 0.000073 102.014123 91.382644 NR0WSTRN PBLK SRFS NR0WSTRN PBLK SRFS 0\n",
+ "241593 27.526514 1.000000e+00 __splink__input_table_0 __splink__input_table_1 24650 8047 green mountain power green mountain power 2 0.000037 0.000037 415263.133269 0.033231 163 acorn ln 163 acorn ln 2 0.000037 0.000037 9605.781694 0.311992 vt vt 1 0.001537 0.001537 15.445559 34.184780 colchester colchester 2 0.000183 0.000183 102.014123 36.553058 KRN MNTN PWR KRN MNTN PWR 0\n",
+ "165487 27.757338 1.000000e+00 __splink__input_table_0 __splink__input_table_1 58842 19906 wausau paper mills wausau paper mills 2 0.000024 0.000024 415263.133269 0.049847 one clarks is one clarks is 2 0.000024 0.000024 9605.781694 0.467987 wi wi 1 0.008840 0.008840 15.445559 5.943112 wausau wausau 2 0.000061 0.000061 102.014123 109.659173 WS PPR MLS WS PPR MLS 0\n",
+ "340414 27.884365 1.000000e+00 __splink__input_table_0 __splink__input_table_1 51567 17450 st joseph light and power st joseph light and power 2 0.000024 0.000024 415263.133269 0.049847 520 francis st 520 francis st 2 0.000024 0.000024 9605.781694 0.467987 mo mo 1 0.010118 0.010118 15.445559 5.192099 st joseph st joseph 2 0.000049 0.000049 102.014123 137.073967 ST JSF LT ANT PWR ST JSF LT ANT PWR 0\n",
+ "274760 29.211012 1.000000e+00 __splink__input_table_0 __splink__input_table_1 20588 6741 fibermark fibermark 2 0.000037 0.000037 415263.133269 0.033231 161 wellington rd 161 wellington rd 2 0.000024 0.000024 9605.781694 0.467987 vt vt 1 0.001537 0.001537 15.445559 34.184780 brattleboro brattleboro 2 0.000086 0.000086 102.014123 78.327981 FBRMRK FBRMRK 0\n",
+ "\n",
+ "[590575 rows x 37 columns]"
+ ]
+ },
+ "execution_count": 42,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "preds_df.sort_values(by=\"match_probability\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 43,
+ "id": "c0b292c8-26ed-407a-866e-75851577d567",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# join on utility_id_eia and CIK\n",
+ "preds_validation_df = preds_df.merge(sec_df[[\"record_id\", \"sec_company_id\", \"central_index_key\", \"company_name_raw\"]],\n",
+ " how=\"left\",\n",
+ " left_on=\"record_id_l\",\n",
+ " right_on=\"record_id\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 44,
+ "id": "e8744d54-3cc6-434c-bbbe-a10d2d3cd9c0",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "preds_validation_df = preds_validation_df.merge(eia_df[[\"record_id\", \"utility_id_eia\"]],\n",
+ " how=\"left\",\n",
+ " left_on=\"record_id_r\",\n",
+ " right_on=\"record_id\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 45,
+ "id": "5103190c-3775-427f-a8f2-cc8a8f79892b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "preds_validation_df = preds_validation_df.sort_values(\n",
+ " by=[\"sec_company_id\", \"utility_id_eia\", \"match_probability\"], ascending=False\n",
+ ").drop_duplicates(subset=[\"sec_company_id\", \"utility_id_eia\"], keep=\"first\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 46,
+ "id": "8fa04f18-beff-4bdc-bbbb-705950eab5b8",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " match_weight | \n",
+ " match_probability | \n",
+ " source_dataset_l | \n",
+ " source_dataset_r | \n",
+ " record_id_l | \n",
+ " record_id_r | \n",
+ " company_name_no_legal_l | \n",
+ " company_name_no_legal_r | \n",
+ " gamma_company_name_no_legal | \n",
+ " tf_company_name_no_legal_l | \n",
+ " tf_company_name_no_legal_r | \n",
+ " bf_company_name_no_legal | \n",
+ " bf_tf_adj_company_name_no_legal | \n",
+ " street_address_l | \n",
+ " street_address_r | \n",
+ " gamma_street_address | \n",
+ " tf_street_address_l | \n",
+ " tf_street_address_r | \n",
+ " bf_street_address | \n",
+ " bf_tf_adj_street_address | \n",
+ " state_l | \n",
+ " state_r | \n",
+ " gamma_state | \n",
+ " tf_state_l | \n",
+ " tf_state_r | \n",
+ " bf_state | \n",
+ " bf_tf_adj_state | \n",
+ " city_l | \n",
+ " city_r | \n",
+ " gamma_city | \n",
+ " tf_city_l | \n",
+ " tf_city_r | \n",
+ " bf_city | \n",
+ " bf_tf_adj_city | \n",
+ " company_name_mphone_l | \n",
+ " company_name_mphone_r | \n",
+ " match_key | \n",
+ " record_id_x | \n",
+ " sec_company_id | \n",
+ " central_index_key | \n",
+ " company_name_raw | \n",
+ " record_id_y | \n",
+ " utility_id_eia | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 218797 | \n",
+ " 3.824578 | \n",
+ " 0.934072 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 14692 | \n",
+ " 6293 | \n",
+ " crane | \n",
+ " entergy nuclear power marketing | \n",
+ " 0 | \n",
+ " 0.000012 | \n",
+ " 0.000012 | \n",
+ " 0.986046 | \n",
+ " 1.0 | \n",
+ " 100 first stamford pl | \n",
+ " 100 first stamford pl | \n",
+ " 2 | \n",
+ " 0.000122 | \n",
+ " 0.000122 | \n",
+ " 9605.781694 | \n",
+ " 0.093597 | \n",
+ " ct | \n",
+ " ct | \n",
+ " 1 | \n",
+ " 0.020876 | \n",
+ " 0.020876 | \n",
+ " 15.445559 | \n",
+ " 2.516547 | \n",
+ " stamford | \n",
+ " stamford | \n",
+ " 2 | \n",
+ " 0.003950 | \n",
+ " 0.003950 | \n",
+ " 102.014123 | \n",
+ " 1.697510 | \n",
+ " KRN | \n",
+ " ENTRJ NKLR PWR MRKTNK | \n",
+ " 1 | \n",
+ " 14692 | \n",
+ " 0001944013 | \n",
+ " 0001944013 | \n",
+ " crane co | \n",
+ " 6293 | \n",
+ " 55243 | \n",
+ "
\n",
+ " \n",
+ " 220036 | \n",
+ " 4.619987 | \n",
+ " 0.960922 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 17752 | \n",
+ " 5535 | \n",
+ " dte electric securitization funding i | \n",
+ " dte sustainable generation | \n",
+ " 0 | \n",
+ " 0.000012 | \n",
+ " 0.000012 | \n",
+ " 0.986046 | \n",
+ " 1.0 | \n",
+ " one energy plz | \n",
+ " one energy plz | \n",
+ " 2 | \n",
+ " 0.000330 | \n",
+ " 0.000330 | \n",
+ " 9605.781694 | \n",
+ " 0.034666 | \n",
+ " mi | \n",
+ " mi | \n",
+ " 1 | \n",
+ " 0.015147 | \n",
+ " 0.015147 | \n",
+ " 15.445559 | \n",
+ " 3.468423 | \n",
+ " detroit | \n",
+ " detroit | \n",
+ " 2 | \n",
+ " 0.001162 | \n",
+ " 0.001162 | \n",
+ " 102.014123 | \n",
+ " 5.771535 | \n",
+ " TT ELKTRK SKRTSXN FNTNK I | \n",
+ " TT SSTNBL JNRXN | \n",
+ " 1 | \n",
+ " 17752 | \n",
+ " 0001876068 | \n",
+ " 0001876068 | \n",
+ " dte electric securitization funding i llc | \n",
+ " 5535 | \n",
+ " 64331 | \n",
+ "
\n",
+ " \n",
+ " 358152 | \n",
+ " 4.619987 | \n",
+ " 0.960922 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 17752 | \n",
+ " 5522 | \n",
+ " dte electric securitization funding i | \n",
+ " dte electric | \n",
+ " 0 | \n",
+ " 0.000012 | \n",
+ " 0.000037 | \n",
+ " 0.986046 | \n",
+ " 1.0 | \n",
+ " one energy plz | \n",
+ " one energy plz | \n",
+ " 2 | \n",
+ " 0.000330 | \n",
+ " 0.000330 | \n",
+ " 9605.781694 | \n",
+ " 0.034666 | \n",
+ " mi | \n",
+ " mi | \n",
+ " 1 | \n",
+ " 0.015147 | \n",
+ " 0.015147 | \n",
+ " 15.445559 | \n",
+ " 3.468423 | \n",
+ " detroit | \n",
+ " detroit | \n",
+ " 2 | \n",
+ " 0.001162 | \n",
+ " 0.001162 | \n",
+ " 102.014123 | \n",
+ " 5.771535 | \n",
+ " TT ELKTRK SKRTSXN FNTNK I | \n",
+ " TT ELKTRK | \n",
+ " 0 | \n",
+ " 17752 | \n",
+ " 0001876068 | \n",
+ " 0001876068 | \n",
+ " dte electric securitization funding i llc | \n",
+ " 5522 | \n",
+ " 5109 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " match_weight match_probability source_dataset_l source_dataset_r record_id_l record_id_r company_name_no_legal_l company_name_no_legal_r gamma_company_name_no_legal tf_company_name_no_legal_l tf_company_name_no_legal_r bf_company_name_no_legal bf_tf_adj_company_name_no_legal street_address_l street_address_r gamma_street_address tf_street_address_l tf_street_address_r bf_street_address bf_tf_adj_street_address state_l state_r gamma_state tf_state_l tf_state_r bf_state bf_tf_adj_state city_l city_r gamma_city tf_city_l tf_city_r bf_city bf_tf_adj_city company_name_mphone_l company_name_mphone_r match_key record_id_x sec_company_id central_index_key company_name_raw record_id_y utility_id_eia\n",
+ "218797 3.824578 0.934072 __splink__input_table_0 __splink__input_table_1 14692 6293 crane entergy nuclear power marketing 0 0.000012 0.000012 0.986046 1.0 100 first stamford pl 100 first stamford pl 2 0.000122 0.000122 9605.781694 0.093597 ct ct 1 0.020876 0.020876 15.445559 2.516547 stamford stamford 2 0.003950 0.003950 102.014123 1.697510 KRN ENTRJ NKLR PWR MRKTNK 1 14692 0001944013 0001944013 crane co 6293 55243\n",
+ "220036 4.619987 0.960922 __splink__input_table_0 __splink__input_table_1 17752 5535 dte electric securitization funding i dte sustainable generation 0 0.000012 0.000012 0.986046 1.0 one energy plz one energy plz 2 0.000330 0.000330 9605.781694 0.034666 mi mi 1 0.015147 0.015147 15.445559 3.468423 detroit detroit 2 0.001162 0.001162 102.014123 5.771535 TT ELKTRK SKRTSXN FNTNK I TT SSTNBL JNRXN 1 17752 0001876068 0001876068 dte electric securitization funding i llc 5535 64331\n",
+ "358152 4.619987 0.960922 __splink__input_table_0 __splink__input_table_1 17752 5522 dte electric securitization funding i dte electric 0 0.000012 0.000037 0.986046 1.0 one energy plz one energy plz 2 0.000330 0.000330 9605.781694 0.034666 mi mi 1 0.015147 0.015147 15.445559 3.468423 detroit detroit 2 0.001162 0.001162 102.014123 5.771535 TT ELKTRK SKRTSXN FNTNK I TT ELKTRK 0 17752 0001876068 0001876068 dte electric securitization funding i llc 5522 5109"
+ ]
+ },
+ "execution_count": 46,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "preds_validation_df[preds_validation_df.match_probability > .9].head(3)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "07fbec17-cef2-4b9c-a005-1623c65c5e20",
+ "metadata": {},
+ "source": [
+ "Figure out what to do about this validation CSV, maybe it should be part of package data? It's not a very big sample size and it's imperfect, so the metrics gained from it should be taken with a grain of salt."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 47,
+ "id": "11190456-12a9-49df-b863-7a6f674e39eb",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "validation_df = pd.read_csv(\"sec_eia_validation_set.csv\", dtype={\"central_index_key\": str})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 48,
+ "id": "5a57bae5-6188-4a6c-be9a-14a159f82d81",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "validation_df[\"central_index_key\"] = validation_df[\"central_index_key\"].str.zfill(10)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 49,
+ "id": "461725d0-28c8-43dd-bcd5-086b7d3f7d4b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "merged_df = validation_df.merge(\n",
+ " preds_validation_df[[\"record_id_l\", \"record_id_r\", \"central_index_key\", \"utility_id_eia\", \"match_probability\", \"gamma_company_name_no_legal\"]].drop_duplicates(keep=\"first\"),\n",
+ " how=\"left\",\n",
+ " on=[\"central_index_key\", \"utility_id_eia\"],\n",
+ " indicator=True\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 50,
+ "id": "4d45f339-7a5b-466a-81f5-c71e425a77df",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "merged_df[\"predicted_match\"] = merged_df[\"_merge\"].map({\"both\": 1, \"left_only\": 0})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 51,
+ "id": "182732ea-39c5-4fb4-9429-5f7aed781ef5",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "merged_df[\"predicted_match\"] = merged_df[\"predicted_match\"].where(\n",
+ " (merged_df.match_probability > .95),\n",
+ " 0\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 52,
+ "id": "bc058dbf-6f71-4978-90ce-5405edbbf9e5",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " central_index_key | \n",
+ " utility_id_eia | \n",
+ " sec_company_name | \n",
+ " eia_company_name | \n",
+ " match | \n",
+ " record_id_l | \n",
+ " record_id_r | \n",
+ " match_probability | \n",
+ " gamma_company_name_no_legal | \n",
+ " _merge | \n",
+ " predicted_match | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 0000003153 | \n",
+ " 195 | \n",
+ " alabama power co | \n",
+ " NaN | \n",
+ " 1 | \n",
+ " 1701.0 | \n",
+ " 478.0 | \n",
+ " 1.000000 | \n",
+ " 2.0 | \n",
+ " both | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 0001868941 | \n",
+ " 58702 | \n",
+ " fluence energy, inc. | \n",
+ " Fluence | \n",
+ " 0 | \n",
+ " 21792.0 | \n",
+ " 6889.0 | \n",
+ " 0.016529 | \n",
+ " 0.0 | \n",
+ " both | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 0000041091 | \n",
+ " 7140 | \n",
+ " georgia power co | \n",
+ " NaN | \n",
+ " 1 | \n",
+ " 23416.0 | \n",
+ " 7653.0 | \n",
+ " 0.999997 | \n",
+ " 2.0 | \n",
+ " both | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 0000022198 | \n",
+ " 4062 | \n",
+ " columbus southern power co /oh/ | \n",
+ " Columbus Southern Power Co | \n",
+ " 1 | \n",
+ " 13310.0 | \n",
+ " 4281.0 | \n",
+ " 0.999982 | \n",
+ " 1.0 | \n",
+ " both | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 0001326160 | \n",
+ " 5416 | \n",
+ " duke energy corp | \n",
+ " NaN | \n",
+ " 1 | \n",
+ " 17793.0 | \n",
+ " 5564.0 | \n",
+ " 0.927293 | \n",
+ " 2.0 | \n",
+ " both | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " 0000030371 | \n",
+ " 54905 | \n",
+ " duke energy carolinas, llc | \n",
+ " Duke Energy Carolinas LLC | \n",
+ " 1 | \n",
+ " 17790.0 | \n",
+ " 5558.0 | \n",
+ " 0.999987 | \n",
+ " 2.0 | \n",
+ " both | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " 0000869446 | \n",
+ " 57140 | \n",
+ " berkshire realty co inc /de | \n",
+ " Berkshire Wind Power Cooperative Corp | \n",
+ " 0 | \n",
+ " 7449.0 | \n",
+ " 1712.0 | \n",
+ " 0.001912 | \n",
+ " 0.0 | \n",
+ " both | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 7 | \n",
+ " 0000092122 | \n",
+ " 18195 | \n",
+ " southern co | \n",
+ " southern co services inc | \n",
+ " 0 | \n",
+ " 50964.0 | \n",
+ " 17068.0 | \n",
+ " 0.007216 | \n",
+ " 0.0 | \n",
+ " both | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 8 | \n",
+ " 0000092122 | \n",
+ " 17650 | \n",
+ " southern co | \n",
+ " Southern Power Co | \n",
+ " 0 | \n",
+ " 50963.0 | \n",
+ " 17089.0 | \n",
+ " 0.034232 | \n",
+ " 0.0 | \n",
+ " both | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 9 | \n",
+ " 0000075488 | \n",
+ " 14328 | \n",
+ " pacific gas & electric co | \n",
+ " NaN | \n",
+ " 1 | \n",
+ " 41598.0 | \n",
+ " 13933.0 | \n",
+ " 0.999948 | \n",
+ " 2.0 | \n",
+ " both | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 10 | \n",
+ " 0001031296 | \n",
+ " 6526 | \n",
+ " firstenergy corp | \n",
+ " FirstEnergy | \n",
+ " 0 | \n",
+ " 21579.0 | \n",
+ " 6776.0 | \n",
+ " 0.999998 | \n",
+ " 2.0 | \n",
+ " both | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 11 | \n",
+ " 0001031296 | \n",
+ " 54776 | \n",
+ " firstenergy corp | \n",
+ " FirstEnergy Nuclear Generation Corp | \n",
+ " 0 | \n",
+ " 21579.0 | \n",
+ " 6780.0 | \n",
+ " 0.986542 | \n",
+ " 0.0 | \n",
+ " both | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 12 | \n",
+ " 0001031296 | \n",
+ " 6458 | \n",
+ " firstenergy corp | \n",
+ " First Energy Services | \n",
+ " 0 | \n",
+ " 21579.0 | \n",
+ " 6763.0 | \n",
+ " 0.085466 | \n",
+ " 0.0 | \n",
+ " both | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 13 | \n",
+ " 0001031296 | \n",
+ " 32208 | \n",
+ " firstenergy corp | \n",
+ " First Energy Corp | \n",
+ " 1 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " left_only | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 14 | \n",
+ " 0000100122 | \n",
+ " 24211 | \n",
+ " tucson electric power co | \n",
+ " NaN | \n",
+ " 1 | \n",
+ " 55725.0 | \n",
+ " 18901.0 | \n",
+ " 1.000000 | \n",
+ " 2.0 | \n",
+ " both | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 15 | \n",
+ " 0000096271 | \n",
+ " 18454 | \n",
+ " tampa electric co | \n",
+ " NaN | \n",
+ " 1 | \n",
+ " 53604.0 | \n",
+ " 18180.0 | \n",
+ " 0.991059 | \n",
+ " 2.0 | \n",
+ " both | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 16 | \n",
+ " 0000715957 | \n",
+ " 5248 | \n",
+ " dominion energy, inc | \n",
+ " NaN | \n",
+ " 1 | \n",
+ " 17484.0 | \n",
+ " 5386.0 | \n",
+ " 0.999985 | \n",
+ " 2.0 | \n",
+ " both | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 17 | \n",
+ " 0001013871 | \n",
+ " 59883 | \n",
+ " nrg energy, inc | \n",
+ " NRG Energy Gas & Wind Holdings Inc | \n",
+ " 0 | \n",
+ " 40084.0 | \n",
+ " 13240.0 | \n",
+ " 0.300165 | \n",
+ " 0.0 | \n",
+ " both | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 18 | \n",
+ " 0001013871 | \n",
+ " 13377 | \n",
+ " nrg energy inc | \n",
+ " NRG Energy Inc | \n",
+ " 1 | \n",
+ " 40084.0 | \n",
+ " 13243.0 | \n",
+ " 0.999813 | \n",
+ " 2.0 | \n",
+ " both | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 19 | \n",
+ " 0000788816 | \n",
+ " 13994 | \n",
+ " oglethorpe power corp | \n",
+ " NaN | \n",
+ " 1 | \n",
+ " 40576.0 | \n",
+ " 13515.0 | \n",
+ " 1.000000 | \n",
+ " 2.0 | \n",
+ " both | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 20 | \n",
+ " 0000018675 | \n",
+ " 3266 | \n",
+ " central maine power co | \n",
+ " NaN | \n",
+ " 1 | \n",
+ " 10876.0 | \n",
+ " 3424.0 | \n",
+ " 1.000000 | \n",
+ " 2.0 | \n",
+ " both | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 21 | \n",
+ " 0001032208 | \n",
+ " 61296 | \n",
+ " sempra energy | \n",
+ " Sempra Generation | \n",
+ " 1 | \n",
+ " 49303.0 | \n",
+ " 16270.0 | \n",
+ " 0.559071 | \n",
+ " 0.0 | \n",
+ " both | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 22 | \n",
+ " 0000004904 | \n",
+ " 488 | \n",
+ " american electric power co inc | \n",
+ " American Electric Power Inc | \n",
+ " 1 | \n",
+ " 2927.0 | \n",
+ " 793.0 | \n",
+ " 0.996076 | \n",
+ " 2.0 | \n",
+ " both | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 23 | \n",
+ " 0000715957 | \n",
+ " 5248 | \n",
+ " dominion energy, inc | \n",
+ " Dominion Energy Inc. | \n",
+ " 1 | \n",
+ " 17484.0 | \n",
+ " 5386.0 | \n",
+ " 0.999985 | \n",
+ " 2.0 | \n",
+ " both | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " central_index_key utility_id_eia sec_company_name eia_company_name match record_id_l record_id_r match_probability gamma_company_name_no_legal _merge predicted_match\n",
+ "0 0000003153 195 alabama power co NaN 1 1701.0 478.0 1.000000 2.0 both 1.0\n",
+ "1 0001868941 58702 fluence energy, inc. Fluence 0 21792.0 6889.0 0.016529 0.0 both 0.0\n",
+ "2 0000041091 7140 georgia power co NaN 1 23416.0 7653.0 0.999997 2.0 both 1.0\n",
+ "3 0000022198 4062 columbus southern power co /oh/ Columbus Southern Power Co 1 13310.0 4281.0 0.999982 1.0 both 1.0\n",
+ "4 0001326160 5416 duke energy corp NaN 1 17793.0 5564.0 0.927293 2.0 both 0.0\n",
+ "5 0000030371 54905 duke energy carolinas, llc Duke Energy Carolinas LLC 1 17790.0 5558.0 0.999987 2.0 both 1.0\n",
+ "6 0000869446 57140 berkshire realty co inc /de Berkshire Wind Power Cooperative Corp 0 7449.0 1712.0 0.001912 0.0 both 0.0\n",
+ "7 0000092122 18195 southern co southern co services inc 0 50964.0 17068.0 0.007216 0.0 both 0.0\n",
+ "8 0000092122 17650 southern co Southern Power Co 0 50963.0 17089.0 0.034232 0.0 both 0.0\n",
+ "9 0000075488 14328 pacific gas & electric co NaN 1 41598.0 13933.0 0.999948 2.0 both 1.0\n",
+ "10 0001031296 6526 firstenergy corp FirstEnergy 0 21579.0 6776.0 0.999998 2.0 both 1.0\n",
+ "11 0001031296 54776 firstenergy corp FirstEnergy Nuclear Generation Corp 0 21579.0 6780.0 0.986542 0.0 both 1.0\n",
+ "12 0001031296 6458 firstenergy corp First Energy Services 0 21579.0 6763.0 0.085466 0.0 both 0.0\n",
+ "13 0001031296 32208 firstenergy corp First Energy Corp 1 NaN NaN NaN NaN left_only 0.0\n",
+ "14 0000100122 24211 tucson electric power co NaN 1 55725.0 18901.0 1.000000 2.0 both 1.0\n",
+ "15 0000096271 18454 tampa electric co NaN 1 53604.0 18180.0 0.991059 2.0 both 1.0\n",
+ "16 0000715957 5248 dominion energy, inc NaN 1 17484.0 5386.0 0.999985 2.0 both 1.0\n",
+ "17 0001013871 59883 nrg energy, inc NRG Energy Gas & Wind Holdings Inc 0 40084.0 13240.0 0.300165 0.0 both 0.0\n",
+ "18 0001013871 13377 nrg energy inc NRG Energy Inc 1 40084.0 13243.0 0.999813 2.0 both 1.0\n",
+ "19 0000788816 13994 oglethorpe power corp NaN 1 40576.0 13515.0 1.000000 2.0 both 1.0\n",
+ "20 0000018675 3266 central maine power co NaN 1 10876.0 3424.0 1.000000 2.0 both 1.0\n",
+ "21 0001032208 61296 sempra energy Sempra Generation 1 49303.0 16270.0 0.559071 0.0 both 0.0\n",
+ "22 0000004904 488 american electric power co inc American Electric Power Inc 1 2927.0 793.0 0.996076 2.0 both 1.0\n",
+ "23 0000715957 5248 dominion energy, inc Dominion Energy Inc. 1 17484.0 5386.0 0.999985 2.0 both 1.0"
+ ]
+ },
+ "execution_count": 52,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "merged_df.head(50)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 53,
+ "id": "2fc6ed05-5b35-4856-a668-f16e253e7fea",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "precision = precision_score(merged_df['match'], merged_df['predicted_match'])\n",
+ "recall = recall_score(merged_df['match'], merged_df['predicted_match'])\n",
+ "accuracy = accuracy_score(merged_df['match'], merged_df['predicted_match'])\n",
+ "# roc_auc = roc_auc_score(merged_df['match'], merged_df['match_probability'])\n",
+ "\n",
+ "# Confusion matrix\n",
+ "conf_matrix = confusion_matrix(merged_df['match'], merged_df['predicted_match'])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 54,
+ "id": "99a7bc64-9f32-4f53-9a89-63a31ff7debe",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(np.float64(0.8666666666666667), np.float64(0.8125), 0.7916666666666666)"
+ ]
+ },
+ "execution_count": 54,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "precision, recall, accuracy"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 55,
+ "id": "08932be5-b90c-440d-9efb-156cb4d63c93",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Predicted Negative | \n",
+ " Predicted Positive | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " Negative | \n",
+ " 6 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " Positive | \n",
+ " 3 | \n",
+ " 13 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Predicted Negative Predicted Positive\n",
+ "Negative 6 2\n",
+ "Positive 3 13"
+ ]
+ },
+ "execution_count": 55,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "pd.DataFrame(\n",
+ " conf_matrix,\n",
+ " index=[\"Negative\", \"Positive\"],\n",
+ " columns=[\"Predicted Negative\", \"Predicted Positive\"]\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 56,
+ "id": "025c80e9-5055-4eaa-a873-38b910cd7f94",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "incorrect_df = merged_df[merged_df.match != merged_df.predicted_match]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 57,
+ "id": "e4f44fda-1b55-4f9c-a96d-5eec36dac768",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " central_index_key | \n",
+ " utility_id_eia | \n",
+ " sec_company_name | \n",
+ " eia_company_name | \n",
+ " match | \n",
+ " record_id_l | \n",
+ " record_id_r | \n",
+ " match_probability | \n",
+ " gamma_company_name_no_legal | \n",
+ " _merge | \n",
+ " predicted_match | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 4 | \n",
+ " 0001326160 | \n",
+ " 5416 | \n",
+ " duke energy corp | \n",
+ " NaN | \n",
+ " 1 | \n",
+ " 17793.0 | \n",
+ " 5564.0 | \n",
+ " 0.927293 | \n",
+ " 2.0 | \n",
+ " both | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 10 | \n",
+ " 0001031296 | \n",
+ " 6526 | \n",
+ " firstenergy corp | \n",
+ " FirstEnergy | \n",
+ " 0 | \n",
+ " 21579.0 | \n",
+ " 6776.0 | \n",
+ " 0.999998 | \n",
+ " 2.0 | \n",
+ " both | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 11 | \n",
+ " 0001031296 | \n",
+ " 54776 | \n",
+ " firstenergy corp | \n",
+ " FirstEnergy Nuclear Generation Corp | \n",
+ " 0 | \n",
+ " 21579.0 | \n",
+ " 6780.0 | \n",
+ " 0.986542 | \n",
+ " 0.0 | \n",
+ " both | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 13 | \n",
+ " 0001031296 | \n",
+ " 32208 | \n",
+ " firstenergy corp | \n",
+ " First Energy Corp | \n",
+ " 1 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " left_only | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 21 | \n",
+ " 0001032208 | \n",
+ " 61296 | \n",
+ " sempra energy | \n",
+ " Sempra Generation | \n",
+ " 1 | \n",
+ " 49303.0 | \n",
+ " 16270.0 | \n",
+ " 0.559071 | \n",
+ " 0.0 | \n",
+ " both | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " central_index_key utility_id_eia sec_company_name eia_company_name match record_id_l record_id_r match_probability gamma_company_name_no_legal _merge predicted_match\n",
+ "4 0001326160 5416 duke energy corp NaN 1 17793.0 5564.0 0.927293 2.0 both 0.0\n",
+ "10 0001031296 6526 firstenergy corp FirstEnergy 0 21579.0 6776.0 0.999998 2.0 both 1.0\n",
+ "11 0001031296 54776 firstenergy corp FirstEnergy Nuclear Generation Corp 0 21579.0 6780.0 0.986542 0.0 both 1.0\n",
+ "13 0001031296 32208 firstenergy corp First Energy Corp 1 NaN NaN NaN NaN left_only 0.0\n",
+ "21 0001032208 61296 sempra energy Sempra Generation 1 49303.0 16270.0 0.559071 0.0 both 0.0"
+ ]
+ },
+ "execution_count": 57,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "incorrect_df"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 151,
+ "id": "c425a676-aa6e-4d8f-b814-931da392c2ff",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Collect the full prediction row(s) behind each misclassified validation pair so\n",
+ "# they can be passed to the waterfall chart.\n",
+ "recs_to_view = []\n",
+ "for _, rec in incorrect_df.iterrows():\n",
+ "    pair_rows = preds_validation_df[\n",
+ "        (preds_validation_df.record_id_l == rec.record_id_l)\n",
+ "        & (preds_validation_df.record_id_r == rec.record_id_r)\n",
+ "    ]\n",
+ "    # orient=\"records\" yields one flat dict per matching row: nothing for pairs with\n",
+ "    # no prediction (NaN record IDs never compare equal), and one dict per row when a\n",
+ "    # pair appears more than once, rather than the nested column->index mapping that\n",
+ "    # squeeze().to_dict() returns for a multi-row selection.\n",
+ "    recs_to_view.extend(pair_rows.to_dict(orient=\"records\"))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 152,
+ "id": "ff55f2cb-7ce1-4697-99e7-bf22918f7ed1",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
+ ],
+ "text/plain": [
+ "alt.LayerChart(...)"
+ ]
+ },
+ "execution_count": 152,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "linker.visualisations.waterfall_chart(recs_to_view, filter_nulls=True)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a2ba43b6-a664-462a-823f-e3f08585bb51",
+ "metadata": {},
+ "source": [
+ "# Save good predictions\n",
+ "Make the predictions one-to-one, considering only pairs with a match probability of at least 0.95. First, keep the highest-probability EIA utility ID for each SEC company. Then, keep the highest-probability SEC company for each EIA utility."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 58,
+ "id": "92172e2f-39ba-49e3-8312-98597256ca4f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "one_to_one_preds = preds_validation_df[preds_validation_df.match_probability >= .95].sort_values(\n",
+ " by=\"match_probability\", ascending=False\n",
+ ").drop_duplicates(\n",
+ " subset=\"sec_company_id\", keep=\"first\"\n",
+ ").drop_duplicates(\n",
+ " subset=\"utility_id_eia\", keep=\"first\"\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 59,
+ "id": "07ca81ae-1b26-4cd3-ade6-75381028028a",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "534"
+ ]
+ },
+ "execution_count": 59,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "len(one_to_one_preds)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "c3db3175-7cf3-497c-8f22-e68a6c9c6af2",
+ "metadata": {},
+ "source": [
+ "# Add `utility_id_eia` onto the SEC table to create output table"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 60,
+ "id": "361b3e30-e823-4137-9062-6a00eae537fe",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " match_weight | \n",
+ " match_probability | \n",
+ " source_dataset_l | \n",
+ " source_dataset_r | \n",
+ " record_id_l | \n",
+ " record_id_r | \n",
+ " company_name_no_legal_l | \n",
+ " company_name_no_legal_r | \n",
+ " gamma_company_name_no_legal | \n",
+ " tf_company_name_no_legal_l | \n",
+ " tf_company_name_no_legal_r | \n",
+ " bf_company_name_no_legal | \n",
+ " bf_tf_adj_company_name_no_legal | \n",
+ " street_address_l | \n",
+ " street_address_r | \n",
+ " gamma_street_address | \n",
+ " tf_street_address_l | \n",
+ " tf_street_address_r | \n",
+ " bf_street_address | \n",
+ " bf_tf_adj_street_address | \n",
+ " state_l | \n",
+ " state_r | \n",
+ " gamma_state | \n",
+ " tf_state_l | \n",
+ " tf_state_r | \n",
+ " bf_state | \n",
+ " bf_tf_adj_state | \n",
+ " city_l | \n",
+ " city_r | \n",
+ " gamma_city | \n",
+ " tf_city_l | \n",
+ " tf_city_r | \n",
+ " bf_city | \n",
+ " bf_tf_adj_city | \n",
+ " company_name_mphone_l | \n",
+ " company_name_mphone_r | \n",
+ " match_key | \n",
+ " record_id_x | \n",
+ " sec_company_id | \n",
+ " central_index_key | \n",
+ " company_name_raw | \n",
+ " record_id_y | \n",
+ " utility_id_eia | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 274760 | \n",
+ " 29.211012 | \n",
+ " 1.000000 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 20588 | \n",
+ " 6741 | \n",
+ " fibermark | \n",
+ " fibermark | \n",
+ " 2 | \n",
+ " 0.000037 | \n",
+ " 0.000037 | \n",
+ " 415263.133269 | \n",
+ " 0.033231 | \n",
+ " 161 wellington rd | \n",
+ " 161 wellington rd | \n",
+ " 2 | \n",
+ " 0.000024 | \n",
+ " 0.000024 | \n",
+ " 9605.781694 | \n",
+ " 0.467987 | \n",
+ " vt | \n",
+ " vt | \n",
+ " 1 | \n",
+ " 0.001537 | \n",
+ " 0.001537 | \n",
+ " 15.445559 | \n",
+ " 34.184780 | \n",
+ " brattleboro | \n",
+ " brattleboro | \n",
+ " 2 | \n",
+ " 0.000086 | \n",
+ " 0.000086 | \n",
+ " 102.014123 | \n",
+ " 78.327981 | \n",
+ " FBRMRK | \n",
+ " FBRMRK | \n",
+ " 0 | \n",
+ " 20588 | \n",
+ " 0000887591 | \n",
+ " 0000887591 | \n",
+ " fibermark inc | \n",
+ " 6741 | \n",
+ " 6309 | \n",
+ "
\n",
+ " \n",
+ " 340414 | \n",
+ " 27.884365 | \n",
+ " 1.000000 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 51567 | \n",
+ " 17450 | \n",
+ " st joseph light and power | \n",
+ " st joseph light and power | \n",
+ " 2 | \n",
+ " 0.000024 | \n",
+ " 0.000024 | \n",
+ " 415263.133269 | \n",
+ " 0.049847 | \n",
+ " 520 francis st | \n",
+ " 520 francis st | \n",
+ " 2 | \n",
+ " 0.000024 | \n",
+ " 0.000024 | \n",
+ " 9605.781694 | \n",
+ " 0.467987 | \n",
+ " mo | \n",
+ " mo | \n",
+ " 1 | \n",
+ " 0.010118 | \n",
+ " 0.010118 | \n",
+ " 15.445559 | \n",
+ " 5.192099 | \n",
+ " st joseph | \n",
+ " st joseph | \n",
+ " 2 | \n",
+ " 0.000049 | \n",
+ " 0.000049 | \n",
+ " 102.014123 | \n",
+ " 137.073967 | \n",
+ " ST JSF LT ANT PWR | \n",
+ " ST JSF LT ANT PWR | \n",
+ " 0 | \n",
+ " 51567 | \n",
+ " 0000086251 | \n",
+ " 0000086251 | \n",
+ " st joseph light & power co | \n",
+ " 17450 | \n",
+ " 17881 | \n",
+ "
\n",
+ " \n",
+ " 165487 | \n",
+ " 27.757338 | \n",
+ " 1.000000 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 58842 | \n",
+ " 19906 | \n",
+ " wausau paper mills | \n",
+ " wausau paper mills | \n",
+ " 2 | \n",
+ " 0.000024 | \n",
+ " 0.000024 | \n",
+ " 415263.133269 | \n",
+ " 0.049847 | \n",
+ " one clarks is | \n",
+ " one clarks is | \n",
+ " 2 | \n",
+ " 0.000024 | \n",
+ " 0.000024 | \n",
+ " 9605.781694 | \n",
+ " 0.467987 | \n",
+ " wi | \n",
+ " wi | \n",
+ " 1 | \n",
+ " 0.008840 | \n",
+ " 0.008840 | \n",
+ " 15.445559 | \n",
+ " 5.943112 | \n",
+ " wausau | \n",
+ " wausau | \n",
+ " 2 | \n",
+ " 0.000061 | \n",
+ " 0.000061 | \n",
+ " 102.014123 | \n",
+ " 109.659173 | \n",
+ " WS PPR MLS | \n",
+ " WS PPR MLS | \n",
+ " 0 | \n",
+ " 58842 | \n",
+ " 0000105076 | \n",
+ " 0000105076 | \n",
+ " wausau paper mills co | \n",
+ " 19906 | \n",
+ " 20190 | \n",
+ "
\n",
+ " \n",
+ " 241593 | \n",
+ " 27.526514 | \n",
+ " 1.000000 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 24650 | \n",
+ " 8047 | \n",
+ " green mountain power | \n",
+ " green mountain power | \n",
+ " 2 | \n",
+ " 0.000037 | \n",
+ " 0.000037 | \n",
+ " 415263.133269 | \n",
+ " 0.033231 | \n",
+ " 163 acorn ln | \n",
+ " 163 acorn ln | \n",
+ " 2 | \n",
+ " 0.000037 | \n",
+ " 0.000037 | \n",
+ " 9605.781694 | \n",
+ " 0.311992 | \n",
+ " vt | \n",
+ " vt | \n",
+ " 1 | \n",
+ " 0.001537 | \n",
+ " 0.001537 | \n",
+ " 15.445559 | \n",
+ " 34.184780 | \n",
+ " colchester | \n",
+ " colchester | \n",
+ " 2 | \n",
+ " 0.000183 | \n",
+ " 0.000183 | \n",
+ " 102.014123 | \n",
+ " 36.553058 | \n",
+ " KRN MNTN PWR | \n",
+ " KRN MNTN PWR | \n",
+ " 0 | \n",
+ " 24650 | \n",
+ " 0000043704 | \n",
+ " 0000043704 | \n",
+ " green mountain power corp | \n",
+ " 8047 | \n",
+ " 7601 | \n",
+ "
\n",
+ " \n",
+ " 163815 | \n",
+ " 27.519606 | \n",
+ " 1.000000 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 39816 | \n",
+ " 13109 | \n",
+ " northwestern public service | \n",
+ " northwestern public service | \n",
+ " 2 | \n",
+ " 0.000073 | \n",
+ " 0.000073 | \n",
+ " 415263.133269 | \n",
+ " 0.016616 | \n",
+ " 33 third st se | \n",
+ " 33 third st se | \n",
+ " 2 | \n",
+ " 0.000037 | \n",
+ " 0.000037 | \n",
+ " 9605.781694 | \n",
+ " 0.311992 | \n",
+ " sd | \n",
+ " sd | \n",
+ " 1 | \n",
+ " 0.001930 | \n",
+ " 0.001930 | \n",
+ " 15.445559 | \n",
+ " 27.217182 | \n",
+ " huron | \n",
+ " huron | \n",
+ " 2 | \n",
+ " 0.000073 | \n",
+ " 0.000073 | \n",
+ " 102.014123 | \n",
+ " 91.382644 | \n",
+ " NR0WSTRN PBLK SRFS | \n",
+ " NR0WSTRN PBLK SRFS | \n",
+ " 0 | \n",
+ " 39816 | \n",
+ " 0000073088 | \n",
+ " 0000073088 | \n",
+ " northwestern public service co | \n",
+ " 13109 | \n",
+ " 13809 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 1483 | \n",
+ " 4.337121 | \n",
+ " 0.952856 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 58004 | \n",
+ " 17611 | \n",
+ " vistacare | \n",
+ " stirling energy systems solar three | \n",
+ " 0 | \n",
+ " 0.000024 | \n",
+ " 0.000037 | \n",
+ " 0.986046 | \n",
+ " 1.000000 | \n",
+ " 4800 n scottsdale rd | \n",
+ " 4800 n scottsdale rd | \n",
+ " 2 | \n",
+ " 0.000110 | \n",
+ " 0.000110 | \n",
+ " 9605.781694 | \n",
+ " 0.103997 | \n",
+ " az | \n",
+ " az | \n",
+ " 1 | \n",
+ " 0.012872 | \n",
+ " 0.012872 | \n",
+ " 15.445559 | \n",
+ " 4.081277 | \n",
+ " scottsdale | \n",
+ " scottsdale | \n",
+ " 2 | \n",
+ " 0.004989 | \n",
+ " 0.004989 | \n",
+ " 102.014123 | \n",
+ " 1.343862 | \n",
+ " FSTKR | \n",
+ " STRLNK ENRJ SSTMS SLR 0R | \n",
+ " 1 | \n",
+ " 58004 | \n",
+ " 0000787030 | \n",
+ " 0000787030 | \n",
+ " vistacare, inc. | \n",
+ " 17611 | \n",
+ " 56168 | \n",
+ "
\n",
+ " \n",
+ " 218453 | \n",
+ " 4.272157 | \n",
+ " 0.950792 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 19174 | \n",
+ " 7605 | \n",
+ " enovis | \n",
+ " genon sabine delaware | \n",
+ " 0 | \n",
+ " 0.000012 | \n",
+ " 0.000012 | \n",
+ " 0.986046 | \n",
+ " 1.000000 | \n",
+ " 2711 centerville rd | \n",
+ " 2711 centerville rd | \n",
+ " 2 | \n",
+ " 0.000061 | \n",
+ " 0.000061 | \n",
+ " 9605.781694 | \n",
+ " 0.187195 | \n",
+ " de | \n",
+ " de | \n",
+ " 1 | \n",
+ " 0.011717 | \n",
+ " 0.011717 | \n",
+ " 15.445559 | \n",
+ " 4.483838 | \n",
+ " wilmington | \n",
+ " wilmington | \n",
+ " 2 | \n",
+ " 0.010321 | \n",
+ " 0.010321 | \n",
+ " 102.014123 | \n",
+ " 0.649640 | \n",
+ " ENFS | \n",
+ " JNN SBN TLWR | \n",
+ " 1 | \n",
+ " 19174 | \n",
+ " 0001420800 | \n",
+ " 0001420800 | \n",
+ " enovis corp | \n",
+ " 7605 | \n",
+ " 56922 | \n",
+ "
\n",
+ " \n",
+ " 1055 | \n",
+ " 4.272157 | \n",
+ " 0.950792 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 1650 | \n",
+ " 16368 | \n",
+ " aisystems | \n",
+ " shannon wind | \n",
+ " 0 | \n",
+ " 0.000024 | \n",
+ " 0.000024 | \n",
+ " 0.986046 | \n",
+ " 1.000000 | \n",
+ " 2711 centerville rd | \n",
+ " 2711 centerville rd | \n",
+ " 2 | \n",
+ " 0.000061 | \n",
+ " 0.000061 | \n",
+ " 9605.781694 | \n",
+ " 0.187195 | \n",
+ " de | \n",
+ " de | \n",
+ " 1 | \n",
+ " 0.011717 | \n",
+ " 0.011717 | \n",
+ " 15.445559 | \n",
+ " 4.483838 | \n",
+ " wilmington | \n",
+ " wilmington | \n",
+ " 2 | \n",
+ " 0.010321 | \n",
+ " 0.010321 | \n",
+ " 102.014123 | \n",
+ " 0.649640 | \n",
+ " ASSTMS | \n",
+ " XNN WNT | \n",
+ " 1 | \n",
+ " 1650 | \n",
+ " 0001328769 | \n",
+ " 0001328769 | \n",
+ " aisystems, inc. | \n",
+ " 16368 | \n",
+ " 58872 | \n",
+ "
\n",
+ " \n",
+ " 7216 | \n",
+ " 4.272157 | \n",
+ " 0.950792 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 32403 | \n",
+ " 14089 | \n",
+ " lease investment flight trust | \n",
+ " pasadena statutory trust | \n",
+ " 0 | \n",
+ " 0.000012 | \n",
+ " 0.000012 | \n",
+ " 0.986046 | \n",
+ " 1.000000 | \n",
+ " 1100 north market st | \n",
+ " 1100 north market st | \n",
+ " 2 | \n",
+ " 0.000061 | \n",
+ " 0.000061 | \n",
+ " 9605.781694 | \n",
+ " 0.187195 | \n",
+ " de | \n",
+ " de | \n",
+ " 1 | \n",
+ " 0.011717 | \n",
+ " 0.011717 | \n",
+ " 15.445559 | \n",
+ " 4.483838 | \n",
+ " wilmington | \n",
+ " wilmington | \n",
+ " 2 | \n",
+ " 0.010321 | \n",
+ " 0.010321 | \n",
+ " 102.014123 | \n",
+ " 0.649640 | \n",
+ " LS INFSTMNT FLT TRST | \n",
+ " PSTN STTTR TRST | \n",
+ " 1 | \n",
+ " 32403 | \n",
+ " 0001158389 | \n",
+ " 0001158389 | \n",
+ " lease investment flight trust | \n",
+ " 14089 | \n",
+ " 61235 | \n",
+ "
\n",
+ " \n",
+ " 6113 | \n",
+ " 4.272157 | \n",
+ " 0.950792 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " 1626 | \n",
+ " 16195 | \n",
+ " airplanes us trust | \n",
+ " se solar trust v c | \n",
+ " 0 | \n",
+ " 0.000012 | \n",
+ " 0.000012 | \n",
+ " 0.986046 | \n",
+ " 1.000000 | \n",
+ " 1100 north market st | \n",
+ " 1100 north market st | \n",
+ " 2 | \n",
+ " 0.000061 | \n",
+ " 0.000061 | \n",
+ " 9605.781694 | \n",
+ " 0.187195 | \n",
+ " de | \n",
+ " de | \n",
+ " 1 | \n",
+ " 0.011717 | \n",
+ " 0.011717 | \n",
+ " 15.445559 | \n",
+ " 4.483838 | \n",
+ " wilmington | \n",
+ " wilmington | \n",
+ " 2 | \n",
+ " 0.010321 | \n",
+ " 0.010321 | \n",
+ " 102.014123 | \n",
+ " 0.649640 | \n",
+ " ARPLNS US TRST | \n",
+ " S SLR TRST F K | \n",
+ " 1 | \n",
+ " 1626 | \n",
+ " 0001004540 | \n",
+ " 0001004540 | \n",
+ " airplanes us trust | \n",
+ " 16195 | \n",
+ " 56900 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
534 rows × 43 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " match_weight match_probability source_dataset_l source_dataset_r record_id_l record_id_r company_name_no_legal_l company_name_no_legal_r gamma_company_name_no_legal tf_company_name_no_legal_l tf_company_name_no_legal_r bf_company_name_no_legal bf_tf_adj_company_name_no_legal street_address_l street_address_r gamma_street_address tf_street_address_l tf_street_address_r bf_street_address bf_tf_adj_street_address state_l state_r gamma_state tf_state_l tf_state_r bf_state bf_tf_adj_state city_l city_r gamma_city tf_city_l tf_city_r bf_city bf_tf_adj_city company_name_mphone_l company_name_mphone_r match_key record_id_x sec_company_id central_index_key company_name_raw record_id_y utility_id_eia\n",
+ "274760 29.211012 1.000000 __splink__input_table_0 __splink__input_table_1 20588 6741 fibermark fibermark 2 0.000037 0.000037 415263.133269 0.033231 161 wellington rd 161 wellington rd 2 0.000024 0.000024 9605.781694 0.467987 vt vt 1 0.001537 0.001537 15.445559 34.184780 brattleboro brattleboro 2 0.000086 0.000086 102.014123 78.327981 FBRMRK FBRMRK 0 20588 0000887591 0000887591 fibermark inc 6741 6309\n",
+ "340414 27.884365 1.000000 __splink__input_table_0 __splink__input_table_1 51567 17450 st joseph light and power st joseph light and power 2 0.000024 0.000024 415263.133269 0.049847 520 francis st 520 francis st 2 0.000024 0.000024 9605.781694 0.467987 mo mo 1 0.010118 0.010118 15.445559 5.192099 st joseph st joseph 2 0.000049 0.000049 102.014123 137.073967 ST JSF LT ANT PWR ST JSF LT ANT PWR 0 51567 0000086251 0000086251 st joseph light & power co 17450 17881\n",
+ "165487 27.757338 1.000000 __splink__input_table_0 __splink__input_table_1 58842 19906 wausau paper mills wausau paper mills 2 0.000024 0.000024 415263.133269 0.049847 one clarks is one clarks is 2 0.000024 0.000024 9605.781694 0.467987 wi wi 1 0.008840 0.008840 15.445559 5.943112 wausau wausau 2 0.000061 0.000061 102.014123 109.659173 WS PPR MLS WS PPR MLS 0 58842 0000105076 0000105076 wausau paper mills co 19906 20190\n",
+ "241593 27.526514 1.000000 __splink__input_table_0 __splink__input_table_1 24650 8047 green mountain power green mountain power 2 0.000037 0.000037 415263.133269 0.033231 163 acorn ln 163 acorn ln 2 0.000037 0.000037 9605.781694 0.311992 vt vt 1 0.001537 0.001537 15.445559 34.184780 colchester colchester 2 0.000183 0.000183 102.014123 36.553058 KRN MNTN PWR KRN MNTN PWR 0 24650 0000043704 0000043704 green mountain power corp 8047 7601\n",
+ "163815 27.519606 1.000000 __splink__input_table_0 __splink__input_table_1 39816 13109 northwestern public service northwestern public service 2 0.000073 0.000073 415263.133269 0.016616 33 third st se 33 third st se 2 0.000037 0.000037 9605.781694 0.311992 sd sd 1 0.001930 0.001930 15.445559 27.217182 huron huron 2 0.000073 0.000073 102.014123 91.382644 NR0WSTRN PBLK SRFS NR0WSTRN PBLK SRFS 0 39816 0000073088 0000073088 northwestern public service co 13109 13809\n",
+ "... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...\n",
+ "1483 4.337121 0.952856 __splink__input_table_0 __splink__input_table_1 58004 17611 vistacare stirling energy systems solar three 0 0.000024 0.000037 0.986046 1.000000 4800 n scottsdale rd 4800 n scottsdale rd 2 0.000110 0.000110 9605.781694 0.103997 az az 1 0.012872 0.012872 15.445559 4.081277 scottsdale scottsdale 2 0.004989 0.004989 102.014123 1.343862 FSTKR STRLNK ENRJ SSTMS SLR 0R 1 58004 0000787030 0000787030 vistacare, inc. 17611 56168\n",
+ "218453 4.272157 0.950792 __splink__input_table_0 __splink__input_table_1 19174 7605 enovis genon sabine delaware 0 0.000012 0.000012 0.986046 1.000000 2711 centerville rd 2711 centerville rd 2 0.000061 0.000061 9605.781694 0.187195 de de 1 0.011717 0.011717 15.445559 4.483838 wilmington wilmington 2 0.010321 0.010321 102.014123 0.649640 ENFS JNN SBN TLWR 1 19174 0001420800 0001420800 enovis corp 7605 56922\n",
+ "1055 4.272157 0.950792 __splink__input_table_0 __splink__input_table_1 1650 16368 aisystems shannon wind 0 0.000024 0.000024 0.986046 1.000000 2711 centerville rd 2711 centerville rd 2 0.000061 0.000061 9605.781694 0.187195 de de 1 0.011717 0.011717 15.445559 4.483838 wilmington wilmington 2 0.010321 0.010321 102.014123 0.649640 ASSTMS XNN WNT 1 1650 0001328769 0001328769 aisystems, inc. 16368 58872\n",
+ "7216 4.272157 0.950792 __splink__input_table_0 __splink__input_table_1 32403 14089 lease investment flight trust pasadena statutory trust 0 0.000012 0.000012 0.986046 1.000000 1100 north market st 1100 north market st 2 0.000061 0.000061 9605.781694 0.187195 de de 1 0.011717 0.011717 15.445559 4.483838 wilmington wilmington 2 0.010321 0.010321 102.014123 0.649640 LS INFSTMNT FLT TRST PSTN STTTR TRST 1 32403 0001158389 0001158389 lease investment flight trust 14089 61235\n",
+ "6113 4.272157 0.950792 __splink__input_table_0 __splink__input_table_1 1626 16195 airplanes us trust se solar trust v c 0 0.000012 0.000012 0.986046 1.000000 1100 north market st 1100 north market st 2 0.000061 0.000061 9605.781694 0.187195 de de 1 0.011717 0.011717 15.445559 4.483838 wilmington wilmington 2 0.010321 0.010321 102.014123 0.649640 ARPLNS US TRST S SLR TRST F K 1 1626 0001004540 0001004540 airplanes us trust 16195 56900\n",
+ "\n",
+ "[534 rows x 43 columns]"
+ ]
+ },
+ "execution_count": 60,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "one_to_one_preds"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 64,
+ "id": "1d3e41bd-f92a-4f77-a0a7-0bd24f7ea70c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "out_df = sec_df.merge(\n",
+ " one_to_one_preds[[\"sec_company_id\", \"utility_id_eia\"]],\n",
+ " how=\"left\",\n",
+ " on=\"sec_company_id\",\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 66,
+ "id": "cce2b383-48b3-4efd-977a-0c734b0e3ec2",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "utility_id_eia\n",
+ "True 59895\n",
+ "False 1131\n",
+ "Name: count, dtype: int64"
+ ]
+ },
+ "execution_count": 66,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "out_df.utility_id_eia.isnull().value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "1cf0be2e-b1ef-4eb1-a07a-28e977c40252",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "len(one_to_one_preds)"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "mozilla_sec_eia",
+ "language": "python",
+ "name": "mozilla_sec_eia"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.0"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/notebooks/20-kl-validate-sec-output-table.ipynb b/notebooks/20-kl-validate-sec-output-table.ipynb
new file mode 100644
index 0000000..061a227
--- /dev/null
+++ b/notebooks/20-kl-validate-sec-output-table.ipynb
@@ -0,0 +1,1456 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "d383d1dd-6cdc-45ea-a371-105046c009e2",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%load_ext autoreload\n",
+ "%autoreload 3"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "3c58ad67-151d-4054-a972-a1e7ee12949f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "from upath import UPath"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "511b2c77-ebd2-43b0-8e45-1d1c76fb321d",
+ "metadata": {},
+ "source": [
+ "### EIA"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "4907820f-2552-4a3b-866a-30c3181af91b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "eia_df = pd.read_parquet(\"gs://sec10k-outputs/v2/core_eia__parents_and_subsidiaries.parquet\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "5f488f86-4b34-4a94-985f-588f991ba86b",
+ "metadata": {},
+ "source": [
+ "### Ex. 21"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "c1795acc-8005-4b6d-be4d-27c722b634f1",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "ex21_df = pd.read_pickle(\"/Users/katielamb/CatalystCoop/dagster_home/storage/transformed_ex21_subsidiary_table\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "291ce873-4971-4e03-985a-65dbdd8b0850",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " sec_company_id | \n",
+ " company_name_raw | \n",
+ " location_of_inc | \n",
+ " own_per | \n",
+ " filename | \n",
+ " report_date | \n",
+ " report_year | \n",
+ " company_name | \n",
+ " company_name_no_legal | \n",
+ " company_name_mphone | \n",
+ " parent_company_cik | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 0000000020_colormax limited_united kingdom | \n",
+ " colormax limited | \n",
+ " united kingdom | \n",
+ " NaN | \n",
+ " edgar/data/20/0000893220-06-000650.txt | \n",
+ " 2006-03-23 | \n",
+ " 2006 | \n",
+ " colormax limited | \n",
+ " colormax | \n",
+ " KLRMKS | \n",
+ " 0000000020 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 0000000020_gundlach equipment corporation_dela... | \n",
+ " gundlach equipment corporation | \n",
+ " delaware | \n",
+ " NaN | \n",
+ " edgar/data/20/0000950123-10-024631.txt | \n",
+ " 2010-03-15 | \n",
+ " 2010 | \n",
+ " gundlach equipment corporation | \n",
+ " gundlach equipment | \n",
+ " KNTLX EKPMNT | \n",
+ " 0000000020 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 0000000020_jeffrey rader ab_sweden | \n",
+ " jeffrey rader ab | \n",
+ " sweden | \n",
+ " NaN | \n",
+ " edgar/data/20/0000950123-10-024631.txt | \n",
+ " 2010-03-15 | \n",
+ " 2010 | \n",
+ " jeffrey rader ab | \n",
+ " jeffrey rader ab | \n",
+ " JFR RTR AB | \n",
+ " 0000000020 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 0000000020_jeffrey rader canada company_canada | \n",
+ " jeffrey rader canada company | \n",
+ " canada | \n",
+ " NaN | \n",
+ " edgar/data/20/0000950123-10-024631.txt | \n",
+ " 2010-03-15 | \n",
+ " 2010 | \n",
+ " jeffrey rader canada company | \n",
+ " jeffrey rader canada | \n",
+ " JFR RTR KNT | \n",
+ " 0000000020 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 0000000020_jeffrey rader corporation_delaware | \n",
+ " jeffrey rader corporation | \n",
+ " delaware | \n",
+ " NaN | \n",
+ " edgar/data/20/0000950123-10-024631.txt | \n",
+ " 2010-03-15 | \n",
+ " 2010 | \n",
+ " jeffrey rader corporation | \n",
+ " jeffrey rader | \n",
+ " JFR RTR | \n",
+ " 0000000020 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 1055982 | \n",
+ " 0001967649_vestis supply chain limited liabili... | \n",
+ " vestis (supply chain), llc | \n",
+ " delaware | \n",
+ " NaN | \n",
+ " edgar/data/1967649/0001967649-23-000025.txt | \n",
+ " 2023-12-21 | \n",
+ " 2023 | \n",
+ " vestis supply chain limited liability company | \n",
+ " vestis supply chain | \n",
+ " FSTS SPL XN | \n",
+ " 0001967649 | \n",
+ "
\n",
+ " \n",
+ " 1055983 | \n",
+ " 0001967649_vestis syracuse limited liability c... | \n",
+ " vestis (syracuse), llc | \n",
+ " delaware | \n",
+ " NaN | \n",
+ " edgar/data/1967649/0001967649-23-000025.txt | \n",
+ " 2023-12-21 | \n",
+ " 2023 | \n",
+ " vestis syracuse limited liability company | \n",
+ " vestis syracuse | \n",
+ " FSTS SRKS | \n",
+ " 0001967649 | \n",
+ "
\n",
+ " \n",
+ " 1055984 | \n",
+ " 0001967649_vestis texas limited liability comp... | \n",
+ " vestis (texas), llc | \n",
+ " delaware | \n",
+ " NaN | \n",
+ " edgar/data/1967649/0001967649-23-000025.txt | \n",
+ " 2023-12-21 | \n",
+ " 2023 | \n",
+ " vestis texas limited liability company | \n",
+ " vestis texas | \n",
+ " FSTS TKSS | \n",
+ " 0001967649 | \n",
+ "
\n",
+ " \n",
+ " 1055985 | \n",
+ " 0001967649_vestis west adams limited liability... | \n",
+ " vestis (west adams), llc | \n",
+ " delaware | \n",
+ " NaN | \n",
+ " edgar/data/1967649/0001967649-23-000025.txt | \n",
+ " 2023-12-21 | \n",
+ " 2023 | \n",
+ " vestis west adams limited liability company | \n",
+ " vestis west adams | \n",
+ " FSTS WST ATMS | \n",
+ " 0001967649 | \n",
+ "
\n",
+ " \n",
+ " 1055986 | \n",
+ " 0001978811_gouverneur savings and loan associa... | \n",
+ " gouverneur savings and loan association | \n",
+ " new york | \n",
+ " 100.0 | \n",
+ " edgar/data/1978811/0001558370-23-020009.txt | \n",
+ " 2023-12-26 | \n",
+ " 2023 | \n",
+ " gouverneur savings and loan association | \n",
+ " gouverneur savings and loan | \n",
+ " KFRNR SFNKS ANT LN | \n",
+ " 0001978811 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
1055987 rows × 11 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " sec_company_id \\\n",
+ "0 0000000020_colormax limited_united kingdom \n",
+ "1 0000000020_gundlach equipment corporation_dela... \n",
+ "2 0000000020_jeffrey rader ab_sweden \n",
+ "3 0000000020_jeffrey rader canada company_canada \n",
+ "4 0000000020_jeffrey rader corporation_delaware \n",
+ "... ... \n",
+ "1055982 0001967649_vestis supply chain limited liabili... \n",
+ "1055983 0001967649_vestis syracuse limited liability c... \n",
+ "1055984 0001967649_vestis texas limited liability comp... \n",
+ "1055985 0001967649_vestis west adams limited liability... \n",
+ "1055986 0001978811_gouverneur savings and loan associa... \n",
+ "\n",
+ " company_name_raw location_of_inc own_per \\\n",
+ "0 colormax limited united kingdom NaN \n",
+ "1 gundlach equipment corporation delaware NaN \n",
+ "2 jeffrey rader ab sweden NaN \n",
+ "3 jeffrey rader canada company canada NaN \n",
+ "4 jeffrey rader corporation delaware NaN \n",
+ "... ... ... ... \n",
+ "1055982 vestis (supply chain), llc delaware NaN \n",
+ "1055983 vestis (syracuse), llc delaware NaN \n",
+ "1055984 vestis (texas), llc delaware NaN \n",
+ "1055985 vestis (west adams), llc delaware NaN \n",
+ "1055986 gouverneur savings and loan association new york 100.0 \n",
+ "\n",
+ " filename report_date report_year \\\n",
+ "0 edgar/data/20/0000893220-06-000650.txt 2006-03-23 2006 \n",
+ "1 edgar/data/20/0000950123-10-024631.txt 2010-03-15 2010 \n",
+ "2 edgar/data/20/0000950123-10-024631.txt 2010-03-15 2010 \n",
+ "3 edgar/data/20/0000950123-10-024631.txt 2010-03-15 2010 \n",
+ "4 edgar/data/20/0000950123-10-024631.txt 2010-03-15 2010 \n",
+ "... ... ... ... \n",
+ "1055982 edgar/data/1967649/0001967649-23-000025.txt 2023-12-21 2023 \n",
+ "1055983 edgar/data/1967649/0001967649-23-000025.txt 2023-12-21 2023 \n",
+ "1055984 edgar/data/1967649/0001967649-23-000025.txt 2023-12-21 2023 \n",
+ "1055985 edgar/data/1967649/0001967649-23-000025.txt 2023-12-21 2023 \n",
+ "1055986 edgar/data/1978811/0001558370-23-020009.txt 2023-12-26 2023 \n",
+ "\n",
+ " company_name \\\n",
+ "0 colormax limited \n",
+ "1 gundlach equipment corporation \n",
+ "2 jeffrey rader ab \n",
+ "3 jeffrey rader canada company \n",
+ "4 jeffrey rader corporation \n",
+ "... ... \n",
+ "1055982 vestis supply chain limited liability company \n",
+ "1055983 vestis syracuse limited liability company \n",
+ "1055984 vestis texas limited liability company \n",
+ "1055985 vestis west adams limited liability company \n",
+ "1055986 gouverneur savings and loan association \n",
+ "\n",
+ " company_name_no_legal company_name_mphone parent_company_cik \n",
+ "0 colormax KLRMKS 0000000020 \n",
+ "1 gundlach equipment KNTLX EKPMNT 0000000020 \n",
+ "2 jeffrey rader ab JFR RTR AB 0000000020 \n",
+ "3 jeffrey rader canada JFR RTR KNT 0000000020 \n",
+ "4 jeffrey rader JFR RTR 0000000020 \n",
+ "... ... ... ... \n",
+ "1055982 vestis supply chain FSTS SPL XN 0001967649 \n",
+ "1055983 vestis syracuse FSTS SRKS 0001967649 \n",
+ "1055984 vestis texas FSTS TKSS 0001967649 \n",
+ "1055985 vestis west adams FSTS WST ATMS 0001967649 \n",
+ "1055986 gouverneur savings and loan KFRNR SFNKS ANT LN 0001978811 \n",
+ "\n",
+ "[1055987 rows x 11 columns]"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "ex21_df"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "id": "304d929b-ce6c-4508-b511-475f287a6b37",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "merged_df = ex21_df.merge(\n",
+ " eia_df.drop_duplicates(subset=\"company_name\")[[\"company_name\", \"utility_id_eia\"]], how=\"left\", on=\"company_name\", suffixes=(\"_ex21\", \"_eia\")\n",
+ " )"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "id": "d315f8d5-7166-4161-bc4e-79c45ed3ad59",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(1055987, 20821)"
+ ]
+ },
+ "execution_count": 14,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "len(ex21_df), len(eia_df)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "id": "3aae6d2c-a941-478e-8178-84cf1321e0b3",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "utility_id_eia\n",
+ "True 1050887\n",
+ "False 5100\n",
+ "Name: count, dtype: int64"
+ ]
+ },
+ "execution_count": 27,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "merged_df.utility_id_eia.isnull().value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "id": "6aba0ae8-a8ee-47ef-8eb9-a0ef9f283b51",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "1675"
+ ]
+ },
+ "execution_count": 29,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "len(merged_df.utility_id_eia.unique())"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "8d178634-b494-4769-93e3-c0213e4a0326",
+ "metadata": {},
+ "source": [
+ "### Read in SEC output table"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "id": "25e8183d-3248-440c-aa4e-e7ee7db4c487",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# review outputs from Dagster\n",
+ "sec_out_df = pd.read_parquet(UPath(\"gs://sec10k-outputs/v2/out_sec_10k__parents_and_subsidiaries/2023q1.parquet\"))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "id": "3881bfbd-cdc3-4f9c-92af-9e74d7758e51",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " sec_company_id | \n",
+ " filename | \n",
+ " business_phone | \n",
+ " central_index_key | \n",
+ " city | \n",
+ " company_name | \n",
+ " date_of_name_change | \n",
+ " film_number | \n",
+ " fiscal_year_end | \n",
+ " form_type | \n",
+ " ... | \n",
+ " street_1 | \n",
+ " street_2 | \n",
+ " zip | \n",
+ " report_date | \n",
+ " report_year | \n",
+ " location_of_inc | \n",
+ " company_name_clean | \n",
+ " parent_company_cik | \n",
+ " own_per | \n",
+ " files_10k | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 0000001800 | \n",
+ " edgar/data/1800/0001628280-23-004026.txt | \n",
+ " 2246676100 | \n",
+ " 0000001800 | \n",
+ " abbott park | \n",
+ " abbott laboratories | \n",
+ " None | \n",
+ " 23642562 | \n",
+ " 1231 | \n",
+ " 10-k | \n",
+ " ... | \n",
+ " 100 abbott park road | \n",
+ " None | \n",
+ " 60064-3500 | \n",
+ " 2023-02-17 | \n",
+ " 2023 | \n",
+ " illinois | \n",
+ " abbott laboratories | \n",
+ " None | \n",
+ " None | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 0000001800_3a nutrition (vietnam) company limi... | \n",
+ " edgar/data/1800/0001628280-23-004026.txt | \n",
+ " None | \n",
+ " None | \n",
+ " None | \n",
+ " 3a nutrition (vietnam) company limited | \n",
+ " None | \n",
+ " None | \n",
+ " None | \n",
+ " None | \n",
+ " ... | \n",
+ " None | \n",
+ " None | \n",
+ " None | \n",
+ " 2023-02-17 | \n",
+ " 2023 | \n",
+ " viet nam | \n",
+ " 3a nutrition vietnam company limited | \n",
+ " 0000001800 | \n",
+ " None | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 0000001800_abbott (jiaxing) nutrition co., ltd... | \n",
+ " edgar/data/1800/0001628280-23-004026.txt | \n",
+ " None | \n",
+ " None | \n",
+ " None | \n",
+ " abbott (jiaxing) nutrition co., ltd | \n",
+ " None | \n",
+ " None | \n",
+ " None | \n",
+ " None | \n",
+ " ... | \n",
+ " None | \n",
+ " None | \n",
+ " None | \n",
+ " 2023-02-17 | \n",
+ " 2023 | \n",
+ " china | \n",
+ " abbott jiaxing nutrition co limited | \n",
+ " 0000001800 | \n",
+ " None | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 0000001800_abbott (shanghai) diagnostics sales... | \n",
+ " edgar/data/1800/0001628280-23-004026.txt | \n",
+ " None | \n",
+ " None | \n",
+ " None | \n",
+ " abbott (shanghai) diagnostics sales co., ltd | \n",
+ " None | \n",
+ " None | \n",
+ " None | \n",
+ " None | \n",
+ " ... | \n",
+ " None | \n",
+ " None | \n",
+ " None | \n",
+ " 2023-02-17 | \n",
+ " 2023 | \n",
+ " china | \n",
+ " abbott shanghai diagnostics sales co limited | \n",
+ " 0000001800 | \n",
+ " None | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 0000001800_abbott (uk) finance limited_united ... | \n",
+ " edgar/data/1800/0001628280-23-004026.txt | \n",
+ " None | \n",
+ " None | \n",
+ " None | \n",
+ " abbott (uk) finance limited | \n",
+ " None | \n",
+ " None | \n",
+ " None | \n",
+ " None | \n",
+ " ... | \n",
+ " None | \n",
+ " None | \n",
+ " None | \n",
+ " 2023-02-17 | \n",
+ " 2023 | \n",
+ " united kingdom | \n",
+ " abbott uk finance limited | \n",
+ " 0000001800 | \n",
+ " None | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 171358 | \n",
+ " 0001951118 | \n",
+ " edgar/data/1951118/0001853620-23-000117.txt | \n",
+ " (248) 991-6700 | \n",
+ " 0001951118 | \n",
+ " farmington hills | \n",
+ " mercedes-benz auto receivables trust 2022-1 | \n",
+ " None | \n",
+ " 23764946 | \n",
+ " 1231 | \n",
+ " 10-k | \n",
+ " ... | \n",
+ " 35555 w. twelve mile rd. | \n",
+ " suite 100 | \n",
+ " 48331 | \n",
+ " 2023-03-27 | \n",
+ " 2023 | \n",
+ " delaware | \n",
+ " mercedes benz auto receivables trust 2022 1 | \n",
+ " None | \n",
+ " None | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " 171359 | \n",
+ " 0001951752 | \n",
+ " edgar/data/1951752/0001951752-23-000016.txt | \n",
+ " 3135943495 | \n",
+ " 0001951752 | \n",
+ " dearborn | \n",
+ " ford credit auto owner trust 2022-d | \n",
+ " None | \n",
+ " 23751556 | \n",
+ " 1231 | \n",
+ " 10-k | \n",
+ " ... | \n",
+ " c/o ford motor co , whq ste 801-c1 | \n",
+ " one american road | \n",
+ " 48126 | \n",
+ " 2023-03-22 | \n",
+ " 2023 | \n",
+ " None | \n",
+ " ford credit auto owner trust 2022 d | \n",
+ " None | \n",
+ " None | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " 171360 | \n",
+ " 0001954336 | \n",
+ " edgar/data/1477336/0001954336-23-000024.txt | \n",
+ " 313-656-5500 | \n",
+ " 0001954336 | \n",
+ " wilmington | \n",
+ " ally auto receivables trust 2022-3 | \n",
+ " None | \n",
+ " 23759320 | \n",
+ " 1231 | \n",
+ " 10-k | \n",
+ " ... | \n",
+ " 1209 orange street | \n",
+ " None | \n",
+ " 19801 | \n",
+ " 2023-03-24 | \n",
+ " 2023 | \n",
+ " delaware | \n",
+ " ally auto receivables trust 2022 3 | \n",
+ " None | \n",
+ " None | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " 171361 | \n",
+ " 0001954436 | \n",
+ " edgar/data/1954436/0000929638-23-001050.txt | \n",
+ " (214) 572-8276 | \n",
+ " 0001954436 | \n",
+ " irving | \n",
+ " exeter automobile receivables trust 2022-6 | \n",
+ " None | \n",
+ " 23784761 | \n",
+ " 1231 | \n",
+ " 10-k | \n",
+ " ... | \n",
+ " 2101 w. john carpenter freeway | \n",
+ " None | \n",
+ " 75063 | \n",
+ " 2023-03-31 | \n",
+ " 2023 | \n",
+ " delaware | \n",
+ " exeter automobile receivables trust 2022 6 | \n",
+ " None | \n",
+ " None | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " 171362 | \n",
+ " 0001955010 | \n",
+ " edgar/data/1955010/0001140361-23-012122.txt | \n",
+ " 212-326-1500 | \n",
+ " 0001955010 | \n",
+ " new york | \n",
+ " oha senior private lending fund (u) llc | \n",
+ " None | \n",
+ " 23740150 | \n",
+ " 1231 | \n",
+ " 10-k | \n",
+ " ... | \n",
+ " one vanderbilt, 16th floor | \n",
+ " None | \n",
+ " 10017 | \n",
+ " 2023-03-17 | \n",
+ " 2023 | \n",
+ " delaware | \n",
+ " oha senior private lending fund u limited liab... | \n",
+ " None | \n",
+ " None | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
171363 rows × 27 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " sec_company_id \\\n",
+ "0 0000001800 \n",
+ "1 0000001800_3a nutrition (vietnam) company limi... \n",
+ "2 0000001800_abbott (jiaxing) nutrition co., ltd... \n",
+ "3 0000001800_abbott (shanghai) diagnostics sales... \n",
+ "4 0000001800_abbott (uk) finance limited_united ... \n",
+ "... ... \n",
+ "171358 0001951118 \n",
+ "171359 0001951752 \n",
+ "171360 0001954336 \n",
+ "171361 0001954436 \n",
+ "171362 0001955010 \n",
+ "\n",
+ " filename business_phone \\\n",
+ "0 edgar/data/1800/0001628280-23-004026.txt 2246676100 \n",
+ "1 edgar/data/1800/0001628280-23-004026.txt None \n",
+ "2 edgar/data/1800/0001628280-23-004026.txt None \n",
+ "3 edgar/data/1800/0001628280-23-004026.txt None \n",
+ "4 edgar/data/1800/0001628280-23-004026.txt None \n",
+ "... ... ... \n",
+ "171358 edgar/data/1951118/0001853620-23-000117.txt (248) 991-6700 \n",
+ "171359 edgar/data/1951752/0001951752-23-000016.txt 3135943495 \n",
+ "171360 edgar/data/1477336/0001954336-23-000024.txt 313-656-5500 \n",
+ "171361 edgar/data/1954436/0000929638-23-001050.txt (214) 572-8276 \n",
+ "171362 edgar/data/1955010/0001140361-23-012122.txt 212-326-1500 \n",
+ "\n",
+ " central_index_key city \\\n",
+ "0 0000001800 abbott park \n",
+ "1 None None \n",
+ "2 None None \n",
+ "3 None None \n",
+ "4 None None \n",
+ "... ... ... \n",
+ "171358 0001951118 farmington hills \n",
+ "171359 0001951752 dearborn \n",
+ "171360 0001954336 wilmington \n",
+ "171361 0001954436 irving \n",
+ "171362 0001955010 new york \n",
+ "\n",
+ " company_name date_of_name_change \\\n",
+ "0 abbott laboratories None \n",
+ "1 3a nutrition (vietnam) company limited None \n",
+ "2 abbott (jiaxing) nutrition co., ltd None \n",
+ "3 abbott (shanghai) diagnostics sales co., ltd None \n",
+ "4 abbott (uk) finance limited None \n",
+ "... ... ... \n",
+ "171358 mercedes-benz auto receivables trust 2022-1 None \n",
+ "171359 ford credit auto owner trust 2022-d None \n",
+ "171360 ally auto receivables trust 2022-3 None \n",
+ "171361 exeter automobile receivables trust 2022-6 None \n",
+ "171362 oha senior private lending fund (u) llc None \n",
+ "\n",
+ " film_number fiscal_year_end form_type ... \\\n",
+ "0 23642562 1231 10-k ... \n",
+ "1 None None None ... \n",
+ "2 None None None ... \n",
+ "3 None None None ... \n",
+ "4 None None None ... \n",
+ "... ... ... ... ... \n",
+ "171358 23764946 1231 10-k ... \n",
+ "171359 23751556 1231 10-k ... \n",
+ "171360 23759320 1231 10-k ... \n",
+ "171361 23784761 1231 10-k ... \n",
+ "171362 23740150 1231 10-k ... \n",
+ "\n",
+ " street_1 street_2 zip \\\n",
+ "0 100 abbott park road None 60064-3500 \n",
+ "1 None None None \n",
+ "2 None None None \n",
+ "3 None None None \n",
+ "4 None None None \n",
+ "... ... ... ... \n",
+ "171358 35555 w. twelve mile rd. suite 100 48331 \n",
+ "171359 c/o ford motor co , whq ste 801-c1 one american road 48126 \n",
+ "171360 1209 orange street None 19801 \n",
+ "171361 2101 w. john carpenter freeway None 75063 \n",
+ "171362 one vanderbilt, 16th floor None 10017 \n",
+ "\n",
+ " report_date report_year location_of_inc \\\n",
+ "0 2023-02-17 2023 illinois \n",
+ "1 2023-02-17 2023 viet nam \n",
+ "2 2023-02-17 2023 china \n",
+ "3 2023-02-17 2023 china \n",
+ "4 2023-02-17 2023 united kingdom \n",
+ "... ... ... ... \n",
+ "171358 2023-03-27 2023 delaware \n",
+ "171359 2023-03-22 2023 None \n",
+ "171360 2023-03-24 2023 delaware \n",
+ "171361 2023-03-31 2023 delaware \n",
+ "171362 2023-03-17 2023 delaware \n",
+ "\n",
+ " company_name_clean parent_company_cik \\\n",
+ "0 abbott laboratories None \n",
+ "1 3a nutrition vietnam company limited 0000001800 \n",
+ "2 abbott jiaxing nutrition co limited 0000001800 \n",
+ "3 abbott shanghai diagnostics sales co limited 0000001800 \n",
+ "4 abbott uk finance limited 0000001800 \n",
+ "... ... ... \n",
+ "171358 mercedes benz auto receivables trust 2022 1 None \n",
+ "171359 ford credit auto owner trust 2022 d None \n",
+ "171360 ally auto receivables trust 2022 3 None \n",
+ "171361 exeter automobile receivables trust 2022 6 None \n",
+ "171362 oha senior private lending fund u limited liab... None \n",
+ "\n",
+ " own_per files_10k \n",
+ "0 None True \n",
+ "1 None False \n",
+ "2 None False \n",
+ "3 None False \n",
+ "4 None False \n",
+ "... ... ... \n",
+ "171358 None True \n",
+ "171359 None True \n",
+ "171360 None True \n",
+ "171361 None True \n",
+ "171362 None True \n",
+ "\n",
+ "[171363 rows x 27 columns]"
+ ]
+ },
+ "execution_count": 27,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "sec_out_df"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "3447dcdb-4506-4de0-9201-9711ff9259ee",
+ "metadata": {},
+ "source": [
+ "### There is a combination of SEC 10K filers and subsidiary companies:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 28,
+ "id": "0d654dfc-2fb2-41d3-9ff8-6fe70732a04a",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "files_10k\n",
+ "False 165824\n",
+ "True 5539\n",
+ "Name: count, dtype: int64"
+ ]
+ },
+ "execution_count": 28,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "sec_out_df.files_10k.value_counts()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "6797b5b7-be91-430a-a30c-cc26c62aa7b1",
+ "metadata": {},
+ "source": [
+ "### `sec_company_id` and `central_index_key` should be unique:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "id": "67e0e789-feb0-4866-ba82-8346c62c1bef",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "True"
+ ]
+ },
+ "execution_count": 29,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "sec_out_df.sec_company_id.is_unique"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 30,
+ "id": "053d65c9-dbdd-4622-a4ee-badc7db2a88d",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "True"
+ ]
+ },
+ "execution_count": 30,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "sec_out_df.central_index_key.dropna().is_unique"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b7e05e03-fa05-4655-a085-c66afcfba442",
+ "metadata": {},
+ "source": [
+ "### Location of incorporation should be clean and standardized for filers and subsidiaries."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 32,
+ "id": "cb33b703-be24-4ddc-a9f2-148850c3f4af",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "location_of_inc\n",
+ "delaware 3076\n",
+ "nevada 300\n",
+ "maryland 299\n",
+ "cayman islands 135\n",
+ "north carolina 92\n",
+ "new york 74\n",
+ "florida 74\n",
+ "pennsylvania 71\n",
+ "california 57\n",
+ "texas 56\n",
+ "Name: count, dtype: int64"
+ ]
+ },
+ "execution_count": 32,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "sec_out_df[sec_out_df.files_10k][\"location_of_inc\"].value_counts().head(10)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 33,
+ "id": "cb6fc7b5-b9c0-46ae-991c-cae41f86e8f3",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "location_of_inc\n",
+ "bahamas 1\n",
+ "germany 1\n",
+ "hong kong 1\n",
+ "china 1\n",
+ "virgin islands, u.s. 1\n",
+ "quebec, canada 1\n",
+ "new brunswick, canada 1\n",
+ "new hampshire 1\n",
+ "netherlands antilles 1\n",
+ "malaysia 1\n",
+ "Name: count, dtype: int64"
+ ]
+ },
+ "execution_count": 33,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "sec_out_df[sec_out_df.files_10k][\"location_of_inc\"].value_counts().tail(10)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 34,
+ "id": "5373ced2-75e9-4229-b927-3ad4b8d33e39",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "location_of_inc\n",
+ "delaware 67546\n",
+ "united kingdom 4979\n",
+ "cayman islands 3000\n",
+ "texas 2881\n",
+ "netherlands 2615\n",
+ "california 2566\n",
+ "germany 2381\n",
+ "china 2305\n",
+ "florida 2130\n",
+ "australia 1938\n",
+ "Name: count, dtype: int64"
+ ]
+ },
+ "execution_count": 34,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "sec_out_df[~sec_out_df.files_10k][\"location_of_inc\"].value_counts().head(10)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 35,
+ "id": "3ceb1aa2-c622-4a97-9293-281325637f09",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "location_of_inc\n",
+ "ontario, can 1\n",
+ "british col, can 1\n",
+ "hong kong china china 1\n",
+ "zhongshan, china 1\n",
+ "jacksonville, florida 1\n",
+ "toronto, ontario, canada 1\n",
+ "limassol, cyprus 1\n",
+ "doncaster, syorkshire, uk 1\n",
+ "manchester, england 1\n",
+ "cote 1\n",
+ "Name: count, dtype: int64"
+ ]
+ },
+ "execution_count": 35,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "sec_out_df[~sec_out_df.files_10k][\"location_of_inc\"].value_counts().tail(10)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "95d51bdb-c378-45bc-9848-4a2a8895b470",
+ "metadata": {},
+ "source": [
+ "### All companies that are not SEC 10K filers should have a `parent_company_cik`"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 36,
+ "id": "89cd6bdb-a06c-40ae-8b49-c610e769f9c8",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "parent_company_cik\n",
+ "False 165824\n",
+ "Name: count, dtype: int64"
+ ]
+ },
+ "execution_count": 36,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "sec_out_df[~sec_out_df.files_10k][\"parent_company_cik\"].isnull().value_counts()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "8f4bd494-951f-417f-ba56-fa0202d741a5",
+ "metadata": {},
+ "source": [
+ "### When run on all year-quarters, every `parent_company_cik` should appear in the `central_index_key` column"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 45,
+ "id": "d024bc29-d0b1-45cd-a0a2-c9b66e73e0d6",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "2954"
+ ]
+ },
+ "execution_count": 45,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "n_parent_company_cik = len(set(sec_out_df.parent_company_cik))\n",
+ "n_parent_company_cik"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 46,
+ "id": "d70660f2-559e-4ec1-8167-1bfdce45c287",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "2832"
+ ]
+ },
+ "execution_count": 46,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "n_overlap = len(set(sec_out_df.parent_company_cik).intersection(set(sec_out_df.central_index_key)))\n",
+ "n_overlap"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 47,
+ "id": "0eb86d64-5ca0-423a-864c-dbfb00b5b9fa",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "122"
+ ]
+ },
+ "execution_count": 47,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "n_parent_company_cik - n_overlap"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "60366af2-259a-4a87-a93f-2180d8777c67",
+ "metadata": {},
+ "source": [
+ "### There should be filer companies that have a `parent_company_cik` because they were matched to a subsidiary"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 49,
+ "id": "b5c53dab-3be5-48f1-90f6-583acfb452ab",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "parent_company_cik\n",
+ "True 5474\n",
+ "False 65\n",
+ "Name: count, dtype: int64"
+ ]
+ },
+ "execution_count": 49,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "sec_out_df[sec_out_df.files_10k].parent_company_cik.isnull().value_counts()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "5358a4e1-38a7-489d-bf1a-f53de58447ba",
+ "metadata": {},
+ "source": [
+ "### There should be no non-filer companies that have a CIK"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 50,
+ "id": "4a19df26-79c3-4aa1-bcbf-916b822346ca",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "central_index_key\n",
+ "True 165824\n",
+ "Name: count, dtype: int64"
+ ]
+ },
+ "execution_count": 50,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "sec_out_df[~sec_out_df.files_10k].central_index_key.isnull().value_counts()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "bde4f03f-e5b0-4895-ade6-ae44b260e78e",
+ "metadata": {},
+ "source": [
+ "### There should be no duplicated `company_name`, `location_of_inc`, `parent_company_cik` records"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 52,
+ "id": "ca87709a-daa7-4396-83a4-0f5bb8ec2cd4",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " sec_company_id | \n",
+ " filename | \n",
+ " business_phone | \n",
+ " central_index_key | \n",
+ " city | \n",
+ " company_name | \n",
+ " date_of_name_change | \n",
+ " film_number | \n",
+ " fiscal_year_end | \n",
+ " form_type | \n",
+ " ... | \n",
+ " street_1 | \n",
+ " street_2 | \n",
+ " zip | \n",
+ " report_date | \n",
+ " report_year | \n",
+ " location_of_inc | \n",
+ " company_name_clean | \n",
+ " parent_company_cik | \n",
+ " own_per | \n",
+ " files_10k | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
0 rows × 27 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ "Empty DataFrame\n",
+ "Columns: [sec_company_id, filename, business_phone, central_index_key, city, company_name, date_of_name_change, film_number, fiscal_year_end, form_type, former_conformed_name, irs_number, sec_act, sec_file_number, standard_industrial_classification, state, state_of_incorporation, street_1, street_2, zip, report_date, report_year, location_of_inc, company_name_clean, parent_company_cik, own_per, files_10k]\n",
+ "Index: []\n",
+ "\n",
+ "[0 rows x 27 columns]"
+ ]
+ },
+ "execution_count": 52,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "sec_out_df[sec_out_df.duplicated(subset=[\"company_name\", \"location_of_inc\", \"parent_company_cik\"])]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "bca9e395-bd96-4183-b299-46cd589d97d5",
+ "metadata": {},
+ "source": [
+ "### There can be companies with the same name, location, and CIK, but different parent companies."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 56,
+ "id": "cc1880f3-a9d3-4f8a-a42b-2f9ff428ca45",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sec_out_df = sec_out_df.fillna({\"central_index_key\": pd.NA})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 59,
+ "id": "f87257df-00f7-48a8-882a-fb1ea8c27e18",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " company_name | \n",
+ " location_of_inc | \n",
+ " central_index_key | \n",
+ " parent_company_cik | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ "Empty DataFrame\n",
+ "Columns: [company_name, location_of_inc, central_index_key, parent_company_cik]\n",
+ "Index: []"
+ ]
+ },
+ "execution_count": 59,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "sec_out_df[~sec_out_df.central_index_key.isnull() \n",
+ " & (sec_out_df.duplicated(\n",
+ " subset=[\"company_name\", \"location_of_inc\", \"central_index_key\"], keep=False\n",
+ " ))][[\"company_name\", \"location_of_inc\", \"central_index_key\", \"parent_company_cik\"]]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e2169181-dcd8-4b43-b03e-9526f597147d",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "mozilla_sec_eia",
+ "language": "python",
+ "name": "mozilla_sec_eia"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.0"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/pyproject.toml b/pyproject.toml
index b4bc1f2..72536e5 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -14,11 +14,12 @@ readme = {file = "README.rst", content-type = "text/x-rst"}
authors = [
{name = "Catalyst Cooperative", email = "pudl@catalyst.coop"}
]
-requires-python = ">=3.10,<3.12"
+requires-python = ">=3.10,<3.13"  # "<=3.12" would reject 3.12.1+ under PEP 440
dynamic = ["version"]
license = {file = "LICENSE.txt"}
dependencies = [
"accelerate>=0.21.0,<2.0", # Hugging Face dependency for PyTorch models
+ "catalystcoop.pudl @ git+https://github.com/catalyst-cooperative/pudl.git",
"cloud-sql-python-connector[pg8000]",
"dagster>=1.7.15", # 1.7.13 & 1.7.14 were both breaking things
"dagster-mlflow",
@@ -30,6 +31,7 @@ dependencies = [
"google-cloud-secret-manager>=2,<3",
"google-cloud-storage>=2,<3",
"hypothesis",
+ "jellyfish>=1.1",
"matplotlib>=3.8,<4",
"mlflow>=2.12",
"opencv-python",
@@ -44,6 +46,7 @@ dependencies = [
"pydantic-settings>=2",
"python-bidi<0.7.0",
"pymupdf", # Convert PDF to image
+ "splink>=4,<5",
"sqlalchemy>=2,<3",
"timm>0.9,<2", # dependency for Hugging Face computer vision models
"torch>=2.2,<3",
@@ -61,6 +64,7 @@ classifiers = [
"Programming Language :: Python :: 3 :: Only",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
+ "Programming Language :: Python :: 3.12",
]
keywords = [
"template",
@@ -92,9 +96,9 @@ dev = [
docs = [
"doc8>=1,<2", # Ensures clean documentation formatting
"furo>=2022.4.7",
- "sphinx>=6,<8.2", # The default Python documentation engine
+ "sphinx>=6,<8.1", # The default Python documentation engine
"sphinx-autoapi>=2,<4", # Generates documentation from docstrings
- "sphinx-issues>=1.2,<6", # Allows references to GitHub issues
+ "sphinx-issues>=5", # Allows references to GitHub issues
]
tests = [
@@ -200,8 +204,8 @@ lint.ignore = [
"EXE002",
]
-# Assume Python 3.11
-target-version = "py311"
+# Assume Python 3.12
+target-version = "py312"
line-length = 88
# Don't automatically concatenate strings -- sometimes we forget a comma!
@@ -230,6 +234,6 @@ inline-quotes = "double"
multiline-quotes = "double"
[tool.mypy]
-python_version = "3.10"
+python_version = "3.12"
warn_return_any = true
warn_unused_configs = true
diff --git a/src/mozilla_sec_eia/library/record_linkage_utils.py b/src/mozilla_sec_eia/library/record_linkage_utils.py
new file mode 100644
index 0000000..217fb9b
--- /dev/null
+++ b/src/mozilla_sec_eia/library/record_linkage_utils.py
@@ -0,0 +1,170 @@
+"""Utility functions for cleaning strings during modeling preprocessing steps."""
+
+import json
+from enum import StrEnum
+from importlib import resources
+
+import jellyfish
+import pandas as pd
+
+from pudl.analysis.record_linkage import name_cleaner
+
+# Name strings that are treated as invalid company names (bare legal suffixes,
+# placeholders and the empty string). `handle_invalid_names` compares column
+# values against this list with `Series.isin`, i.e. whole-string matches only.
+INVALID_NAMES = [
+    "llc",
+    "limited liability company",
+    "limited",
+    "ltd",
+    "iiii",
+    "inc",
+    "incorporated",
+    "partnership",
+    "i",
+    "name",
+    "company",
+    "&",
+    "",
+]
+
+# PUDL name cleaner that `clean_company_name` applies to the stripped,
+# lower-cased name column.
+# NOTE(review): the rules are presumably applied in the listed order -- confirm
+# against PUDL's CompanyNameCleaner before reordering them.
+company_name_cleaner = name_cleaner.CompanyNameCleaner(
+    cleaning_rules_list=[
+        "remove_word_the_from_the_end",
+        "remove_word_the_from_the_beginning",
+        "replace_ampersand_by_AND",
+        "replace_hyphen_by_space",
+        "replace_underscore_by_space",
+        "remove_text_punctuation",
+        "remove_parentheses",
+        "remove_brackets",
+        "remove_curly_brackets",
+        "enforce_single_space_between_words",
+    ]
+)
+
+# Cleaner with no string-cleaning rules; `clean_company_name` uses it to build
+# the `{col_name}_no_legal` column from the already cleaned names.
+# NOTE(review): `handle_legal_terms=2` is presumably the "remove legal terms"
+# mode -- confirm against PUDL's CompanyNameCleaner.
+legal_term_remover = name_cleaner.CompanyNameCleaner(
+    cleaning_rules_list=[], handle_legal_terms=2
+)
+
+
+def clean_company_name(
+ df: pd.DataFrame, col_name: str = "company_name"
+) -> pd.DataFrame:
+ """Conduct cleaning on a company name column and add column without legal terms.
+
+ Uses the PUDL name cleaner object to do basic cleaning on `col_name` column
+ such as stripping punctuation, correcting case, normalizing legal
+ terms etc. The clean column becomes the `col_name` column and the original
+ `col_name` column is renamed to `{col_name}_raw`. Also adds a column called
+ `{col_name}_no_legal` which has legal terms stripped from the clean strings.
+
+ Arguments:
+ df: The dataframe that is to be cleaned. Must contain `col_name` column.
+ col_name: The name of the column with the company name strings.
+
+ Returns:
+ pd.DataFrame: The original dataframe with `col_name` now containing
+ cleaned strings and an additional column with the raw strings
+ and a column with the legal terms stripped from the company name.
+ """
+ df[col_name] = df[col_name].fillna(pd.NA).str.strip().str.lower().replace("", pd.NA)
+ df.loc[:, f"{col_name}_clean"] = company_name_cleaner.apply_name_cleaning(
+ df[[col_name]]
+ ).str.strip()
+ df = df[df[f"{col_name}_clean"] != ""]
+ df = df.rename(columns={col_name: f"{col_name}_raw"}).rename(
+ columns={f"{col_name}_clean": col_name}
+ )
+ df.loc[:, f"{col_name}_no_legal"] = legal_term_remover.apply_name_cleaning(
+ df[[col_name]]
+ )
+ return df
+
+
+def handle_invalid_names(
+ df: pd.DataFrame, col_name: str = "company_name", drop_invalid: bool = True
+) -> pd.DataFrame:
+ """Drop rows that have invalid company names, like just 'llc', or 'partnership'.
+
+ Either drop invalid company name values or fill with the empty string. Invalid
+ values are contained in `INVALID_NAMES`.
+ """
+ if drop_invalid:
+ return df[(~df[col_name].isin(INVALID_NAMES))]
+ df[col_name] = df[col_name].where(~df[col_name].isin(INVALID_NAMES), "")
+ return df
+
+
+def flatten_companies_across_time(
+ df: pd.DataFrame, key_cols: list[str], date_col: str = "report_date"
+) -> pd.DataFrame:
+ """Keep only the most recent record for each group of `key_cols`.
+
+ Dataframe must have all of `key_cols` and `date_col`.
+ """
+ df = (
+ df.sort_values(by=date_col, ascending=False).groupby(key_cols).first()
+ ).reset_index()
+ return df
+
+
+# TODO: this is in PUDL, deduplicate
+def get_metaphone_col(col: pd.Series) -> pd.Series:
+ """Get the metaphones of the strings in a column."""
+ return col.apply(jellyfish.metaphone)
+
+
+class HandleNulls(StrEnum):
+    """How `transform_company_name` treats null and invalid company names.
+
+    NOTE(review): `StrEnum` was added to the standard library in Python 3.11,
+    while pyproject.toml still declares support for Python 3.10 -- confirm
+    which is intended.
+    """
+
+    # Drop rows whose name is null or listed in `INVALID_NAMES`.
+    DROP = "drop"
+    # Replace null or invalid names with the empty string instead of dropping.
+    FILL_EMPTY_STR = "fill_empty_str"
+
+
+def transform_company_name(
+ df: pd.DataFrame,
+ col_name: str = "company_name",
+ handle_nulls: HandleNulls = HandleNulls.DROP,
+) -> pd.DataFrame:
+ """Apply cleaning, get metaphone col, drop invalid rows."""
+ df = clean_company_name(df, col_name=col_name)
+ if handle_nulls == HandleNulls.DROP:
+ df = handle_invalid_names(df, col_name, drop_invalid=True)
+ df = df[~df[col_name].isnull()]
+ elif handle_nulls == HandleNulls.FILL_EMPTY_STR:
+ df = handle_invalid_names(df, col_name, drop_invalid=False)
+ df = df.fillna({col_name: ""})
+ df.loc[:, f"{col_name}_mphone"] = get_metaphone_col(df[f"{col_name}_no_legal"])
+
+ return df
+
+
+def fill_street_address_nulls(
+ df: pd.DataFrame,
+ address_col: str = "street_address",
+ secondary_address_col: str = "street_address_2",
+) -> pd.DataFrame:
+ """Fill null street address with value from secondary address column."""
+ df[address_col] = df[address_col].where(
+ (~df[address_col].isnull()) | (df[secondary_address_col].isnull()),
+ df[secondary_address_col],
+ )
+ return df
+
+
+def expand_street_name_abbreviations(col: pd.Series) -> pd.Series:
+ """Standardize street address suffixes, like street to st.
+
+ Expects lower case strings in column.
+ """
+ # remove punctuation from column first
+ col = col.str.replace(r"[^\w\s]", "", regex=True)
+
+ json_source = (
+ resources.files("mozilla_sec_eia.package_data")
+ / "street_suffix_abbreviations.json"
+ )
+ with json_source.open() as f:
+ address_expansions = json.load(f)
+ for standard_abbr, suffix_list in address_expansions.items():
+ pattern = r"\b(" + "|".join(suffix_list) + r")\b"
+ col = col.str.replace(pattern, standard_abbr, regex=True)
+ return col
diff --git a/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb b/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb
new file mode 100644
index 0000000..7e2852f
--- /dev/null
+++ b/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb
@@ -0,0 +1,1097 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "0da8c588-2d09-464b-945f-168704c0cdac",
+ "metadata": {
+ "tags": []
+ },
+ "source": [
+ "# Exhibit 21 extraction\n",
+ "\n",
+ "This notebook implements a model built on top of [layoutlmv3](https://huggingface.co/microsoft/layoutlmv3-base/tree/main)\n",
+ "to extract subsidiary information from Exhibit 21 attachments to SEC-10k filings. These documents contain a list of all subsidiary companies owned by a filing\n",
+ "company."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "84aab877-9d59-4ec7-bf4b-c75e216fb1d6",
+ "metadata": {},
+ "source": [
+ "## Load upstream assets and configuration\n",
+ "The following cell can be run interactively to set configuration and load upstream assets. When running the notebook in dagster, this cell will be replaced with assets from the dagster run and dagster run configuration.\n",
+ "\n",
+ "### Config\n",
+ "- `layoutlm_training_run`: If `None`, the notebook will finetune layoutlm using `ex21_training_data`. If `layoutlm_training_run` is the name of an existing run on the mlflow tracking server, the notebook will use the model logged by that run and perform inference on the validation set, logging validation metrics to a child run nested under the mlflow run associated with the pretrained model.\n",
+ "\n",
+ "### Upstream assets\n",
+ "We are using dagster assets to construct training/validation data outside the notebook to allow for easy caching. These datasets are fairly compute intensive to create, so this is useful when iterating on the model using the same data.\n",
+ "\n",
+ "NOTE: The notebook will load the most recent version of these assets, so to update the training/validation data you must rerun the dagster assets with desired configuration.\n",
+ "\n",
+ "- `ex21_training_data`: Dataset containing labeled data produced in label-studio to train `layoutlm`\n",
+ "- `ex21_validation_set`: Labeled validation data describing expected inference output on validation filings\n",
+ "- `ex21_failed_parsing_metadata`: Metadata for any validation filings that couldn't be parsed (usually empty)\n",
+ "- `ex21_inference_dataset`: Parsed validation filings prepped for inference model"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "48f185de-95ef-4194-9245-93f8d603d2e6",
+ "metadata": {
+ "tags": [
+ "parameters"
+ ]
+ },
+ "outputs": [],
+ "source": [
+ "import dagstermill\n",
+ "\n",
+ "from mozilla_sec_eia.models.sec10k import defs\n",
+ "\n",
+ "context = dagstermill.get_context(op_config={\n",
+ " \"layoutlm_training_run\": \"layoutlm-labeledv0.2\",\n",
+ "})\n",
+ "\n",
+ "ex21_training_data = defs.load_asset_value(\"ex21_training_data\")\n",
+ "\n",
+ "ex21_failed_parsing_metadata = defs.load_asset_value(\"ex21_failed_parsing_metadata\")\n",
+ "ex21_inference_dataset = defs.load_asset_value(\"ex21_inference_dataset\")\n",
+ "ex21_validation_set = defs.load_asset_value(\"ex21_validation_set\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "7f299b2b-2358-4526-b023-f29c817316d9",
+ "metadata": {
+ "tags": []
+ },
+ "source": [
+ "## Train Layoutlmv3"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "32edcce1-ab18-40b6-9da8-ce0ea53c2f72",
+ "metadata": {
+ "tags": []
+ },
+ "source": [
+ "### Define training metrics\n",
+ "The method `compute_metrics` will be used to score the model. It computes precision, recall, f1 score, and accuracy on bounding box labels output by `layoutlm`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "9372b908-d9b9-4d18-a5bf-d332648b3e49",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "from tempfile import TemporaryDirectory\n",
+ "\n",
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "\n",
+ "from mozilla_sec_eia.models.sec10k.utils.cloud import get_metadata_filename\n",
+ "\n",
+ "\n",
+ "def compute_metrics(p, metric, label_list, return_entity_level_metrics=False):\n",
+ " \"\"\"Compute metrics to train and evaluate the model on.\"\"\"\n",
+ " predictions, labels = p\n",
+ " predictions = np.argmax(predictions, axis=2)\n",
+ "\n",
+ " # Remove ignored index (special tokens)\n",
+ " true_predictions = [\n",
+ " [label_list[pred] for (pred, lab) in zip(prediction, label) if lab != -100]\n",
+ " for prediction, label in zip(predictions, labels)\n",
+ " ]\n",
+ " true_labels = [\n",
+ " [label_list[lab] for (pred, lab) in zip(prediction, label) if lab != -100]\n",
+ " for prediction, label in zip(predictions, labels)\n",
+ " ]\n",
+ "\n",
+ " results = metric.compute(predictions=true_predictions, references=true_labels)\n",
+ " if return_entity_level_metrics:\n",
+ " # Unpack nested dictionaries\n",
+ " final_results = {}\n",
+ " for key, value in results.items():\n",
+ " if isinstance(value, dict):\n",
+ " for n, v in value.items():\n",
+ " final_results[f\"{key}_{n}\"] = v\n",
+ " else:\n",
+ " final_results[key] = value\n",
+ " return final_results\n",
+ " return {\n",
+ " \"precision\": results[\"overall_precision\"],\n",
+ " \"recall\": results[\"overall_recall\"],\n",
+ " \"f1\": results[\"overall_f1\"],\n",
+ " \"accuracy\": results[\"overall_accuracy\"],\n",
+ " }"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "8160263c-8f69-437c-918b-e56ad007961a",
+ "metadata": {
+ "tags": []
+ },
+ "source": [
+ "#### Finetune Model\n",
+ "The next cell will use the functions defined in the previous section to actually construct a huggingface dataset from labeled data and finetune the `layoutlm` model. Model finetuning will only be run if configured to do so, otherwise a pretrained version will be used from the `mlflow` tracking server.\n",
+ "\n",
+ "Model training contains several steps implemented below:\n",
+ "1. Use temporary path to convert filings to PDFs and stash labels\n",
+ "2. Convert the PDFs and labels to NER annotations\n",
+ "3. Construct huggingface dataset from NER annotations and split into train and test sets\n",
+ "4. Load pretrained model from huggingface\n",
+ "5. Finetune model on training data and evaluate on test data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "71d205b2-e6ea-4ad0-982c-22e762269119",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "import mlflow\n",
+ "from datasets import (\n",
+ " Array2D,\n",
+ " Array3D,\n",
+ " Dataset,\n",
+ " Features,\n",
+ " Sequence,\n",
+ " Value,\n",
+ " load_metric,\n",
+ ")\n",
+ "from dotenv import load_dotenv\n",
+ "from transformers import (\n",
+ " AutoProcessor,\n",
+ " LayoutLMv3ForTokenClassification,\n",
+ " Trainer,\n",
+ " TrainingArguments,\n",
+ ")\n",
+ "from transformers.data.data_collator import default_data_collator\n",
+ "\n",
+ "from mozilla_sec_eia.library.mlflow import configure_mlflow\n",
+ "from mozilla_sec_eia.models.sec10k.ex_21.data.common import (\n",
+ " BBOX_COLS,\n",
+ " LABELS,\n",
+ " get_id_label_conversions,\n",
+ ")\n",
+ "\n",
+ "load_dotenv()\n",
+ "\n",
+ "\n",
+ "configure_mlflow()\n",
+ "mlflow.set_experiment(\"exhibit21_extraction_test\")\n",
+ "\n",
+ "\n",
+ "def _prepare_dataset(annotations, processor, label2id):\n",
+ " \"\"\"Put the dataset in its final format for training LayoutLM.\"\"\"\n",
+ "\n",
+ " def _convert_ner_tags_to_id(ner_tags, label2id):\n",
+ " return [int(label2id[ner_tag]) for ner_tag in ner_tags]\n",
+ "\n",
+ " images = annotations[\"image\"]\n",
+ " words = annotations[\"tokens\"]\n",
+ " boxes = annotations[\"bboxes\"]\n",
+ " # Map over labels and convert to numeric id for each ner_tag\n",
+ " ner_tags = [\n",
+ " _convert_ner_tags_to_id(ner_tags, label2id)\n",
+ " for ner_tags in annotations[\"ner_tags\"]\n",
+ " ]\n",
+ "\n",
+ " encoding = processor(\n",
+ " images,\n",
+ " words,\n",
+ " boxes=boxes,\n",
+ " word_labels=ner_tags,\n",
+ " truncation=True,\n",
+ " padding=\"max_length\",\n",
+ " )\n",
+ "\n",
+ " return encoding\n",
+ "\n",
+ "if (run_name := context.op_config[\"layoutlm_training_run\"]) is not None:\n",
+ " filter_string = f\"attributes.run_name = '{run_name}'\"\n",
+ " run = mlflow.search_runs(filter_string=filter_string, output_format=\"list\")[0]\n",
+ " training_run_id = run.info.run_id\n",
+ "else:\n",
+ " training_run_id = None\n",
+ "\n",
+ "# Only finetune if configured to do so\n",
+ "if training_run_id is None:\n",
+ " id2label, label2id = get_id_label_conversions(LABELS)\n",
+ " # Change temp_dir to save training data locally for inspection\n",
+ " # Cache/prepare training data\n",
+ " dataset = Dataset.from_list(ex21_training_data)\n",
+ "\n",
+ " # Load pretrained model\n",
+ " model = LayoutLMv3ForTokenClassification.from_pretrained(\n",
+ " \"microsoft/layoutlmv3-base\", id2label=id2label, label2id=label2id\n",
+ " )\n",
+ " processor = AutoProcessor.from_pretrained(\n",
+ " \"microsoft/layoutlmv3-base\", apply_ocr=False\n",
+ " )\n",
+ "\n",
+ " # Prepare our train & eval dataset\n",
+ " column_names = dataset.column_names\n",
+ " features = Features(\n",
+ " {\n",
+ " \"pixel_values\": Array3D(dtype=\"float32\", shape=(3, 224, 224)),\n",
+ " \"input_ids\": Sequence(feature=Value(dtype=\"int64\")),\n",
+ " \"attention_mask\": Sequence(Value(dtype=\"int64\")),\n",
+ " \"bbox\": Array2D(dtype=\"int64\", shape=(512, 4)),\n",
+ " \"labels\": Sequence(feature=Value(dtype=\"int64\")),\n",
+ " }\n",
+ " )\n",
+ " dataset = dataset.map(\n",
+ " lambda annotations: _prepare_dataset(annotations, processor, label2id),\n",
+ " batched=True,\n",
+ " remove_columns=column_names,\n",
+ " features=features,\n",
+ " )\n",
+ " dataset.set_format(\"torch\")\n",
+ " split_dataset = dataset.train_test_split(test_size=0.2)\n",
+ " train_dataset, eval_dataset = split_dataset[\"train\"], split_dataset[\"test\"]\n",
+ "\n",
+ " # Initialize our Trainer\n",
+ " metric = load_metric(\"seqeval\")\n",
+ " training_args = TrainingArguments(\n",
+ " max_steps=1000,\n",
+ " per_device_train_batch_size=1,\n",
+ " per_device_eval_batch_size=1,\n",
+ " learning_rate=1e-5,\n",
+ " evaluation_strategy=\"steps\",\n",
+ " eval_steps=100,\n",
+ " load_best_model_at_end=True,\n",
+ " metric_for_best_model=\"f1\",\n",
+ " output_dir=\"./layoutlm\",\n",
+ " )\n",
+ " trainer = Trainer(\n",
+ " model=model,\n",
+ " args=training_args,\n",
+ " train_dataset=train_dataset,\n",
+ " eval_dataset=eval_dataset,\n",
+ " tokenizer=processor,\n",
+ " data_collator=default_data_collator,\n",
+ " compute_metrics=lambda p: compute_metrics(p, metric=metric, label_list=LABELS),\n",
+ " )\n",
+ "\n",
+ " with mlflow.start_run() as training_run:\n",
+ " # Train inside mlflow run. Mlflow will automatically handle logging training metrics\n",
+ " trainer.train()\n",
+ "\n",
+ " # Log finetuned model with mlflow\n",
+ " model = {\"model\": trainer.model, \"tokenizer\": trainer.tokenizer}\n",
+ " mlflow.transformers.log_model(\n",
+ " model, artifact_path=\"layoutlm_extractor\", task=\"token-classification\"\n",
+ " )\n",
+ " training_run_id = training_run.info. run_id"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "ee9b4e20-7781-43a7-b7aa-caf0690a201e",
+ "metadata": {},
+ "source": [
+ "## Model inference\n",
+ "Use the finetuned model to perform inference and evaluate on labeled validation data. First create a Huggingface `Pipeline` which wraps layoutlm with some custom pre/post processing steps."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "42c8e920-d671-40c2-b5db-c43611a33897",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "import torch\n",
+ "from transformers import Pipeline, pipeline\n",
+ "from transformers.tokenization_utils_base import BatchEncoding\n",
+ "\n",
+ "from mozilla_sec_eia.models.sec10k.ex_21.data.common import (\n",
+ " get_flattened_mode_predictions,\n",
+ ")\n",
+ "from mozilla_sec_eia.models.sec10k.utils.layoutlm import (\n",
+ " iob_to_label,\n",
+ ")\n",
+ "\n",
+ "\n",
+ "def separate_entities_by_row(entity_df):\n",
+ " \"\"\"Separate entities that span multiple rows and should be distinct.\n",
+ "\n",
+ " Sometimes LayoutLM groups multiple entities that span multiple rows\n",
+ " into one entity. This function makes an attempt to break these out\n",
+ " into multiple entities, by taking the average distance between rows\n",
+ " and separating a grouped entity if the distance between y values\n",
+ " is at least that average y value spacing.\n",
+ " \"\"\"\n",
+ " threshold = 1.0\n",
+ " entity_df.loc[:, \"line_group\"] = entity_df.loc[:, \"top_left_y\"].transform(\n",
+ " lambda y: (y // threshold).astype(int)\n",
+ " )\n",
+ " # Get the unique y-values for each line (group) per file\n",
+ " line_positions = (\n",
+ " entity_df.groupby([\"line_group\"])[\"top_left_y\"].mean().reset_index()\n",
+ " )\n",
+ " # Calculate the difference between adjacent y-values (i.e., distance between lines)\n",
+ " line_positions.loc[:, \"y_diff\"] = line_positions.loc[:, \"top_left_y\"].diff()\n",
+ " # Filter out NaN values and take the mean of the valid distances\n",
+ " y_diffs = line_positions[\"y_diff\"].dropna()\n",
+ " avg_y_diff = y_diffs.apply(np.floor).mean()\n",
+ " # if an I labeled entity is at least avg_y_diff from its previous box then make it a B entity\n",
+ " entity_df.loc[:, \"prev_y\"] = entity_df.loc[:, \"top_left_y\"].shift(1)\n",
+ " entity_df.loc[:, \"prev_iob\"] = entity_df.loc[:, \"iob_pred\"].shift(1)\n",
+ "\n",
+ " # If the current prediction is an I label\n",
+ " # and y distance exceeds the average y difference\n",
+ " # update to a B label and make it the start of a new entity\n",
+ " entity_df.loc[:, \"iob_pred\"] = np.where(\n",
+ " (entity_df[\"iob_pred\"].str[0] == \"I\")\n",
+ " & ((entity_df[\"top_left_y\"] - entity_df[\"prev_y\"]) >= avg_y_diff),\n",
+ " \"B\" + entity_df[\"iob_pred\"].str[1:], # Update to 'B'\n",
+ " entity_df[\"iob_pred\"], # Keep as is\n",
+ " )\n",
+ "\n",
+ " # Drop temporary columns\n",
+ " return entity_df.drop(columns=[\"prev_y\", \"prev_iob\"])\n",
+ "\n",
+ "class LayoutLMInferencePipeline(Pipeline):\n",
+ " \"\"\"Pipeline for performing inference with fine-tuned LayoutLM.\"\"\"\n",
+ "\n",
+ " def __init__(self, *args, **kwargs):\n",
+ " \"\"\"Initialize LayoutLMInferencePipeline.\"\"\"\n",
+ " super().__init__(*args, **kwargs)\n",
+ "\n",
+ " def _sanitize_parameters(self, **kwargs):\n",
+ " preprocess_kwargs = {}\n",
+ " if \"maybe_arg\" in kwargs:\n",
+ " preprocess_kwargs[\"maybe_arg\"] = kwargs[\"maybe_arg\"]\n",
+ " return preprocess_kwargs, {}, {}\n",
+ "\n",
+ " def preprocess(self, doc_dict):\n",
+ " \"\"\"Encode and tokenize model inputs.\"\"\"\n",
+ " image = doc_dict[\"image\"]\n",
+ " words = doc_dict[\"tokens\"]\n",
+ " boxes = doc_dict[\"bboxes\"]\n",
+ " encoding = self.tokenizer(\n",
+ " image,\n",
+ " words,\n",
+ " boxes=boxes,\n",
+ " return_tensors=\"pt\",\n",
+ " truncation=True,\n",
+ " padding=\"max_length\",\n",
+ " max_length=512, # this is the maximum max_length\n",
+ " stride=128,\n",
+ " return_offsets_mapping=True,\n",
+ " return_overflowing_tokens=True,\n",
+ " )\n",
+ " model_inputs = {}\n",
+ " model_inputs[\"raw_encoding\"] = encoding.copy()\n",
+ " model_inputs[\"doc_dict\"] = doc_dict\n",
+ " model_inputs[\"offset_mapping\"] = encoding.pop(\"offset_mapping\")\n",
+ " model_inputs[\"sample_mapping\"] = encoding.pop(\"overflow_to_sample_mapping\")\n",
+ " # TODO: do we actually need to make these into ints?\n",
+ " encoding[\"input_ids\"] = encoding[\"input_ids\"].to(torch.int64)\n",
+ " encoding[\"attention_mask\"] = encoding[\"attention_mask\"].to(torch.int64)\n",
+ " encoding[\"bbox\"] = encoding[\"bbox\"].to(torch.int64)\n",
+ " encoding[\"pixel_values\"] = torch.stack(encoding[\"pixel_values\"])\n",
+ " model_inputs[\"encoding\"] = encoding\n",
+ " return model_inputs\n",
+ "\n",
+ " def _forward(self, model_inputs):\n",
+ " # encoding is passed as a UserDict in the model_inputs dictionary\n",
+ " # turn it back into a BatchEncoding\n",
+ " encoding = BatchEncoding(model_inputs[\"encoding\"])\n",
+ " if torch.cuda.is_available():\n",
+ " encoding.to(\"cuda\")\n",
+ " self.model.to(\"cuda\")\n",
+ " # since we're doing inference, we don't need gradient computation\n",
+ " with torch.no_grad():\n",
+ " output = self.model(**encoding)\n",
+ " return {\n",
+ " \"logits\": output.logits,\n",
+ " \"predictions\": output.logits.argmax(-1).squeeze().tolist(),\n",
+ " \"raw_encoding\": model_inputs[\"raw_encoding\"],\n",
+ " \"doc_dict\": model_inputs[\"doc_dict\"],\n",
+ " }\n",
+ "\n",
+ " def postprocess(self, output_dict):\n",
+ " \"\"\"Return logits, model predictions, and the extracted dataframe.\"\"\"\n",
+ " output_df = self.extract_table(output_dict)\n",
+ " output_dict[\"output_df\"] = output_df\n",
+ " return output_dict\n",
+ "\n",
+ " def extract_table(self, output_dict):\n",
+ " \"\"\"Extract a structured table from a set of inference predictions.\n",
+ "\n",
+ " This function essentially works by stacking bounding boxes and predictions\n",
+ " into a dataframe and going from left to right and top to bottom. Then,\n",
+ " every time a new subsidiary entity is encountered, it assigns a new group or\n",
+ " \"row\" to that subsidiary. Next, location and ownership percentage words/labeled\n",
+ " entities in between these subsidiary groups are assigned to a subsidiary row/group.\n",
+ " Finally, this is all formatted into a dataframe with an ID column from the original\n",
+ " filename and a basic cleaning function normalizes strings.\n",
+ " \"\"\"\n",
+ " # TODO: when model more mature, break this into sub functions to make it\n",
+ " # clearer what's going on\n",
+ " predictions = output_dict[\"predictions\"]\n",
+ " encoding = output_dict[\"raw_encoding\"]\n",
+ " doc_dict = output_dict[\"doc_dict\"]\n",
+ "\n",
+ " token_boxes_tensor = encoding[\"bbox\"].flatten(start_dim=0, end_dim=1)\n",
+ " predictions_tensor = torch.tensor(predictions)\n",
+ " mode_predictions = get_flattened_mode_predictions(\n",
+ " token_boxes_tensor, predictions_tensor\n",
+ " )\n",
+ " token_boxes = encoding[\"bbox\"].flatten(start_dim=0, end_dim=1).tolist()\n",
+ " predicted_labels = [\n",
+ " self.model.config.id2label[pred] for pred in mode_predictions\n",
+ " ]\n",
+ " simple_preds = [iob_to_label(pred).lower() for pred in predicted_labels]\n",
+ "\n",
+ " df = pd.DataFrame(data=token_boxes, columns=BBOX_COLS)\n",
+ " df.loc[:, \"iob_pred\"] = predicted_labels\n",
+ " df.loc[:, \"pred\"] = simple_preds\n",
+ " invalid_mask = (\n",
+ " (df[\"top_left_x\"] == 0)\n",
+ " & (df[\"top_left_y\"] == 0)\n",
+ " & (df[\"bottom_right_x\"] == 0)\n",
+ " & (df[\"bottom_right_y\"] == 0)\n",
+ " )\n",
+ " df = df[~invalid_mask]\n",
+ " # we want to get actual words on the dataframe, not just subwords that correspond to tokens\n",
+ " # subwords from the same word share the same bounding box coordinates\n",
+ " # so we merge the original words onto our dataframe on bbox coordinates\n",
+ " words_df = pd.DataFrame(data=doc_dict[\"bboxes\"], columns=BBOX_COLS)\n",
+ " words_df.loc[:, \"word\"] = doc_dict[\"tokens\"]\n",
+ " df = df.merge(words_df, how=\"left\", on=BBOX_COLS).drop_duplicates(\n",
+ " subset=BBOX_COLS + [\"pred\", \"word\"]\n",
+ " )\n",
+ " df = df.sort_values(by=[\"top_left_y\", \"top_left_x\"])\n",
+ " # rows that are the first occurrence in a new group (subsidiary, loc, own_per)\n",
+ " # should always have a B entity label. Manually override labels so this is true.\n",
+ " first_in_group_df = df[\n",
+ " (df[\"pred\"].ne(df[\"pred\"].shift())) & (df[\"pred\"] != \"other\")\n",
+ " ]\n",
+ " first_in_group_df.loc[:, \"iob_pred\"] = (\n",
+ " \"B\" + first_in_group_df[\"iob_pred\"].str[1:]\n",
+ " )\n",
+ " df.update(first_in_group_df)\n",
+ " # filter for just words that were labeled with non \"other\" entities\n",
+ " entities_df = df[df[\"pred\"] != \"other\"]\n",
+ " # boxes that have the same group label but are on different rows\n",
+ " # should be updated to have two different B labels\n",
+ "\n",
+ " entities_df = entities_df.groupby(\"pred\").apply(separate_entities_by_row, include_groups=False)\n",
+ " entities_df = entities_df.reset_index(\"pred\").sort_index()\n",
+ " # merge B and I entities to form one entity group\n",
+ " # (i.e. \"B-Subsidiary\" and \"I-Subsidiary\" become just \"subsidiary\"), assign a group ID\n",
+ " entities_df[\"group\"] = (entities_df[\"iob_pred\"].str.startswith(\"B-\")).cumsum()\n",
+ " grouped_df = (\n",
+ " entities_df.groupby([\"group\", \"pred\"])[\"word\"]\n",
+ " .apply(\" \".join)\n",
+ " .reset_index()[[\"pred\", \"word\"]]\n",
+ " )\n",
+ " # assign a new row every time there's a new subsidiary\n",
+ " grouped_df[\"row\"] = (grouped_df[\"pred\"].str.startswith(\"subsidiary\")).cumsum()\n",
+ " output_df = grouped_df.pivot_table(\n",
+ " index=\"row\", columns=\"pred\", values=\"word\", aggfunc=lambda x: \" \".join(x)\n",
+ " ).reset_index()\n",
+ " if output_df.empty:\n",
+ " return output_df\n",
+ " output_df.loc[:, \"id\"] = doc_dict[\"id\"]\n",
+ " return output_df"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "ea9fe887-43ca-43e2-85e3-bf5371bd165f",
+ "metadata": {},
+ "source": [
+ "Next, wrap the `LayoutLMInferencePipeline` in an `mlflow` `pyfunc` model, which handles loading the pretrained model and managing inputs/outputs."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "4d802e00-1ca4-40b3-b15b-561711a9db70",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "ename": "NameError",
+ "evalue": "name 'training_run_id' is not defined",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
+ "Cell \u001b[0;32mIn[1], line 12\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mmozilla_sec_eia\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmodels\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01msec10k\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mex_21\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mex21_validation_helpers\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[1;32m 8\u001b[0m clean_extracted_df,\n\u001b[1;32m 9\u001b[0m )\n\u001b[1;32m 11\u001b[0m \u001b[38;5;66;03m# If a model was trained in this notebook, use it. Otherwise, use\u001b[39;00m\n\u001b[0;32m---> 12\u001b[0m model_uri \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mruns:/\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[43mtraining_run_id\u001b[49m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/layoutlm_extractor\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 13\u001b[0m model_info \u001b[38;5;241m=\u001b[39m mlflow\u001b[38;5;241m.\u001b[39mmodels\u001b[38;5;241m.\u001b[39mget_model_info(model_uri)\n\u001b[1;32m 15\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_get_data\u001b[39m(dataset):\n",
+ "\u001b[0;31mNameError\u001b[0m: name 'training_run_id' is not defined"
+ ]
+ }
+ ],
+ "source": [
+ "from PIL import Image\n",
+ "\n",
+ "from mozilla_sec_eia.models.sec10k.entities import (\n",
+ " Ex21CompanyOwnership,\n",
+ " Sec10kExtractionMetadata,\n",
+ ")\n",
+ "from mozilla_sec_eia.models.sec10k.ex_21.ex21_validation_helpers import (\n",
+ " clean_extracted_df,\n",
+ ")\n",
+ "\n",
+ "# Use the model trained in this notebook if there is one; otherwise use the model logged by the configured mlflow run.\n",
+ "model_uri = f\"runs:/{training_run_id}/layoutlm_extractor\"\n",
+ "model_info = mlflow.models.get_model_info(model_uri)\n",
+ "\n",
+ "def _get_data(dataset):\n",
+ " yield from dataset\n",
+ "\n",
+ "def _fill_known_nulls(df):\n",
+ " \"\"\"Fill known nulls in location and own per column.\n",
+ "\n",
+ " Fill with known values from rows with same subsidiary.\n",
+ " If an extracted Ex. 21 table looks like the following:\n",
+ "\n",
+ " subsidiary loc own_per\n",
+ " Company A NaN NaN\n",
+ " Company A Delaware 50\n",
+ "\n",
+ " Then fill in the first row with location and ownership\n",
+ " percentage from the second row.\n",
+ " \"\"\"\n",
+ " if \"own_per\" in df:\n",
+ " df[\"own_per\"] = df.groupby([\"id\", \"subsidiary\"])[\"own_per\"].transform(\n",
+ " lambda group: group.ffill()\n",
+ " )\n",
+ " if \"loc\" in df:\n",
+ " df[\"loc\"] = df.groupby([\"id\", \"subsidiary\"])[\"loc\"].transform(\n",
+ " lambda group: group.ffill()\n",
+ " )\n",
+ " return df\n",
+ "\n",
+ "class Ex21Extractor(mlflow.pyfunc.PythonModel):\n",
+ " \"\"\"Create an mlflow pyfunc model to perform full EX21 extraction.\"\"\"\n",
+ " def load_context(self, context):\n",
+ " \"\"\"Load pretrained model.\"\"\"\n",
+ " os.environ[\"PYTORCH_CUDA_ALLOC_CONF\"] = \"expandable_segments:True\"\n",
+ " self.model_components = mlflow.transformers.load_model(\n",
+ " context.artifacts[\"model_components\"], return_type=\"components\"\n",
+ " )\n",
+ "\n",
+ " def predict(self, context, model_input: pd.DataFrame, params=None):\n",
+ " \"\"\"Use pretrained model and inference pipeline to perform inference.\"\"\"\n",
+ " # Rebuild PIL images from the raw bytes, then convert the dataframe to a pyarrow Dataset\n",
+ " model_input[\"image\"] = model_input.apply(\n",
+ " lambda row: Image.frombytes(\n",
+ " row[\"mode\"], (row[\"width\"], row[\"height\"]), row[\"image\"]\n",
+ " ),\n",
+ " axis=1,\n",
+ " )\n",
+ " dataset = Dataset.from_list(model_input.drop([\"mode\", \"width\", \"height\"], axis=1).to_dict(\"records\"))\n",
+ "\n",
+ " # TODO: figure out device argument\n",
+ " pipe = pipeline(\n",
+ " \"token-classification\",\n",
+ " model=self.model_components[\"model\"],\n",
+ " tokenizer=self.model_components[\"tokenizer\"],\n",
+ " pipeline_class=LayoutLMInferencePipeline,\n",
+ " device=torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\"),\n",
+ " )\n",
+ "\n",
+ " logits = []\n",
+ " predictions = []\n",
+ " all_output_df = Ex21CompanyOwnership.example(size=0)\n",
+ " extraction_metadata = Sec10kExtractionMetadata.example(size=0)\n",
+ " for output_dict in pipe(_get_data(dataset)):\n",
+ " logits.append(output_dict[\"logits\"])\n",
+ " predictions.append(output_dict[\"predictions\"])\n",
+ " output_df = output_dict[\"output_df\"]\n",
+ " if not output_df.empty:\n",
+ " filename = get_metadata_filename(output_df[\"id\"].iloc[0])\n",
+ " extraction_metadata.loc[filename, [\"success\"]] = True\n",
+ " all_output_df = pd.concat([all_output_df, output_df])\n",
+ " all_output_df.columns.name = None\n",
+ " all_output_df = clean_extracted_df(all_output_df)\n",
+ " all_output_df = _fill_known_nulls(all_output_df)\n",
+ " all_output_df = all_output_df[[\"id\", \"subsidiary\", \"loc\", \"own_per\"]].drop_duplicates()\n",
+ " all_output_df = all_output_df.reset_index(drop=True)\n",
+ " outputs_dict = {\n",
+ " \"all_output_df\": all_output_df,\n",
+ " \"logits\": logits,\n",
+ " \"predictions\": predictions,\n",
+ " }\n",
+ " return extraction_metadata, outputs_dict\n",
+ "\n",
+ "# Save model to local temp dir with artifacts, then reload for evaluation\n",
+ "with TemporaryDirectory() as tmp_dir:\n",
+ " mlflow.pyfunc.save_model(\n",
+ " path=tmp_dir,\n",
+ " python_model=Ex21Extractor(),\n",
+ " artifacts={\"model_components\": model_uri},\n",
+ " )\n",
+ " ex21_extraction_model = mlflow.pyfunc.load_model(tmp_dir)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "fee84b13-6c37-4afe-8faa-003ff149aa2d",
+ "metadata": {
+ "tags": []
+ },
+ "source": [
+ "### Model Evaluation\n",
+ "Now the full extraction model can be evaluated using labeled validation data and logged to `mlflow`. The `mlflow` run used to evaluate and log the inference model will be created as a nested child run of the run used to train `layoutlm`. This setup allows multiple versions/configurations of inference to be associated with a single version of `layoutlm`, creating a clean organizational structure for testing the base model and inference logic separately."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "1dee550f-7b06-4091-a65e-71c6b23a5bea",
+ "metadata": {
+ "tags": []
+ },
+ "source": [
+ "#### Validate model\n",
+ "Finally, run the full model on the validation set and log the metrics to `mlflow`. The logged metrics and model will appear in a nested run under the training run used for the current version of the model."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "dfb56470-8527-424c-a9e5-4135e55fde4d",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2024/10/16 17:11:53 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.\n",
+ "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+ " warnings.warn(\n",
+ "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+ " warnings.warn(\n",
+ "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+ " warnings.warn(\n",
+ "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+ " warnings.warn(\n",
+ "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+ " warnings.warn(\n",
+ "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+ " warnings.warn(\n",
+ "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+ " warnings.warn(\n",
+ "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+ " warnings.warn(\n",
+ "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+ " warnings.warn(\n",
+ "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+ " warnings.warn(\n",
+ "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+ " warnings.warn(\n",
+ "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+ " warnings.warn(\n",
+ "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+ " warnings.warn(\n",
+ "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+ " warnings.warn(\n",
+ "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+ " warnings.warn(\n",
+ "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+ " warnings.warn(\n",
+ "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+ " warnings.warn(\n",
+ "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+ " warnings.warn(\n",
+ "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+ " warnings.warn(\n",
+ "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+ " warnings.warn(\n",
+ "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+ " warnings.warn(\n",
+ "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+ " warnings.warn(\n",
+ "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+ " warnings.warn(\n",
+ "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+ " warnings.warn(\n",
+ "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+ " warnings.warn(\n",
+ "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+ " warnings.warn(\n",
+ "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+ " warnings.warn(\n",
+ "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+ " warnings.warn(\n",
+ "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+ " warnings.warn(\n",
+ "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+ " warnings.warn(\n",
+ "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+ " warnings.warn(\n",
+ "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+ " warnings.warn(\n",
+ "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+ " warnings.warn(\n",
+ "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+ " warnings.warn(\n",
+ "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+ " warnings.warn(\n",
+ "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+ " warnings.warn(\n",
+ "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+ " warnings.warn(\n",
+ "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+ " warnings.warn(\n",
+ "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+ " warnings.warn(\n",
+ "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+ " warnings.warn(\n",
+ "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+ " warnings.warn(\n",
+ "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+ " warnings.warn(\n",
+ "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+ " warnings.warn(\n",
+ "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+ " warnings.warn(\n",
+ "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+ " warnings.warn(\n",
+ "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+ " warnings.warn(\n",
+ "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+ " warnings.warn(\n",
+ "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+ " warnings.warn(\n",
+ "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+ " warnings.warn(\n",
+ "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+ " warnings.warn(\n",
+ "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+ " warnings.warn(\n",
+ "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+ " warnings.warn(\n",
+ "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+ " warnings.warn(\n",
+ "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+ " warnings.warn(\n",
+ "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+ " warnings.warn(\n",
+ "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+ " warnings.warn(\n",
+ "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+ " warnings.warn(\n",
+ "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+ " warnings.warn(\n",
+ "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+ " warnings.warn(\n",
+ "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+ " warnings.warn(\n",
+ "/tmp/ipykernel_48762/2514174394.py:29: FutureWarning: Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n",
+ " lambda group: group.ffill()\n",
+ "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+ " padded_compute_set = pd.concat(\n",
+ "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+ " padded_validation_set = pd.concat(\n",
+ "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+ " padded_compute_set = pd.concat(\n",
+ "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+ " padded_validation_set = pd.concat(\n",
+ "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+ " padded_compute_set = pd.concat(\n",
+ "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+ " padded_compute_set = pd.concat(\n",
+ "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+ " padded_validation_set = pd.concat(\n",
+ "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+ " padded_compute_set = pd.concat(\n",
+ "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+ " padded_validation_set = pd.concat(\n",
+ "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+ " padded_compute_set = pd.concat(\n",
+ "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+ " padded_validation_set = pd.concat(\n",
+ "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+ " padded_validation_set = pd.concat(\n",
+ "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+ " padded_compute_set = pd.concat(\n",
+ "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+ " padded_validation_set = pd.concat(\n",
+ "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+ " padded_compute_set = pd.concat(\n",
+ "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+ " padded_compute_set = pd.concat(\n",
+ "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+ " padded_validation_set = pd.concat(\n",
+ "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+ " padded_compute_set = pd.concat(\n",
+ "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+ " padded_validation_set = pd.concat(\n",
+ "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+ " padded_compute_set = pd.concat(\n",
+ "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+ " padded_validation_set = pd.concat(\n",
+ "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+ " padded_compute_set = pd.concat(\n",
+ "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+ " padded_validation_set = pd.concat(\n",
+ "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+ " padded_validation_set = pd.concat(\n",
+ "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+ " padded_compute_set = pd.concat(\n",
+ "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+ " padded_validation_set = pd.concat(\n",
+ "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+ " padded_compute_set = pd.concat(\n",
+ "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+ " padded_validation_set = pd.concat(\n",
+ "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+ " padded_compute_set = pd.concat(\n",
+ "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+ " padded_validation_set = pd.concat(\n",
+ "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+ " padded_compute_set = pd.concat(\n",
+ "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+ " padded_validation_set = pd.concat(\n",
+ "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+ " padded_compute_set = pd.concat(\n",
+ "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+ " padded_validation_set = pd.concat(\n",
+ "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+ " padded_compute_set = pd.concat(\n",
+ "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+ " padded_validation_set = pd.concat(\n",
+ "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+ " padded_compute_set = pd.concat(\n",
+ "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+ " padded_validation_set = pd.concat(\n",
+ "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+ " padded_compute_set = pd.concat(\n",
+ "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+ " padded_validation_set = pd.concat(\n",
+ "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+ " padded_compute_set = pd.concat(\n",
+ "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+ " padded_validation_set = pd.concat(\n",
+ "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+ " padded_validation_set = pd.concat(\n",
+ "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+ " padded_validation_set = pd.concat(\n",
+ "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+ " padded_compute_set = pd.concat(\n",
+ "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+ " padded_validation_set = pd.concat(\n",
+ "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+ " padded_compute_set = pd.concat(\n",
+ "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+ " padded_validation_set = pd.concat(\n",
+ "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+ " padded_compute_set = pd.concat(\n",
+ "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+ " padded_validation_set = pd.concat(\n",
+ "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+ " padded_compute_set = pd.concat(\n",
+ "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+ " padded_validation_set = pd.concat(\n",
+ "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+ " padded_compute_set = pd.concat(\n",
+ "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+ " padded_validation_set = pd.concat(\n",
+ "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+ " padded_compute_set = pd.concat(\n",
+ "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+ " padded_validation_set = pd.concat(\n",
+ "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+ " padded_compute_set = pd.concat(\n",
+ "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+ " padded_validation_set = pd.concat(\n",
+ "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+ " padded_compute_set = pd.concat(\n",
+ "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+ " padded_compute_set = pd.concat(\n",
+ "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+ " padded_validation_set = pd.concat(\n",
+ "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+ " padded_compute_set = pd.concat(\n",
+ "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+ " padded_validation_set = pd.concat(\n",
+ "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+ " padded_compute_set = pd.concat(\n",
+ "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+ " padded_validation_set = pd.concat(\n",
+ "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+ " padded_compute_set = pd.concat(\n",
+ "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+ " padded_compute_set = pd.concat(\n",
+ "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+ " padded_validation_set = pd.concat(\n",
+ "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+ " padded_compute_set = pd.concat(\n",
+ "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+ " padded_compute_set = pd.concat(\n",
+ "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+ " padded_validation_set = pd.concat(\n",
+ "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+ " padded_compute_set = pd.concat(\n",
+ "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+ " padded_compute_set = pd.concat(\n",
+ "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+ " padded_validation_set = pd.concat(\n",
+ "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+ " padded_compute_set = pd.concat(\n",
+ "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+ " padded_validation_set = pd.concat(\n",
+ "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+ " padded_compute_set = pd.concat(\n",
+ "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+ " padded_compute_set = pd.concat(\n",
+ "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+ " padded_validation_set = pd.concat(\n",
+ "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+ " padded_compute_set = pd.concat(\n",
+ "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+ " padded_validation_set = pd.concat(\n",
+ "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+ " padded_compute_set = pd.concat(\n",
+ "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+ " padded_validation_set = pd.concat(\n",
+ "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+ " padded_compute_set = pd.concat(\n",
+ "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+ " padded_validation_set = pd.concat(\n",
+ "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+ " padded_compute_set = pd.concat(\n",
+ "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+ " padded_validation_set = pd.concat(\n",
+ "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+ " padded_compute_set = pd.concat(\n",
+ "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+ " padded_validation_set = pd.concat(\n",
+ "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+ " padded_compute_set = pd.concat(\n",
+ "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+ " padded_compute_set = pd.concat(\n",
+ "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+ " padded_compute_set = pd.concat(\n",
+ "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+ " padded_compute_set = pd.concat(\n",
+ "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+ " padded_validation_set = pd.concat(\n",
+ "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+ " padded_compute_set = pd.concat(\n",
+ "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+ " padded_validation_set = pd.concat(\n",
+ "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+ " padded_compute_set = pd.concat(\n",
+ "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+ " padded_validation_set = pd.concat(\n",
+ "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+ " padded_compute_set = pd.concat(\n",
+ "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+ " padded_validation_set = pd.concat(\n",
+ "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+ " padded_compute_set = pd.concat(\n",
+ "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+ " padded_compute_set = pd.concat(\n",
+ "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+ " padded_validation_set = pd.concat(\n",
+ "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+ " padded_compute_set = pd.concat(\n",
+ "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/mlflow/types/utils.py:407: UserWarning: Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values `_ for more details.\n",
+ " warnings.warn(\n"
+ ]
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "db36592620c244479123275dfc464648",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Downloading artifacts: 0%| | 0/17 [00:00, ?it/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2024/10/16 17:12:58 WARNING mlflow.utils.requirements_utils: The following packages were not found in the public PyPI package index as of 2024-08-29; if these packages are not present in the public PyPI index, you must install them manually before loading your model: {'catalystcoop-mozilla-sec-eia'}\n",
+ "2024/10/16 17:12:58 WARNING mlflow.utils.requirements_utils: Found catalystcoop-mozilla-sec-eia version (0.1.dev353+gdf5fe0d.d20241011) contains a local version label (+gdf5fe0d.d20241011). MLflow logged a pip requirement for this package as 'catalystcoop-mozilla-sec-eia==0.1.dev353' without the local version label to make it installable from PyPI. To specify pip requirements containing local version labels, please use `conda_env` or `pip_requirements`.\n"
+ ]
+ }
+ ],
+ "source": [
+ "from mlflow.models import infer_signature\n",
+ "\n",
+ "from mozilla_sec_eia.models.sec10k.ex_21.ex21_validation_helpers import (\n",
+ " ex21_validation_metrics,\n",
+ ")\n",
+ "\n",
+ "with mlflow.start_run(parent_run_id=model_info.run_id, nested=True):\n",
+ " metadata, outputs_dict = ex21_extraction_model.predict(ex21_inference_dataset.copy())\n",
+ " extracted = outputs_dict[\"all_output_df\"]\n",
+ " metadata = pd.concat([ex21_failed_parsing_metadata, metadata])\n",
+ "\n",
+ " jaccard_df, prec_recall_df, incorrect_filenames, metrics = ex21_validation_metrics(extracted, ex21_validation_set)\n",
+ " mlflow.log_metrics(metrics)\n",
+ " mlflow.pyfunc.log_model(\n",
+ " \"exhibit21_extractor\",\n",
+ " python_model=Ex21Extractor(),\n",
+ " artifacts={\"model_components\": model_uri},\n",
+ " signature=infer_signature(ex21_inference_dataset, extracted), # NOTE: model returns a second dataframe with metadata, but mlflow only supports one in signature\n",
+ " )\n",
+ " mlflow.log_table(extracted, \"extracted_data.json\")\n",
+ " mlflow.log_table(metadata, \"extraction_metadata.json\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d11e2a7b-ec74-4930-b331-144a8584c72f",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.0"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/src/mozilla_sec_eia/models/sec10k/utils/cloud.py b/src/mozilla_sec_eia/models/sec10k/utils/cloud.py
index 1a5ec96..f2e284a 100644
--- a/src/mozilla_sec_eia/models/sec10k/utils/cloud.py
+++ b/src/mozilla_sec_eia/models/sec10k/utils/cloud.py
@@ -29,6 +29,20 @@ def _compute_md5(file_path: Path) -> str:
return base64.b64encode(hash_md5.digest()).decode()
+def convert_ex21_id_to_filename(df: pd.DataFrame, id_col_name: str = "id"):
+    """Add a ``filename`` column holding each filing's GCS archive path.
+
+    The IDs on the extracted Ex. 21 tables differ from the filenames used in
+    the GCS archive. The filename is derived from the ID by swapping its first
+    ``-`` for ``/``, prefixing ``edgar/data/`` and appending ``.txt``.
+
+    Args:
+        df: Table with an ID column. It is modified in place.
+        id_col_name: Name of the column that holds the IDs.
+
+    Returns:
+        The same dataframe object, now carrying the ``filename`` column.
+    """
+    archive_path = df[id_col_name].str.replace("-", "/", n=1)
+    df.loc[:, "filename"] = "edgar/data/" + archive_path + ".txt"
+    return df
+
+
class Exhibit21(BaseModel):
"""This is a class to wrap Exhibit 21's, which are included in many SEC 10ks."""
diff --git a/src/mozilla_sec_eia/models/sec_eia_record_linkage/__init__.py b/src/mozilla_sec_eia/models/sec_eia_record_linkage/__init__.py
new file mode 100644
index 0000000..3350449
--- /dev/null
+++ b/src/mozilla_sec_eia/models/sec_eia_record_linkage/__init__.py
@@ -0,0 +1,82 @@
+"""Implement record linkage model between SEC companies and EIA utilities."""
+
+from dagster import (
+ AssetKey,
+ AssetSpec,
+ Definitions,
+ StaticPartitionsDefinition,
+ load_assets_from_modules,
+)
+from dagstermill import (
+ ConfigurableLocalOutputNotebookIOManager,
+)
+from upath import UPath
+
+from mozilla_sec_eia.library import model_jobs
+from mozilla_sec_eia.library.generic_io_managers import (
+ PandasParquetIOManager,
+ PickleUPathIOManager,
+)
+from mozilla_sec_eia.library.mlflow import (
+ MlflowPyfuncModelIOManager,
+ mlflow_interface_resource,
+ mlflow_train_test_io_managers,
+)
+from mozilla_sec_eia.models.sec10k.utils.cloud import cloud_interface_resource
+
+from ..sec10k.extract import year_quarter_partitions
+from . import transform_eia_input, transform_sec_input
+
+# Collect every asset defined in the EIA and SEC input-transformation modules.
+eia_assets = load_assets_from_modules([transform_eia_input])
+sec_assets = load_assets_from_modules([transform_sec_input])
+
+# One production job per input table, built from each module's
+# ``production_assets`` list.
+eia_input_table_production_job = model_jobs.create_production_model_job(
+    "eia_input_table_creation", transform_eia_input.production_assets
+)
+sec_input_table_production_job = model_jobs.create_production_model_job(
+    "sec_input_table_creation", transform_sec_input.production_assets
+)
+
+# Create year_quarter partitions
+# Every key from ``year_quarter_partitions`` is kept except the six quarters
+# listed below.
+# NOTE(review): the reason for excluding those quarters is not visible here;
+# the name suggests they have not been completed upstream -- confirm.
+completed_partitions = StaticPartitionsDefinition(
+    [
+        year_quarter
+        for year_quarter in year_quarter_partitions.get_partition_keys()
+        if year_quarter
+        not in ["2018q1", "2018q2", "2019q1", "2020q1", "2021q1", "2022q1"]
+    ]
+)
+
+# The three specs below only declare assets (key, partitions, IO manager key);
+# nothing in this module computes them.
+# NOTE(review): presumably they are materialized by the sec10k pipeline -- confirm.
+basic_10k_company_info = AssetSpec(
+    key=AssetKey("basic_10k_company_info"), partitions_def=completed_partitions
+).with_io_manager_key("pandas_parquet_io_manager")
+
+ex21_company_ownership_info = AssetSpec(
+    key=AssetKey("ex21_company_ownership_info"), partitions_def=completed_partitions
+).with_io_manager_key("pandas_parquet_io_manager")
+
+# NOTE(review): "io_manager" is not among the resources named explicitly
+# below; confirm it is supplied by ``mlflow_train_test_io_managers`` or by
+# Dagster's default IO manager.
+sec10k_filing_metadata = AssetSpec(
+    key=AssetKey("sec10k_filing_metadata"), partitions_def=completed_partitions
+).with_io_manager_key("io_manager")
+
+# Code-location definitions: all assets, both production jobs, and the
+# resources they use. The explicit resource dict is merged with
+# ``mlflow_train_test_io_managers`` via ``|``.
+defs = Definitions(
+    sec_assets
+    + eia_assets
+    + [basic_10k_company_info, ex21_company_ownership_info, sec10k_filing_metadata],
+    jobs=[eia_input_table_production_job, sec_input_table_production_job],
+    resources={
+        "cloud_interface": cloud_interface_resource,
+        "mlflow_interface": mlflow_interface_resource,
+        "pandas_parquet_io_manager": PandasParquetIOManager(
+            base_path=UPath("gs://sec10k-outputs/v2")
+        ),
+        "pickle_gcs_io_manager": PickleUPathIOManager(
+            base_path=UPath("gs://sec10k-outputs/dagster_storage")
+        ),
+        "pyfunc_model_io_manager": MlflowPyfuncModelIOManager(
+            mlflow_interface=mlflow_interface_resource
+        ),
+        "output_notebook_io_manager": ConfigurableLocalOutputNotebookIOManager(),
+    }
+    | mlflow_train_test_io_managers,
+)
diff --git a/src/mozilla_sec_eia/models/sec_eia_record_linkage/sec_eia_splink_config.py b/src/mozilla_sec_eia/models/sec_eia_record_linkage/sec_eia_splink_config.py
new file mode 100644
index 0000000..c8ccfd9
--- /dev/null
+++ b/src/mozilla_sec_eia/models/sec_eia_record_linkage/sec_eia_splink_config.py
@@ -0,0 +1,56 @@
+"""Configuration file for the splink SEC to EIA record linkage model."""
+
+import splink.comparison_library as cl
+from splink import block_on
+
+# Columns that the EIA and SEC transform modules strip and lower-case
+# (``x.str.strip().str.lower()``) before linkage.
+STR_COLS = [
+    "company_name",
+    "street_address",
+    "street_address_2",
+    "city",
+    "state",
+    "zip_code",
+]
+
+# NOTE(review): presumably the columns both input tables must share for the
+# linkage model; no use of this list is visible in this module -- confirm.
+SHARED_COLS = [
+    "record_id",
+    "report_date",
+    "report_year",
+    "company_name",
+    "company_name_no_legal",
+    "company_name_mphone",
+    "street_address",
+    "street_address_2",
+    "city",
+    "state",  # could use state of incorporation from SEC
+    "zip_code",
+    "phone_number",
+]
+
+# NOTE(review): presumably the columns compared when scoring a candidate
+# pair; no use of this list is visible in this module -- confirm.
+MATCH_COLS = ["company_name", "state", "city", "street_address"]
+
+# SQL blocking rules written against the left (``l``) and right (``r``)
+# records: a shared 4-character ``company_name_mphone`` prefix, an identical
+# street address, or a shared 2-character prefix plus an identical city.
+BLOCKING_RULES = [
+    "substr(l.company_name_mphone,1,4) = substr(r.company_name_mphone,1,4)",
+    "l.street_address = r.street_address",
+    "substr(l.company_name_mphone,1,2) = substr(r.company_name_mphone,1,2) and l.city = r.city",
+    # "substr(l.company_name_mphone,1,2) = substr(r.company_name_mphone,1,2) and array_length(list_intersect(l.street_address_list, r.street_address_list)) >= 2",
+]
+
+# Name comparison on ``company_name_no_legal`` with a single Jaro-Winkler
+# threshold of 0.95.
+company_name_comparison = cl.NameComparison(
+    "company_name_no_legal",
+    jaro_winkler_thresholds=[0.95],
+)
+
+# Street addresses compared with a single Levenshtein distance threshold of 1,
+# with term frequency adjustments enabled.
+address_comparison = cl.LevenshteinAtThresholds(
+    "street_address", distance_threshold_or_thresholds=[1]
+).configure(term_frequency_adjustments=True)
+
+# Exact match on state (term frequency adjusted) and a name comparison on city
+# with a Jaro-Winkler threshold of 0.9.
+state_comparison = cl.ExactMatch("state").configure(term_frequency_adjustments=True)
+city_comparison = cl.NameComparison("city", jaro_winkler_thresholds=[0.9])
+
+# blocking rules for estimating probability two random records match
+# NOTE(review): the first rule passes "company_name_mphone" to ``block_on``
+# twice; confirm whether a second, different column was intended.
+deterministic_blocking_rules = [
+    block_on("company_name_mphone", "company_name_mphone"),
+    "jaro_winkler_similarity(r.company_name, l.company_name) >= .95 and l.city = r.city",
+    "substr(l.company_name_mphone,1,4) = substr(r.company_name_mphone,1,4) and l.city = r.city and l.street_address = r.street_address",
+]
diff --git a/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_eia_input.py b/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_eia_input.py
new file mode 100644
index 0000000..c8f311c
--- /dev/null
+++ b/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_eia_input.py
@@ -0,0 +1,117 @@
+"""Create an EIA input utilities table that's ready for record linkage with the SEC 10K companies."""
+
+import numpy as np
+import pandas as pd
+from dagster import AssetOut, asset
+
+from mozilla_sec_eia.library.record_linkage_utils import (
+ expand_street_name_abbreviations,
+ fill_street_address_nulls,
+ flatten_companies_across_time,
+ transform_company_name,
+)
+from mozilla_sec_eia.models.sec_eia_record_linkage.sec_eia_splink_config import STR_COLS
+
+# Column renames applied to the combined EIA utility table (via
+# ``DataFrame.rename`` in ``eia_rl_input_table``).
+EIA_COL_MAP = {
+    "utility_name_eia": "company_name",  # TODO: should be linking to owner or operator name?
+    "address_2": "street_address_2",
+}
+
+
+# TODO: make Dagster inputs instead of reading from AWS?
+def harvest_eia861_utilities():
+    """Build a table of the utilities reported in EIA Form 861.
+
+    Utility names are gathered from four EIA-861 tables and left-merged onto
+    the utility association table, keeping one row per report date and
+    utility ID. Address, city and state of a ``new_parent`` from the mergers
+    table are then attached to utilities whose name equals that parent on the
+    same report date; where a merger state is present it replaces ``state``.
+
+    TODO: In PUDL we should eventually implement an actual thorough
+    harvesting of utilities from all EIA Form 861 tables, but this is
+    good enough for now.
+
+    Returns:
+        One row per ``report_date`` / ``utility_id_eia`` pair.
+    """
+    utility_keys = ["report_date", "utility_id_eia"]
+    name_cols = ["report_date", "utility_id_eia", "utility_name_eia"]
+    name_source_urls = [
+        "s3://pudl.catalyst.coop/stable/core_eia861__yearly_utility_data_misc.parquet",
+        "s3://pudl.catalyst.coop/stable/core_eia861__yearly_operational_data_misc.parquet",
+        "s3://pudl.catalyst.coop/stable/core_eia861__yearly_demand_side_management_misc.parquet",
+        "s3://pudl.catalyst.coop/stable/core_eia861__yearly_energy_efficiency.parquet",
+    ]
+
+    assn_df = pd.read_parquet(
+        "s3://pudl.catalyst.coop/stable/core_eia861__assn_utility.parquet"
+    )
+    names_df = pd.concat([pd.read_parquet(url)[name_cols] for url in name_source_urls])
+    # Several name rows can exist per utility and date; only the first is kept.
+    utils_df = assn_df.merge(names_df, on=utility_keys, how="left").drop_duplicates(
+        subset=utility_keys
+    )
+
+    mergers_df = pd.read_parquet(
+        "s3://pudl.catalyst.coop/stable/core_eia861__yearly_mergers.parquet"
+    )
+    parent_rows = mergers_df[mergers_df["new_parent"].notna()]
+    parent_cols = [
+        "report_date",
+        "new_parent",
+        "merge_address",
+        "merge_city",
+        "merge_state",
+    ]
+    utils_df = utils_df.merge(
+        parent_rows[parent_cols],
+        how="left",
+        left_on=["report_date", "utility_name_eia"],
+        right_on=["report_date", "new_parent"],
+    )
+    utils_df = utils_df.rename(
+        columns={"merge_address": "street_address", "merge_city": "city"}
+    )
+    # Collapse any rows duplicated by the mergers join back to one per utility
+    # and date, taking the first non-null value of every column.
+    utils_df = utils_df.groupby(utility_keys).first().reset_index()
+
+    merge_state = utils_df["merge_state"]
+    utils_df["state"] = utils_df["state"].where(merge_state.isnull(), merge_state)
+    return utils_df.drop(columns=["new_parent", "merge_state"])
+
+
+# TODO: allow year partitions?
+# TODO: add Dagster asset inputs for PUDL inputs instead of reading from AWS?
+# ``@asset`` has no ``outs`` argument (that belongs to ``@multi_asset``), so
+# the asset key and IO manager are set with ``name`` and ``io_manager_key``.
+# The key matches the ``AssetIn("core_eia__parents_and_subsidiaries")`` used
+# by the SEC output asset.
+@asset(
+    name="core_eia__parents_and_subsidiaries",
+    io_manager_key="pandas_parquet_io_manager",
+)
+def eia_rl_input_table():
+    """Create a table of EIA Form 860 and 861 utilities.
+
+    The PUDL yearly utilities table is stacked with the harvested EIA-861
+    utilities. Rows without a utility name are dropped, columns are renamed
+    with ``EIA_COL_MAP``, and ``report_year`` plus a 5-character ``zip_code``
+    are derived. Names and addresses are then cleaned with the shared record
+    linkage helpers, and the table is flattened across time on company name
+    and street address.
+
+    Returns:
+        The cleaned EIA utility table with a ``record_id`` column taken from
+        the index of the flattened table.
+    """
+    raw_eia_df = pd.read_parquet(
+        "s3://pudl.catalyst.coop/stable/out_eia__yearly_utilities.parquet"
+    )
+    eia861_df = harvest_eia861_utilities()
+    eia_df = (
+        pd.concat([raw_eia_df, eia861_df])
+        .dropna(subset=["utility_name_eia"])
+        .rename(columns=EIA_COL_MAP)
+        .assign(
+            report_date=lambda df: df["report_date"].astype("datetime64[ns]"),
+            report_year=lambda df: df["report_date"].dt.year,
+            zip_code=lambda df: df["zip_code"].str[:5],
+        )
+        .pipe(transform_company_name)
+        .pipe(fill_street_address_nulls)
+        .pipe(lambda df: df.fillna(np.nan))
+        .reset_index(drop=True)
+    )
+    # Normalize whitespace and case of the string columns before expanding
+    # street abbreviations.
+    eia_df[STR_COLS] = eia_df[STR_COLS].apply(lambda x: x.str.strip().str.lower())
+    eia_df["street_address"] = expand_street_name_abbreviations(
+        eia_df["street_address"]
+    )
+    eia_df = flatten_companies_across_time(
+        df=eia_df, key_cols=["company_name", "street_address"]
+    ).reset_index(names="record_id")
+
+    return eia_df
+
+
+# Assets passed to ``create_production_model_job`` for the
+# "eia_input_table_creation" job in the package ``__init__``.
+production_assets = [eia_rl_input_table]
diff --git a/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_sec_input.py b/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_sec_input.py
new file mode 100644
index 0000000..666f010
--- /dev/null
+++ b/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_sec_input.py
@@ -0,0 +1,392 @@
+"""Module for creating an SEC 10K output table with filing companies and subsidiary companies."""
+
+import logging
+import re
+from importlib import resources
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+from dagster import AssetIn, asset
+
+from mozilla_sec_eia.library.record_linkage_utils import (
+ expand_street_name_abbreviations,
+ fill_street_address_nulls,
+ flatten_companies_across_time,
+ transform_company_name,
+)
+from mozilla_sec_eia.models.sec10k.utils.cloud import (
+ convert_ex21_id_to_filename,
+)
+from mozilla_sec_eia.models.sec_eia_record_linkage.sec_eia_splink_config import STR_COLS
+
+logger = logging.getLogger(f"catalystcoop.{__name__}")
+
+
+# Renames applied to the extracted Ex. 21 table in
+# ``transformed_ex21_subsidiary_table``.
+EX21_COL_MAP = {"subsidiary": "company_name", "loc": "location_of_inc"}
+# Renames applied to the pivoted basic 10-K table in ``transform_basic10k_table``.
+SEC_COL_MAP = {
+    "company_conformed_name": "company_name",
+    "street_1": "street_address",
+    "street_2": "street_address_2",
+    "zip": "zip_code",
+    "business_phone": "phone_number",
+}
+
+# NOTE(review): not referenced in the visible part of this module; presumably
+# strings that should not be accepted as a company name -- confirm its use.
+INVALID_NAMES = [
+    "llc",
+    "limited liability company",
+    "limited",
+    "ltd",
+    "iiii",
+    "inc",
+    "incorporated",
+    "partnership",
+    "i",
+    "name",
+    "company",
+    "&",
+    "",
+]
+
+
+def _remove_weird_sec_cols(sec_df) -> pd.DataFrame:
+    """Fold the ``]``-prefixed columns into their normally named counterparts.
+
+    For each known malformed column that is present, its non-null values
+    overwrite the normal column and the malformed column is dropped. If only
+    the malformed column exists it is renamed to the normal name instead.
+
+    Args:
+        sec_df: Basic 10-K table with one column per extracted key.
+
+    Returns:
+        The table without any of the ``]``-prefixed columns.
+    """
+    weird_cols = ["]fiscal_year_end", "]irs_number", "]state_of_incorporation"]
+    for weird_col in weird_cols:
+        if weird_col not in sec_df:
+            continue
+        normal_col = weird_col[1:]
+        if normal_col not in sec_df:
+            # Nothing to merge into: the malformed column simply takes the
+            # normal name (previously this case raised a KeyError).
+            sec_df = sec_df.rename(columns={weird_col: normal_col})
+            continue
+        # Keep the normal value unless the malformed column has one.
+        sec_df.loc[:, normal_col] = sec_df[normal_col].where(
+            sec_df[weird_col].isnull(), sec_df[weird_col]
+        )
+        sec_df = sec_df.drop(columns=[weird_col])
+    return sec_df
+
+
+def _add_report_year_to_sec(sec_df: pd.DataFrame, md: pd.DataFrame) -> pd.DataFrame:
+    """Attach each filing's date and year to extracted SEC data.
+
+    The metadata's ``date_filed`` is left-merged onto the SEC data using
+    ``filename`` as the merge key and renamed to ``report_date``.
+    ``report_year`` is the year of that date.
+
+    Args:
+        sec_df: Extracted SEC data, merged on ``filename``.
+        md: Filing metadata with ``filename`` and ``date_filed`` columns.
+
+    Returns:
+        The merged table with ``report_date`` and ``report_year`` columns.
+    """
+    filing_dates = md[["filename", "date_filed"]]
+    dated_df = sec_df.merge(filing_dates, how="left", on=["filename"]).rename(
+        columns={"date_filed": "report_date"}
+    )
+    report_dates = dated_df["report_date"].astype("datetime64[ns]")
+    dated_df.loc[:, "report_year"] = report_dates.dt.year
+    return dated_df
+
+
+def get_sec_state_code_dict() -> dict[str, str]:
+    """Create a dictionary mapping state codes to their names.
+
+    Table found at https://www.sec.gov/submit-filings/filer-support-resources/edgar-state-country-codes
+    Published by SEC and reports valid state codes
+    for filers of Form D. Used to standardize the state codes
+    in the SEC 10K filings. The expanded names of the state codes
+    are comments in the XML file, so we have to read the XML in as
+    text and parse it.
+
+    Returns:
+        A mapping from lower-cased code to lower-cased name.
+    """
+    # TODO: make a check to see if SEC has published a new version of this table
+    xml_filepath = (
+        resources.files("mozilla_sec_eia.package_data") / "formDStateCodes.xsd.xml"
+    )
+    # ``resources.files`` returns a Traversable, which is only a real ``Path``
+    # for ordinary on-disk installs. Read through the Traversable API instead
+    # of ``Path.open`` so other package loaders work too.
+    xml_text = xml_filepath.read_text()
+
+    # NOTE(review): as written here the pattern has no capture groups, so
+    # ``re.findall`` yields plain strings and the two-name unpacking below
+    # cannot succeed. The literal looks truncated (XML markup stripped);
+    # restore the original pattern with one group for the code and one for
+    # the name taken from the trailing comment.
+    pattern = r'.*?'
+    return {
+        code.lower(): name.lower()
+        for code, name in re.findall(pattern, xml_text, re.DOTALL)
+    }
+
+
+def clean_location_of_inc(df) -> pd.DataFrame:
+    """Clean location of incorporation column in SEC basic 10K or Ex. 21 dataframe.
+
+    The column is stripped and lower-cased, empty strings become null, and
+    SEC state codes are expanded to their names.
+
+    Arguments:
+        df: Ex. 21 or SEC 10K basic info dataframe with location_of_inc
+            column. If a ``state_of_incorporation`` column is present it is
+            copied into ``location_of_inc`` first.
+
+    Returns:
+        The same dataframe with a cleaned ``location_of_inc`` column.
+    """
+    if "state_of_incorporation" in df:
+        df.loc[:, "location_of_inc"] = df["state_of_incorporation"]
+    state_code_to_name = get_sec_state_code_dict()
+    df.loc[:, "location_of_inc"] = (
+        df["location_of_inc"]
+        .fillna(pd.NA)
+        .str.strip()
+        .str.lower()
+        .replace("", pd.NA)
+        # The code dictionary has lower-cased keys, so codes can only be
+        # expanded after the column has been stripped and lower-cased.
+        .replace(state_code_to_name)
+    )
+    return df
+
+
+def _add_parent_company_cik(ex21_df: pd.DataFrame, md: pd.DataFrame) -> pd.DataFrame:
+    """Add the CIK of the parent company to Ex. 21 subsidiaries.
+
+    The CIK is taken from the filing metadata by left-merging on ``filename``
+    and stored as a 10-character, zero-padded string. Rows whose filename has
+    no metadata keep a null ``parent_company_cik``.
+
+    Args:
+        ex21_df: Ex. 21 subsidiaries with a ``filename`` column.
+        md: Filing metadata with ``filename`` and ``cik`` columns.
+
+    Returns:
+        The merged table with a ``parent_company_cik`` column.
+    """
+    ex21_df = ex21_df.merge(md[["filename", "cik"]], how="left", on="filename").rename(
+        columns={"cik": "parent_company_cik"}
+    )
+    # Unmatched rows upcast the CIK to float, so a plain ``astype(str)`` would
+    # produce values such as "00001234.0" and "0000000nan". Go through a
+    # nullable integer instead and leave missing CIKs null.
+    ciks = pd.to_numeric(ex21_df["parent_company_cik"], errors="coerce")
+    ex21_df["parent_company_cik"] = (
+        ciks.astype("Int64")
+        .astype("string")
+        .str.zfill(10)
+        .astype(object)
+        .where(ciks.notna(), np.nan)
+    )
+    return ex21_df
+
+
+def match_ex21_subsidiaries_to_filer_company(
+    basic10k_df: pd.DataFrame, ex21_df: pd.DataFrame
+) -> pd.DataFrame:
+    """Match Ex. 21 subsidiaries to filer companies.
+
+    We want to assign CIKs to Ex. 21 subsidiaries if they in turn
+    file a 10k. To do this, we merge the Ex. 21 subsidiaries to 10k
+    filers on company name. If there are multiple matches with the same
+    company name we take the company with the most overlap in location of
+    incorporation and nearest report years. Then we merge the CIK back onto
+    the Ex. 21 df.
+
+    Args:
+        basic10k_df: 10k filers; the code reads its ``central_index_key``,
+            ``company_name``, ``location_of_inc`` and ``report_year`` columns.
+        ex21_df: Ex. 21 subsidiaries; the code reads its ``company_name``,
+            ``location_of_inc``, ``report_year`` and ``parent_company_cik``
+            columns.
+
+    Returns:
+        A dataframe of the Ex. 21 subsidiaries with a column for the
+        subsidiaries CIK (null if the subsidiary doesn't file). On return the
+        column is named ``central_index_key``.
+    """
+    basic10k_df = basic10k_df.drop_duplicates(
+        subset=[
+            "central_index_key",
+            "company_name",
+            "location_of_inc",
+            "report_year",
+        ]
+    )
+    # Candidate pairs: every filer/subsidiary combination sharing a name.
+    # Columns present in both inputs get the ``_sec`` / ``_ex21`` suffixes.
+    merged_df = basic10k_df.merge(
+        ex21_df, how="inner", on="company_name", suffixes=("_sec", "_ex21")
+    )
+    # split up the location of incorporation on whitespace, creating a column
+    # with lists of word tokens
+    merged_df.loc[:, "loc_tokens_sec"] = (
+        merged_df["location_of_inc_sec"].fillna("").str.lower().str.split()
+    )
+    merged_df.loc[:, "loc_tokens_ex21"] = (
+        merged_df["location_of_inc_ex21"].fillna("").str.lower().str.split()
+    )
+    # get the number of words overlapping between location of incorporation tokens
+    merged_df["loc_overlap"] = merged_df.apply(
+        lambda row: len(set(row["loc_tokens_sec"]) & set(row["loc_tokens_ex21"])),
+        axis=1,
+    )
+    # get the difference in report years
+    merged_df["report_year_diff"] = merged_df.apply(
+        lambda row: abs(int(row["report_year_sec"]) - int(row["report_year_ex21"])),
+        axis=1,
+    )
+    # Order candidates so that, within a name and Ex. 21 location, the largest
+    # overlap comes first and ties are broken by the smallest year difference.
+    merged_df = merged_df.sort_values(
+        by=[
+            "company_name",
+            "location_of_inc_ex21",
+            "loc_overlap",
+            "report_year_diff",
+        ],
+        ascending=[True, True, False, True],
+    )
+    # Select the row with the highest loc overlap and nearest report years
+    # for each company name, location, and parent company record
+    # NOTE(review): groupby drops rows whose keys are null by default, so
+    # subsidiaries with a null location presumably only receive a CIK through
+    # the name-only fallback further down -- confirm.
+    closest_match_df = merged_df.groupby(
+        ["company_name", "location_of_inc_ex21", "parent_company_cik"], as_index=False
+    ).first()
+    ex21_with_cik_df = ex21_df.merge(
+        closest_match_df[
+            [
+                "company_name",
+                "parent_company_cik",
+                "location_of_inc_ex21",
+                "central_index_key",
+            ]
+        ].rename(columns={"location_of_inc_ex21": "location_of_inc"}),
+        how="left",
+        on=["company_name", "location_of_inc", "parent_company_cik"],
+    ).rename(columns={"central_index_key": "subsidiary_cik"})
+    # if a subsidiary doesn't have a CIK and has a null location
+    # but its company name was assigned a CIK (with a different location)
+    # then assign that CIK to the subsidiary
+    # NOTE(review): this merge is on name alone, so a name with several rows
+    # in ``closest_match_df`` repeats the subsidiary row once per match; the
+    # final ``drop_duplicates`` only removes rows that are fully identical.
+    ex21_with_cik_df = ex21_with_cik_df.merge(
+        closest_match_df[["company_name", "central_index_key"]],
+        how="left",
+        on="company_name",
+    ).rename(columns={"central_index_key": "company_name_merge_cik"})
+    # Keep the existing CIK unless both the CIK and the location are null; in
+    # that case fall back to the CIK found by the name-only merge.
+    ex21_with_cik_df["subsidiary_cik"] = ex21_with_cik_df["subsidiary_cik"].where(
+        ~(ex21_with_cik_df.subsidiary_cik.isnull())
+        | ~(ex21_with_cik_df.location_of_inc.isnull()),
+        ex21_with_cik_df["company_name_merge_cik"],
+    )
+    ex21_with_cik_df = ex21_with_cik_df.drop(columns="company_name_merge_cik")
+    ex21_with_cik_df = ex21_with_cik_df.rename(
+        columns={"subsidiary_cik": "central_index_key"}
+    )
+    ex21_with_cik_df = ex21_with_cik_df.drop_duplicates()
+
+    return ex21_with_cik_df
+
+
+def create_sec_company_id_for_ex21_subs(ex21_df: pd.DataFrame) -> pd.DataFrame:
+    """Create an sec_company_id for Ex. 21 subsidiaries.
+
+    This is a unique identifier string for Ex. 21 subsidiaries.
+    This ID is necessary for tracking subsidiaries who aren't ultimately
+    matched to a 10K filer company.
+
+    The ID has the form ``<parent_company_cik>_<company_name>_<location_of_inc>``.
+    A null location contributes an empty string, so such subsidiaries still
+    get an ID rather than a null one. The ``location_of_inc`` column itself
+    is left unchanged.
+
+    Args:
+        ex21_df: Ex. 21 subsidiaries with ``parent_company_cik``,
+            ``company_name`` and ``location_of_inc`` columns. It is modified
+            in place.
+
+    Returns:
+        The same dataframe with an ``sec_company_id`` column.
+    """
+    # String concatenation with a null yields a null, which would leave every
+    # subsidiary without a location sharing the same (missing) identifier.
+    location = ex21_df["location_of_inc"].fillna("")
+    ex21_df.loc[:, "sec_company_id"] = (
+        ex21_df["parent_company_cik"]
+        + "_"
+        + ex21_df["company_name"]
+        + "_"
+        + location
+    )
+    return ex21_df
+
+
+@asset(
+    ins={
+        "ex21_dfs": AssetIn("ex21_company_ownership_info"),
+        "sec10k_filing_metadata_dfs": AssetIn("sec10k_filing_metadata"),
+    },
+)
+def transformed_ex21_subsidiary_table(
+    ex21_dfs: dict[str, pd.DataFrame],
+    sec10k_filing_metadata_dfs: dict[str, pd.DataFrame],
+) -> pd.DataFrame:
+    """Transform Ex. 21 table of subsidiaries before combining with basic 10k table.
+
+    Args:
+        ex21_dfs: Extracted Ex. 21 tables; the dictionary values are
+            concatenated into one table.
+        sec10k_filing_metadata_dfs: Filing metadata tables; the dictionary
+            values are concatenated into one table.
+
+    Returns:
+        The cleaned subsidiary table with report date and year, parent
+        company CIK and ``sec_company_id``, flattened across time on
+        ``sec_company_id``.
+    """
+    ex21_df = pd.concat(ex21_dfs.values())
+    sec10k_filing_metadata = pd.concat(sec10k_filing_metadata_dfs.values())
+
+    # The helper returns the whole dataframe with ``filename`` added, so its
+    # result replaces the table rather than being assigned to one column.
+    ex21_df = convert_ex21_id_to_filename(ex21_df)
+    ex21_df = ex21_df.drop(columns=["id"])
+    ex21_df = _add_report_year_to_sec(ex21_df, sec10k_filing_metadata)
+    ex21_df = ex21_df.rename(columns=EX21_COL_MAP)
+    ex21_df = clean_location_of_inc(ex21_df)
+    ex21_df = transform_company_name(ex21_df)
+    ex21_df = _add_parent_company_cik(ex21_df, sec10k_filing_metadata)
+    # add an sec_company_id, ultimately this ID become the subsidiary's CIK
+    # if the subsidiary is matched to an SEC filer
+    ex21_df = create_sec_company_id_for_ex21_subs(ex21_df=ex21_df)
+    ex21_df = flatten_companies_across_time(
+        df=ex21_df, key_cols=["sec_company_id"], date_col="report_year"
+    )
+    ex21_df = ex21_df.fillna(np.nan)
+
+    return ex21_df
+
+
+def transform_basic10k_table(
+    basic_10k_df: pd.DataFrame, sec10k_filing_metadata: pd.DataFrame
+) -> pd.DataFrame:
+    """Prepare the SEC basic 10K filer table for record linkage.
+
+    The key/value table is pivoted to one row per filename and one column per
+    key, then cleaned step by step and flattened across time on company name
+    and street address.
+
+    Args:
+        basic_10k_df: Basic 10K data with ``key`` and ``value`` columns and a
+            ``filename`` available after resetting the index.
+        sec10k_filing_metadata: Filing metadata used to add the report date
+            and year.
+
+    Returns:
+        The cleaned and flattened filer table.
+    """
+    wide_df = basic_10k_df.reset_index().pivot_table(
+        values="value", index="filename", columns="key", aggfunc="first"
+    )
+    wide_df.columns.name = None
+    wide_df = wide_df.reset_index()
+
+    wide_df = _remove_weird_sec_cols(wide_df)
+    wide_df = _add_report_year_to_sec(wide_df, sec10k_filing_metadata)
+    wide_df = wide_df.rename(columns=SEC_COL_MAP)
+    wide_df = clean_location_of_inc(wide_df)
+    wide_df = transform_company_name(wide_df)
+    # Filers are identified by their CIK and always file a 10K.
+    wide_df = wide_df.assign(
+        zip_code=wide_df["zip_code"].str[:5],
+        files_10k=True,
+        sec_company_id=wide_df["central_index_key"],
+    )
+    wide_df = fill_street_address_nulls(wide_df)
+
+    wide_df[STR_COLS] = wide_df[STR_COLS].apply(
+        lambda col: col.str.strip().str.lower()
+    )
+    wide_df["street_address"] = expand_street_name_abbreviations(
+        wide_df["street_address"]
+    )
+    # flatten across time on unique company name and address pair
+    return flatten_companies_across_time(
+        df=wide_df, key_cols=["company_name", "street_address"]
+    )
+
+
+@asset(
+    ins={
+        "basic_10k_dfs": AssetIn("basic_10k_company_info"),
+        "sec10k_filing_metadata_dfs": AssetIn("sec10k_filing_metadata"),
+    },
+)
+def core_sec_10k__filers(
+    basic_10k_dfs: dict[str, pd.DataFrame],
+    sec10k_filing_metadata_dfs: dict[str, pd.DataFrame],
+) -> pd.DataFrame:
+    """Asset holding the cleaned table of SEC 10K filers.
+
+    The input tables are concatenated, transformed with
+    ``transform_basic10k_table`` (which flattens the table across time on
+    company name and street address), and given a ``record_id`` column from
+    the resulting index. Linking filers to EIA utilities is still a TODO.
+    """
+    filing_metadata = pd.concat(sec10k_filing_metadata_dfs.values())
+    raw_filers_df = pd.concat(basic_10k_dfs.values())
+    filers_df = transform_basic10k_table(raw_filers_df, filing_metadata)
+    # match EIA utilities to filers
+    # TODO: Here we conduct the match to EIA and add on a column with utility_id_eia
+    return filers_df.fillna(np.nan).reset_index(names="record_id")
+
+
+@asset(
+    ins={
+        "sec_10k_filers_matched_df": AssetIn("core_sec_10k__filers"),
+        "clean_ex21_df": AssetIn("transformed_ex21_subsidiary_table"),
+        "clean_eia_df": AssetIn("core_eia__parents_and_subsidiaries"),
+    },
+)
+def out_sec_10k__parents_and_subsidiaries(
+    sec_10k_filers_matched_df: pd.DataFrame,
+    clean_ex21_df: pd.DataFrame,
+    clean_eia_df: pd.DataFrame,
+) -> pd.DataFrame:
+    """Asset for creating an SEC 10K output table.
+
+    Add in Ex. 21 subsidiaries and link them to already present
+    filing companies. Create an sec_company_id for subsidiaries
+    that aren't linked to a CIK.
+
+    Args:
+        sec_10k_filers_matched_df: Cleaned 10K filers. A ``utility_id_eia``
+            column is used when present; without it no EIA utility counts as
+            already matched.
+        clean_ex21_df: Cleaned Ex. 21 subsidiaries.
+        clean_eia_df: Cleaned EIA utilities with ``utility_id_eia`` and
+            ``company_name`` columns.
+
+    Returns:
+        The filers (with parent CIK and ownership percentage where a
+        subsidiary matched them) stacked with the non-filing subsidiaries.
+    """
+    ex21_df_with_cik = match_ex21_subsidiaries_to_filer_company(
+        basic10k_df=sec_10k_filers_matched_df, ex21_df=clean_ex21_df
+    )
+    has_filer_cik = ex21_df_with_cik["central_index_key"].notna()
+    # Only subsidiaries that received a CIK can describe a filer. Rows with a
+    # null CIK are left out because pandas merges treat null keys as equal.
+    sec_10k_filers_matched_df = sec_10k_filers_matched_df.merge(
+        ex21_df_with_cik.loc[
+            has_filer_cik, ["central_index_key", "parent_company_cik", "own_per"]
+        ],
+        how="left",
+        on="central_index_key",
+    )
+    # get the subsidiary companies that weren't matched to a 10K filing company
+    # (``assign`` returns a new frame instead of writing into a slice)
+    ex21_non_filing_subs_df = ex21_df_with_cik[~has_filer_cik].assign(files_10k=False)
+    # the last step is to take the EIA utilities that haven't been matched
+    # to a filer company, and merge them by company name onto the Ex. 21 subs
+    if "utility_id_eia" in sec_10k_filers_matched_df:
+        matched_utility_ids = sec_10k_filers_matched_df["utility_id_eia"].unique()
+    else:
+        # The filer table has no EIA match column yet (still a TODO upstream).
+        matched_utility_ids = []
+    unmatched_eia_df = clean_eia_df[
+        ~clean_eia_df["utility_id_eia"].isin(matched_utility_ids)
+    ].drop_duplicates(subset="company_name")
+    ex21_non_filing_subs_df = ex21_non_filing_subs_df.merge(
+        unmatched_eia_df[["utility_id_eia", "company_name"]],
+        how="left",
+        on="company_name",
+    ).drop_duplicates(subset="sec_company_id")
+    # Computed outside the f-string: reusing double quotes inside an f-string
+    # is a syntax error before Python 3.12. ``nunique`` ignores nulls, so
+    # unmatched subsidiaries are not counted as a match.
+    n_matched_utilities = ex21_non_filing_subs_df["utility_id_eia"].nunique()
+    logger.info(
+        f"Ex. 21 subsidiary names matched to an EIA utility name: {n_matched_utilities}"
+    )
+    out_df = pd.concat([sec_10k_filers_matched_df, ex21_non_filing_subs_df])
+    return out_df
+
+
+# Assets passed to ``create_production_model_job`` for the
+# "sec_input_table_creation" job in the package ``__init__``.
+production_assets = [
+    core_sec_10k__filers,
+    transformed_ex21_subsidiary_table,
+    out_sec_10k__parents_and_subsidiaries,
+]
diff --git a/src/mozilla_sec_eia/package_data/formDStateCodes.xsd.xml b/src/mozilla_sec_eia/package_data/formDStateCodes.xsd.xml
new file mode 100644
index 0000000..2ec0c2b
--- /dev/null
+++ b/src/mozilla_sec_eia/package_data/formDStateCodes.xsd.xml
@@ -0,0 +1,328 @@
+
+
+
+
+
+
+
+
+
+
+
+ Set of valid State and Country Codes according to EDGAR.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/src/mozilla_sec_eia/package_data/street_suffix_abbreviations.json b/src/mozilla_sec_eia/package_data/street_suffix_abbreviations.json
new file mode 100644
index 0000000..e305113
--- /dev/null
+++ b/src/mozilla_sec_eia/package_data/street_suffix_abbreviations.json
@@ -0,0 +1,203 @@
+{
+ "aly": ["alley", "allee", "ally"],
+ "anx": ["anex", "annex", "annx"],
+ "arc": ["arcade"],
+ "ave": ["avenue", "av", "aven", "avenu", "avn", "avnue"],
+ "byu": ["bayou", "bayoo"],
+ "bch": ["beach"],
+ "bnd": ["bend"],
+ "blf": ["bluff", "bluf"],
+ "blfs": ["bluffs"],
+ "btm": ["bottom", "bot", "bottm"],
+ "blvd": ["boulevard", "boul", "boulv"],
+ "br": ["branch", "brnch"],
+ "brg": ["bridge", "brdge"],
+ "brk": ["brook"],
+ "brks": ["brooks"],
+ "bg": ["burg"],
+ "bgs": ["burgs"],
+ "byp": ["bypass", "bypa", "bypas", "byps"],
+ "cp": ["camp", "cmp"],
+ "cyn": ["canyon", "canyn", "cnyn"],
+ "cpe": ["cape"],
+ "cswy": ["causeway", "causwa"],
+ "ctr": ["center", "cen", "cent", "centr", "centre", "cnter", "cntr"],
+ "ctrs": ["centers"],
+ "cir": ["circle", "circ", "circl", "crcl", "crcle"],
+ "cirs": ["circles"],
+ "clf": ["cliff"],
+ "clfs": ["cliffs"],
+ "clb": ["club"],
+ "cmn": ["common"],
+ "cmns": ["commons"],
+ "cor": ["corner"],
+ "cors": ["corners"],
+ "crse": ["course"],
+ "ct": ["court"],
+ "cts": ["courts"],
+ "cv": ["cove"],
+ "cvs": ["coves"],
+ "crk": ["creek"],
+ "cres": ["crescent", "crsent", "crsnt"],
+ "crst": ["crest"],
+ "xing": ["crossing", "crssng"],
+ "xrd": ["crossroad"],
+ "xrds": ["crossroads"],
+ "curv": ["curve"],
+ "dl": ["dale"],
+ "dm": ["dam"],
+ "dv": ["divide", "div", "dvd"],
+ "dr": ["drive", "driv", "drv"],
+ "drs": ["drives"],
+ "est": ["estate"],
+ "ests": ["estates"],
+ "expy": ["expressway", "exp", "expr", "express", "expw"],
+ "ext": ["extension", "extn", "extnsn"],
+ "exts": ["extensions"],
+ "fls": ["falls"],
+ "fry": ["ferry", "frry"],
+ "fld": ["field"],
+ "flds": ["fields"],
+ "flt": ["flat"],
+ "flts": ["flats"],
+ "frd": ["ford"],
+ "frds": ["fords"],
+ "frst": ["forest", "forests"],
+ "frg": ["forge", "forg"],
+ "frgs": ["forges"],
+ "frk": ["fork"],
+ "frks": ["forks"],
+ "ft": ["fort", "frt"],
+ "fwy": ["freeway", "freewy", "frway", "frwy"],
+ "gdn": ["garden", "gardn", "grden", "grdn"],
+ "gdns": ["gardens", "grdns"],
+ "gtwy": ["gateway", "gatewy", "gatway", "gtway"],
+ "gln": ["glen"],
+ "glns": ["glens"],
+ "grn": ["green"],
+ "grns": ["greens"],
+ "grv": ["grove", "grov"],
+ "grvs": ["groves"],
+ "hbr": ["harbor", "harb", "harbr", "hrbor"],
+ "hbrs": ["harbors"],
+ "hvn": ["haven"],
+ "hts": ["heights", "ht"],
+ "hwy": ["highway", "highwy", "hiway", "hiwy", "hway"],
+ "hl": ["hill"],
+ "hls": ["hills"],
+ "holw": ["hollow", "hllw", "hollows", "holws"],
+ "inlt": ["inlet"],
+ "is": ["island", "islnd"],
+ "iss": ["islands", "islnds"],
+ "isle": ["isles"],
+ "jct": ["junction", "jction", "jctn", "junctn", "juncton"],
+ "jcts": ["junctions", "jctns"],
+ "ky": ["key"],
+ "kys": ["keys"],
+ "knl": ["knoll", "knol"],
+ "knls": ["knolls"],
+ "lk": ["lake"],
+ "lks": ["lakes"],
+ "land": ["land"],
+ "lndg": ["landing", "lndng"],
+ "ln": ["lane"],
+ "lgt": ["light"],
+ "lgts": ["lights"],
+ "lf": ["loaf"],
+ "lck": ["lock"],
+ "lcks": ["locks"],
+ "ldg": ["lodge", "ldge", "lodg"],
+ "loop": ["loops"],
+ "mall": ["mall"],
+ "mnr": ["manor"],
+ "mnrs": ["manors"],
+ "mdw": ["meadow"],
+ "mdws": ["meadows", "mdw", "medows"],
+ "mews": ["mews"],
+ "ml": ["mill"],
+ "mls": ["mills"],
+ "msn": ["mission", "missn", "mssn"],
+ "mtwy": ["motorway"],
+ "mt": ["mount", "mnt"],
+ "mtn": ["mountain", "mntain", "mntn", "mountin", "mtin"],
+ "mtns": ["mountains", "mntns"],
+ "nck": ["neck"],
+ "orch": ["orchard", "orchrd"],
+ "oval": ["ovl"],
+ "opas": ["overpass"],
+ "park": ["parks"],
+ "pkwy": ["parkway", "parkwy", "pkway", "pky", "parkways", "pkwys"],
+ "pass": ["pass"],
+ "psge": ["passage"],
+ "path": ["paths"],
+ "pike": ["pikes"],
+ "pne": ["pine"],
+ "pnes": ["pines"],
+ "pl": ["place"],
+ "pln": ["plain"],
+ "plns": ["plains"],
+ "plz": ["plaza", "plza"],
+ "pt": ["point"],
+ "pts": ["points"],
+ "prt": ["port"],
+ "prts": ["ports"],
+ "pr": ["prairie", "prr"],
+ "radl": ["radial", "rad", "radiel"],
+ "ramp": ["ramp"],
+ "rnch": ["ranch", "ranches", "rnchs"],
+ "rpd": ["rapid"],
+ "rpds": ["rapids"],
+ "rst": ["rest"],
+ "rdg": ["ridge", "rdge"],
+ "rdgs": ["ridges"],
+ "riv": ["river", "rvr", "rivr"],
+ "rd": ["road"],
+ "rds": ["roads"],
+ "rte": ["route"],
+ "row": ["row"],
+ "rue": ["rue"],
+ "run": ["run"],
+ "shl": ["shoal"],
+ "shls": ["shoals"],
+ "shr": ["shore", "shoar"],
+ "shrs": ["shores", "shoars"],
+ "skwy": ["skyway"],
+ "spg": ["spring", "spng", "sprng"],
+ "spgs": ["springs", "spngs", "sprngs"],
+ "spur": ["spurs"],
+ "sq": ["square", "sqr", "sqre", "squ"],
+ "sqs": ["squares", "sqrs"],
+ "sta": ["station", "statn", "stn"],
+ "stra": ["stravenue", "strav", "straven", "stravn", "strvn", "strvnue"],
+ "strm": ["stream", "streme"],
+ "st": ["street", "strt", "str"],
+ "sts": ["streets"],
+ "smt": ["summit", "sumit", "sumitt"],
+ "ter": ["terrace", "terr"],
+ "trwy": ["throughway"],
+ "trce": ["trace", "traces"],
+ "trak": ["track", "tracks", "trk", "trks"],
+ "trfy": ["trafficway"],
+ "trl": ["trail", "trails", "trls"],
+ "trlr": ["trailer", "trlrs"],
+ "tunl": ["tunnel", "tunel", "tunls", "tunnels", "tunnl"],
+ "tpke": ["turnpike", "trnpk", "turnpk"],
+ "upas": ["underpass"],
+ "un": ["union"],
+ "uns": ["unions"],
+ "vly": ["valley", "vally", "vlly"],
+ "vlys": ["valleys"],
+ "via": ["viaduct", "vdct", "viadct"],
+ "vw": ["view"],
+ "vws": ["views"],
+ "vlg": ["village", "vill", "villag", "villg", "villiage"],
+ "vlgs": ["villages"],
+ "vl": ["ville"],
+ "vis": ["vista", "vist", "vst", "vsta"],
+ "walk": ["walks"],
+ "wall": ["wall"],
+ "way": ["wy"],
+ "ways": ["ways"],
+ "wl": ["well"],
+ "wls": ["wells"]
+}
diff --git a/test_environment.yml b/test_environment.yml
index 5fa9b2d..f54968d 100644
--- a/test_environment.yml
+++ b/test_environment.yml
@@ -6,7 +6,7 @@ channels:
dependencies:
# Packages required for setting up the environment
- pip>=21,<24
- - python>=3.10,<3.12
+ - python>=3.10,<3.13
- setuptools>=66,<69
# Packages specified in setup.py that need or benefit from binary conda packages
@@ -29,6 +29,10 @@ dependencies:
- pytorch>=2.2,<3
- torchvision
+ # GDAL is a transitive dependency whose binaries must match those installed by the
+ # pudl-dev conda environment, so we also install it with conda here.
+ - gdal==3.9.3 # pinned to ensure it matches pudl-dev environment exactly.
+
# Use pip to install the package defined by this repo for development:
- pip:
- --editable ./[dev,docs,tests,types]
diff --git a/workspace.yaml b/workspace.yaml
index 144aada..a208373 100644
--- a/workspace.yaml
+++ b/workspace.yaml
@@ -1,2 +1,3 @@
load_from:
- python_module: mozilla_sec_eia.models.sec10k
+ - python_module: mozilla_sec_eia.models.sec_eia_record_linkage